dapla-toolbelt-metadata 0.2.1__py3-none-any.whl → 0.9.11__py3-none-any.whl

This diff shows the changes between publicly released versions of this package, as published to their respective public registries. It is provided for informational purposes only.

This version of dapla-toolbelt-metadata might be problematic.

Files changed (97)
  1. dapla_metadata/__init__.py +11 -1
  2. dapla_metadata/_shared/__init__.py +1 -0
  3. dapla_metadata/_shared/config.py +109 -0
  4. dapla_metadata/_shared/enums.py +27 -0
  5. dapla_metadata/_shared/py.typed +0 -0
  6. dapla_metadata/dapla/__init__.py +4 -0
  7. dapla_metadata/dapla/user_info.py +138 -0
  8. dapla_metadata/datasets/__init__.py +1 -1
  9. dapla_metadata/datasets/_merge.py +333 -0
  10. dapla_metadata/datasets/code_list.py +5 -6
  11. dapla_metadata/datasets/compatibility/__init__.py +10 -0
  12. dapla_metadata/datasets/compatibility/_handlers.py +363 -0
  13. dapla_metadata/datasets/compatibility/_utils.py +259 -0
  14. dapla_metadata/datasets/compatibility/model_backwards_compatibility.py +135 -0
  15. dapla_metadata/datasets/core.py +136 -182
  16. dapla_metadata/datasets/dapla_dataset_path_info.py +145 -19
  17. dapla_metadata/datasets/dataset_parser.py +41 -28
  18. dapla_metadata/datasets/model_validation.py +29 -20
  19. dapla_metadata/datasets/statistic_subject_mapping.py +5 -1
  20. dapla_metadata/datasets/utility/constants.py +22 -15
  21. dapla_metadata/datasets/utility/enums.py +8 -20
  22. dapla_metadata/datasets/utility/urn.py +234 -0
  23. dapla_metadata/datasets/utility/utils.py +183 -111
  24. dapla_metadata/standards/__init__.py +4 -0
  25. dapla_metadata/standards/name_validator.py +250 -0
  26. dapla_metadata/standards/standard_validators.py +98 -0
  27. dapla_metadata/standards/utils/__init__.py +1 -0
  28. dapla_metadata/standards/utils/constants.py +49 -0
  29. dapla_metadata/variable_definitions/__init__.py +11 -0
  30. dapla_metadata/variable_definitions/_generated/.openapi-generator/FILES +20 -0
  31. dapla_metadata/variable_definitions/_generated/.openapi-generator/VERSION +1 -0
  32. dapla_metadata/variable_definitions/_generated/.openapi-generator-ignore +6 -0
  33. dapla_metadata/variable_definitions/_generated/README.md +148 -0
  34. dapla_metadata/variable_definitions/_generated/__init__.py +0 -0
  35. dapla_metadata/variable_definitions/_generated/vardef_client/__init__.py +47 -0
  36. dapla_metadata/variable_definitions/_generated/vardef_client/api/__init__.py +8 -0
  37. dapla_metadata/variable_definitions/_generated/vardef_client/api/data_migration_api.py +766 -0
  38. dapla_metadata/variable_definitions/_generated/vardef_client/api/draft_variable_definitions_api.py +888 -0
  39. dapla_metadata/variable_definitions/_generated/vardef_client/api/patches_api.py +888 -0
  40. dapla_metadata/variable_definitions/_generated/vardef_client/api/validity_periods_api.py +583 -0
  41. dapla_metadata/variable_definitions/_generated/vardef_client/api/variable_definitions_api.py +613 -0
  42. dapla_metadata/variable_definitions/_generated/vardef_client/api_client.py +779 -0
  43. dapla_metadata/variable_definitions/_generated/vardef_client/api_response.py +27 -0
  44. dapla_metadata/variable_definitions/_generated/vardef_client/configuration.py +474 -0
  45. dapla_metadata/variable_definitions/_generated/vardef_client/docs/CompleteResponse.md +51 -0
  46. dapla_metadata/variable_definitions/_generated/vardef_client/docs/Contact.md +30 -0
  47. dapla_metadata/variable_definitions/_generated/vardef_client/docs/DataMigrationApi.md +90 -0
  48. dapla_metadata/variable_definitions/_generated/vardef_client/docs/Draft.md +42 -0
  49. dapla_metadata/variable_definitions/_generated/vardef_client/docs/DraftVariableDefinitionsApi.md +259 -0
  50. dapla_metadata/variable_definitions/_generated/vardef_client/docs/LanguageStringType.md +31 -0
  51. dapla_metadata/variable_definitions/_generated/vardef_client/docs/Owner.md +31 -0
  52. dapla_metadata/variable_definitions/_generated/vardef_client/docs/Patch.md +43 -0
  53. dapla_metadata/variable_definitions/_generated/vardef_client/docs/PatchesApi.md +249 -0
  54. dapla_metadata/variable_definitions/_generated/vardef_client/docs/PublicApi.md +218 -0
  55. dapla_metadata/variable_definitions/_generated/vardef_client/docs/SupportedLanguages.md +15 -0
  56. dapla_metadata/variable_definitions/_generated/vardef_client/docs/UpdateDraft.md +44 -0
  57. dapla_metadata/variable_definitions/_generated/vardef_client/docs/ValidityPeriod.md +42 -0
  58. dapla_metadata/variable_definitions/_generated/vardef_client/docs/ValidityPeriodsApi.md +236 -0
  59. dapla_metadata/variable_definitions/_generated/vardef_client/docs/VariableDefinitionsApi.md +304 -0
  60. dapla_metadata/variable_definitions/_generated/vardef_client/docs/VariableStatus.md +17 -0
  61. dapla_metadata/variable_definitions/_generated/vardef_client/exceptions.py +193 -0
  62. dapla_metadata/variable_definitions/_generated/vardef_client/models/__init__.py +31 -0
  63. dapla_metadata/variable_definitions/_generated/vardef_client/models/complete_response.py +260 -0
  64. dapla_metadata/variable_definitions/_generated/vardef_client/models/contact.py +94 -0
  65. dapla_metadata/variable_definitions/_generated/vardef_client/models/draft.py +228 -0
  66. dapla_metadata/variable_definitions/_generated/vardef_client/models/get_vardok_vardef_mapping_by_id200_response.py +158 -0
  67. dapla_metadata/variable_definitions/_generated/vardef_client/models/language_string_type.py +101 -0
  68. dapla_metadata/variable_definitions/_generated/vardef_client/models/owner.py +87 -0
  69. dapla_metadata/variable_definitions/_generated/vardef_client/models/patch.py +244 -0
  70. dapla_metadata/variable_definitions/_generated/vardef_client/models/problem.py +118 -0
  71. dapla_metadata/variable_definitions/_generated/vardef_client/models/update_draft.py +274 -0
  72. dapla_metadata/variable_definitions/_generated/vardef_client/models/validity_period.py +225 -0
  73. dapla_metadata/variable_definitions/_generated/vardef_client/models/vardok_id_response.py +81 -0
  74. dapla_metadata/variable_definitions/_generated/vardef_client/models/vardok_vardef_id_pair_response.py +84 -0
  75. dapla_metadata/variable_definitions/_generated/vardef_client/models/variable_status.py +33 -0
  76. dapla_metadata/variable_definitions/_generated/vardef_client/py.typed +0 -0
  77. dapla_metadata/variable_definitions/_generated/vardef_client/rest.py +249 -0
  78. dapla_metadata/variable_definitions/_utils/__init__.py +1 -0
  79. dapla_metadata/variable_definitions/_utils/_client.py +32 -0
  80. dapla_metadata/variable_definitions/_utils/config.py +54 -0
  81. dapla_metadata/variable_definitions/_utils/constants.py +80 -0
  82. dapla_metadata/variable_definitions/_utils/files.py +309 -0
  83. dapla_metadata/variable_definitions/_utils/template_files.py +99 -0
  84. dapla_metadata/variable_definitions/_utils/variable_definition_files.py +143 -0
  85. dapla_metadata/variable_definitions/exceptions.py +255 -0
  86. dapla_metadata/variable_definitions/vardef.py +372 -0
  87. dapla_metadata/variable_definitions/vardok_id.py +48 -0
  88. dapla_metadata/variable_definitions/vardok_vardef_id_pair.py +47 -0
  89. dapla_metadata/variable_definitions/variable_definition.py +422 -0
  90. {dapla_toolbelt_metadata-0.2.1.dist-info → dapla_toolbelt_metadata-0.9.11.dist-info}/METADATA +34 -36
  91. dapla_toolbelt_metadata-0.9.11.dist-info/RECORD +97 -0
  92. {dapla_toolbelt_metadata-0.2.1.dist-info → dapla_toolbelt_metadata-0.9.11.dist-info}/WHEEL +1 -1
  93. dapla_metadata/datasets/config.py +0 -80
  94. dapla_metadata/datasets/model_backwards_compatibility.py +0 -520
  95. dapla_metadata/datasets/user_info.py +0 -88
  96. dapla_toolbelt_metadata-0.2.1.dist-info/RECORD +0 -22
  97. {dapla_toolbelt_metadata-0.2.1.dist-info → dapla_toolbelt_metadata-0.9.11.dist-info/licenses}/LICENSE +0 -0

dapla_metadata/datasets/utility/urn.py
@@ -0,0 +1,234 @@
+"""Validate, parse and render URNs."""
+
+import logging
+import re
+from collections.abc import Iterable
+from dataclasses import dataclass
+from enum import Enum
+from enum import auto
+from typing import Literal
+
+from pydantic import AnyUrl
+
+from dapla_metadata._shared.config import get_dapla_environment
+from dapla_metadata._shared.enums import DaplaEnvironment
+from dapla_metadata.datasets.utility.utils import VariableListType
+
+logger = logging.getLogger(__name__)
+
+URN_ERROR_MESSAGE_BASE = "The URL is not in a supported format"
+
+URN_ERROR_MESSAGE_TEMPLATE = (
+    URN_ERROR_MESSAGE_BASE
+    + " for field '{field_name}' of variable '{short_name}'. URL: '{value}'. Please contact Team Metadata if this URL should be supported."
+)
+
+
+VARDEF_URL_TEMPLATE = "https://{subdomain}.{domain}/variable-definitions"
+
+
+class SsbNaisDomains(str, Enum):
+    """The available domains on SSB's Nais instance."""
+
+    TEST_EXTERNAL = "test.ssb.no"
+    TEST_INTERNAL = "intern.test.ssb.no"
+    PROD_EXTERNAL = "ssb.no"
+    PROD_INTERNAL = "intern.ssb.no"
+
+
+class ReferenceUrlTypes(Enum):
+    """The general category of the URL.
+
+    This can be useful to refer to when constructing a URL from a URN for a
+    specific context.
+    """
+
+    API = auto()
+    FRONTEND = auto()
+
+
+UrlVisibility = Literal["public", "internal"]
+
+
+@dataclass
+class UrnConverter:
+    """Converts URLs to URNs and vice versa.
+
+    Fields:
+        urn_base: The format for the URN, up to the identifier.
+        id_pattern: A capturing group pattern which matches identifiers for this resource.
+        url_bases: The list of all the different URL representations for a resource. There
+            will typically be a number of URL representations for a particular resource,
+            depending on which system or technology they are accessed through and other
+            technical factors. This list defines which concrete URLs can be considered
+            equivalent to a URN.
+    """
+
+    urn_base: str
+    id_pattern: str
+    url_bases: list[tuple[ReferenceUrlTypes, str]]
+
+    def _extract_id(self, url: str, pattern: re.Pattern[str]) -> str | None:
+        if match := pattern.match(url):
+            return match.group(1)
+        return None
+
+    def _build_pattern(self, url_base: str) -> re.Pattern[str]:
+        return re.compile(f"^{url_base}/{self.id_pattern}")
+
+    def get_urn(self, identifier: str) -> str:
+        """Build a URN for the given identifier."""
+        return f"{self.urn_base}:{identifier}"
+
+    def get_url(
+        self,
+        identifier: str,
+        url_type: ReferenceUrlTypes,
+        visibility: Literal["public", "internal"] = "public",
+    ) -> str | None:
+        """Build a concrete URL to reference a resource.
+
+        There are typically multiple URLs used to refer to one resource; this method attempts to support known variations.
+
+        Args:
+            identifier (str): The identifier of the resource the URL refers to.
+            url_type (ReferenceUrlTypes): The representation type of the URL.
+            visibility (UrlVisibility, optional): Whether the URL should be that which is publicly available or not. Defaults to "public".
+
+        Returns:
+            str | None: The concrete URL. None if we cannot satisfy the supplied requirements.
+        """
+        candidates = [base[-1] for base in self.url_bases if base[0] == url_type]
+
+        def matches_visibility(url: str, visibility: UrlVisibility):
+            return (".intern." in url) is (visibility == "internal")
+
+        def matches_environment(url: str):
+            current_environment = get_dapla_environment()
+            if current_environment == DaplaEnvironment.TEST:
+                return ".test." in url
+            return ".test." not in url
+
+        if url := next(
+            (
+                url
+                for url in candidates
+                if matches_visibility(url, visibility) and matches_environment(url)
+            ),
+            None,
+        ):
+            return url + "/" + identifier
+        return None
+
+    def get_id(self, urn_or_url: str | AnyUrl) -> str | None:
+        """Get an identifier from a URN or URL.
+
+        Args:
+            urn_or_url (str | AnyUrl): The URN or URL referring to a particular resource.
+
+        Returns:
+            str | None: The identifier for the resource, or None if it cannot be extracted.
+        """
+        if str(urn_or_url).startswith(self.urn_base):
+            return str(urn_or_url).removeprefix(self.urn_base + ":")
+        return self._extract_id_from_url(urn_or_url)
+
+    def is_id(self, value: str) -> bool:
+        """Check if the value is an identifier for this URN type.
+
+        Args:
+            value (str): The value to check.
+        """
+        if not isinstance(value, str):
+            # Mypy thinks it's impossible to reach this branch, but there are no guarantees in Python.
+            return False  # type: ignore [unreachable]
+        pattern = re.compile(f"^{self.id_pattern}$")
+        return bool(pattern.match(value))
+
+    def _extract_id_from_url(self, url: str | AnyUrl) -> str | None:
+        patterns = (self._build_pattern(url[-1]) for url in self.url_bases)
+        matches = (self._extract_id(str(url), p) for p in patterns)
+        return next((m for m in matches if m), None)
+
+    def convert_url_to_urn(self, url: str | AnyUrl) -> AnyUrl | None:
+        """Convert a URL to a generalized URN for that same resource.
+
+        Args:
+            url (str | AnyUrl): The URL to convert.
+
+        Returns:
+            str | None: The URN, or None if it can't be converted.
+        """
+        if str(url).startswith(self.urn_base):
+            # In this case the value is already in the expected format and nothing needs to be done.
+            return AnyUrl(url)
+        if identifier := self._extract_id_from_url(url):
+            return AnyUrl(self.get_urn(identifier))
+
+        return None
+
+
+vardef_urn_converter = UrnConverter(
+    urn_base="urn:ssb:variable-definition:vardef",
+    id_pattern=r"([a-z0-9]{8})",
+    url_bases=[
+        *[
+            (
+                ReferenceUrlTypes.API,
+                VARDEF_URL_TEMPLATE.format(
+                    subdomain="metadata", domain=nais_domain.value
+                ),
+            )
+            for nais_domain in SsbNaisDomains
+        ],
+        *[
+            (
+                ReferenceUrlTypes.FRONTEND,
+                VARDEF_URL_TEMPLATE.format(
+                    subdomain="catalog", domain=nais_domain.value
+                ),
+            )
+            for nais_domain in SsbNaisDomains
+        ],
+    ],
+)
+
+klass_urn_converter = UrnConverter(
+    urn_base="urn:ssb:classification:klass",
+    id_pattern=r"([0-9]{1,5})",
+    url_bases=[
+        (ReferenceUrlTypes.FRONTEND, "https://www.ssb.no/klass/klassifikasjoner"),
+        (ReferenceUrlTypes.FRONTEND, "https://www.ssb.no/en/klass/klassifikasjoner"),
+        (ReferenceUrlTypes.API, "https://data.ssb.no/api/klass/v1/classifications"),
+    ],
+)
+
+
+def convert_uris_to_urns(
+    variables: VariableListType, field_name: str, converters: Iterable[UrnConverter]
+) -> None:
+    """Where URIs are recognized URLs, convert them to URNs.
+
+    Where the value is not a known URL we preserve the value as it is and log an
+    ERROR level message.
+
+    Args:
+        variables (VariableListType): The list of variables.
+        field_name (str): The name of the field which has URLs to convert to URNs.
+        converters (Iterable[UrnConverter]): One or more converters which implement
+            conversion of URLs into one specific URN format. These will typically be
+            specific to an individual metadata reference system.
+    """
+    for v in variables:
+        field = getattr(v, field_name, None)
+        if field:
+            if urn := next((c.convert_url_to_urn(field) for c in converters), None):
+                setattr(v, field_name, urn)
+            else:
+                logger.error(
+                    URN_ERROR_MESSAGE_TEMPLATE.format(
+                        field_name=field_name,
+                        short_name=v.short_name,
+                        value=field,
+                    )
+                )
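
A minimal usage sketch of the converters defined above, using only methods shown in this hunk; the identifiers are illustrative, and the URL returned by `get_url` depends on the runtime Dapla environment:

```python
from dapla_metadata.datasets.utility.urn import (
    ReferenceUrlTypes,
    klass_urn_converter,
    vardef_urn_converter,
)

# Round-trip a KLASS classification URL to its generalized URN and back to an ID.
urn = klass_urn_converter.convert_url_to_urn(
    "https://www.ssb.no/klass/klassifikasjoner/91"  # classification 91 is illustrative
)
print(urn)                              # urn:ssb:classification:klass:91
print(klass_urn_converter.get_id(urn))  # 91

# Render a concrete frontend URL for an eight-character Vardef identifier.
# The domain is chosen by get_dapla_environment(); in PROD this would be:
print(vardef_urn_converter.get_url("abcd1234", ReferenceUrlTypes.FRONTEND))
# https://catalog.ssb.no/variable-definitions/abcd1234
```
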
dapla_metadata/datasets/utility/utils.py
@@ -4,19 +4,25 @@ import datetime  # import is needed in xdoctest
 import logging
 import pathlib
 import uuid
+from typing import Any
+from typing import TypeAlias
 
+import datadoc_model.all_optional.model as all_optional_model
+import datadoc_model.required.model as required_model
+import google.auth
 from cloudpathlib import CloudPath
 from cloudpathlib import GSClient
 from cloudpathlib import GSPath
-from dapla import AuthClient
-from datadoc_model import model
-from datadoc_model.model import Assessment
-from datadoc_model.model import DataSetState
-from datadoc_model.model import VariableRole
-
-from dapla_metadata.datasets.utility.constants import (
-    DATASET_FIELDS_FROM_EXISTING_METADATA,
-)
+from datadoc_model.all_optional.model import Assessment
+from datadoc_model.all_optional.model import DataSetState
+from datadoc_model.all_optional.model import VariableRole
+
+from dapla_metadata.dapla import user_info
+from dapla_metadata.datasets.utility.constants import DAEAD_ENCRYPTION_KEY_REFERENCE
+from dapla_metadata.datasets.utility.constants import ENCRYPTION_PARAMETER_KEY_ID
+from dapla_metadata.datasets.utility.constants import ENCRYPTION_PARAMETER_SNAPSHOT_DATE
+from dapla_metadata.datasets.utility.constants import ENCRYPTION_PARAMETER_STRATEGY
+from dapla_metadata.datasets.utility.constants import ENCRYPTION_PARAMETER_STRATEGY_SKIP
 from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_VARIABLES_FIELDS
 from dapla_metadata.datasets.utility.constants import (
     OBLIGATORY_DATASET_METADATA_IDENTIFIERS,
@@ -30,9 +36,33 @@ from dapla_metadata.datasets.utility.constants import (
 from dapla_metadata.datasets.utility.constants import (
     OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS_MULTILANGUAGE,
 )
+from dapla_metadata.datasets.utility.constants import (
+    OBLIGATORY_VARIABLES_PSEUDONYMIZATION_IDENTIFIERS,
+)
+from dapla_metadata.datasets.utility.constants import PAPIS_ENCRYPTION_KEY_REFERENCE
+from dapla_metadata.datasets.utility.constants import PAPIS_STABLE_IDENTIFIER_TYPE
+from dapla_metadata.datasets.utility.enums import EncryptionAlgorithm
 
 logger = logging.getLogger(__name__)
 
+DatadocMetadataType: TypeAlias = (
+    all_optional_model.DatadocMetadata | required_model.DatadocMetadata
+)
+DatasetType: TypeAlias = all_optional_model.Dataset | required_model.Dataset
+VariableType: TypeAlias = all_optional_model.Variable | required_model.Variable
+PseudonymizationType: TypeAlias = (
+    all_optional_model.Pseudonymization | required_model.Pseudonymization
+)
+VariableListType: TypeAlias = (
+    list[all_optional_model.Variable] | list[required_model.Variable]
+)
+OptionalDatadocMetadataType: TypeAlias = DatadocMetadataType | None
+
+
+def get_current_date() -> str:
+    """Return the current date as an ISO 8601 string."""
+    return datetime.datetime.now(tz=datetime.timezone.utc).date().isoformat()
+
 
 def get_timestamp_now() -> datetime.datetime:
     """Return a timestamp for the current moment."""
@@ -51,7 +81,7 @@ def normalize_path(path: str) -> pathlib.Path | CloudPath:
         Pathlib compatible object.
     """
     if path.startswith(GSPath.cloud_prefix):
-        client = GSClient(credentials=AuthClient.fetch_google_credentials())
+        client = GSClient(credentials=google.auth.default()[0])
         return GSPath(path, client=client)
     return pathlib.Path(path)
 
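Note the credentials change: `normalize_path` now resolves Google credentials via Application Default Credentials instead of dapla-toolbelt's `AuthClient`, removing that runtime dependency. A standalone sketch of the equivalent lookup (bucket and object names illustrative), assuming ADC is configured in the environment:

```python
import google.auth
from cloudpathlib import GSClient, GSPath

# google.auth.default() returns a (credentials, project_id) tuple resolved from
# Application Default Credentials; only the credentials are used here.
credentials, _project = google.auth.default()
client = GSClient(credentials=credentials)
path = GSPath("gs://example-bucket/dataset.parquet", client=client)
```
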
@@ -78,7 +108,7 @@ def derive_assessment_from_state(state: DataSetState) -> Assessment:
     Returns:
         The derived assessment of the dataset.
     """
-    match (state):
+    match state:
         case (
             DataSetState.INPUT_DATA
             | DataSetState.PROCESSED_DATA
@@ -91,56 +121,67 @@ def derive_assessment_from_state(state: DataSetState) -> Assessment:
             return Assessment.SENSITIVE
 
 
-def set_default_values_variables(variables: list) -> None:
+def set_default_values_variables(variables: VariableListType) -> None:
     """Set default values on variables.
 
     Args:
         variables: A list of variable objects to set default values on.
 
     Example:
-        >>> variables = [model.Variable(short_name="pers",id=None, is_personal_data = None), model.Variable(short_name="fnr",id='9662875c-c245-41de-b667-12ad2091a1ee', is_personal_data='PSEUDONYMISED_ENCRYPTED_PERSONAL_DATA')]
+        >>> variables = [all_optional_model.Variable(short_name="pers",id=None, is_personal_data = None), all_optional_model.Variable(short_name="fnr",id='9662875c-c245-41de-b667-12ad2091a1ee', is_personal_data=True)]
         >>> set_default_values_variables(variables)
         >>> isinstance(variables[0].id, uuid.UUID)
         True
 
-        >>> variables[1].is_personal_data == 'PSEUDONYMISED_ENCRYPTED_PERSONAL_DATA'
+        >>> variables[1].is_personal_data == True
         True
 
-        >>> variables[0].is_personal_data == 'NOT_PERSONAL_DATA'
+        >>> variables[0].is_personal_data == False
         True
     """
     for v in variables:
         if v.id is None:
             v.id = uuid.uuid4()
         if v.is_personal_data is None:
-            v.is_personal_data = model.IsPersonalData.NOT_PERSONAL_DATA
+            v.is_personal_data = False
         if v.variable_role is None:
             v.variable_role = VariableRole.MEASURE
 
 
-def set_default_values_dataset(dataset: model.Dataset) -> None:
+def set_default_values_dataset(
+    dataset: DatasetType,
+) -> None:
     """Set default values on dataset.
 
     Args:
         dataset: The dataset object to set default values on.
 
     Example:
-        >>> dataset = model.Dataset(id=None, contains_personal_data=None)
+        >>> dataset = all_optional_model.Dataset(id=None)
         >>> set_default_values_dataset(dataset)
         >>> dataset.id is not None
         True
-
-        >>> dataset.contains_personal_data == False
-        True
     """
     if not dataset.id:
         dataset.id = uuid.uuid4()
-    if dataset.contains_personal_data is None:
-        dataset.contains_personal_data = False
+
+
+def set_dataset_owner(
+    dataset: DatasetType,
+) -> None:
+    """Sets the owner of the dataset from the DAPLA_GROUP_CONTEXT environment variable.
+
+    Args:
+        dataset: The dataset object to set default values on.
+    """
+    try:
+        dataset.owner = user_info.get_user_info_for_current_platform().current_team
+    except OSError:
+        logger.exception("Failed to find environment variable DAPLA_GROUP_CONTEXT")
 
 
 def set_variables_inherit_from_dataset(
-    dataset: model.Dataset,
+    dataset: DatasetType,
     variables: list,
 ) -> None:
     """Set specific dataset values on a list of variable objects.
@@ -154,14 +195,9 @@ def set_variables_inherit_from_dataset(
         variables: A list of variable objects to update with dataset values.
 
     Example:
-        >>> dataset = model.Dataset(short_name='person_data_v1',data_source='01',temporality_type='STATUS',id='9662875c-c245-41de-b667-12ad2091a1ee',contains_data_from="2010-09-05",contains_data_until="2022-09-05")
-        >>> variables = [model.Variable(short_name="pers",data_source =None,temporality_type = None, contains_data_from = None,contains_data_until = None)]
+        >>> dataset = all_optional_model.Dataset(short_name='person_data_v1', id='9662875c-c245-41de-b667-12ad2091a1ee', contains_data_from="2010-09-05", contains_data_until="2022-09-05")
+        >>> variables = [all_optional_model.Variable(short_name="pers", data_source=None, temporality_type=None, contains_data_from=None, contains_data_until=None)]
         >>> set_variables_inherit_from_dataset(dataset, variables)
-        >>> variables[0].data_source == dataset.data_source
-        True
-
-        >>> variables[0].temporality_type is None
-        False
 
         >>> variables[0].contains_data_from == dataset.contains_data_from
         True
@@ -172,8 +208,6 @@ def set_variables_inherit_from_dataset(
     for v in variables:
         v.contains_data_from = v.contains_data_from or dataset.contains_data_from
         v.contains_data_until = v.contains_data_until or dataset.contains_data_until
-        v.temporality_type = v.temporality_type or dataset.temporality_type
-        v.data_source = v.data_source or dataset.data_source
 
 
 def incorrect_date_order(
@@ -232,10 +266,7 @@ def _is_missing_multilanguage_value(
             len(field_value[0]) > 0
             and not field_value[0]["languageText"]
             and (len(field_value) <= 1 or not field_value[1]["languageText"])
-            and (
-                len(field_value) <= 2  # noqa: PLR2004 approve magic value
-                or not field_value[2]["languageText"]
-            )
+            and (len(field_value) <= 2 or not field_value[2]["languageText"])
         ),
     )
 
@@ -264,8 +295,7 @@ def _is_missing_metadata(
         True if the field doesn't have a value, False otherwise.
     """
     return bool(
-        field_name in obligatory_list
-        and field_value is None
+        (field_name in obligatory_list and field_value is None)
         or _is_missing_multilanguage_value(
             field_name,
             field_value,
@@ -274,7 +304,9 @@
     )
 
 
-def num_obligatory_dataset_fields_completed(dataset: model.Dataset) -> int:
+def num_obligatory_dataset_fields_completed(
+    dataset: DatasetType,
+) -> int:
     """Count the number of completed obligatory dataset fields.
 
     This function returns the total count of obligatory fields in the dataset that
@@ -310,7 +342,9 @@ def num_obligatory_variables_fields_completed(variables: list) -> int:
     return num_completed
 
 
-def num_obligatory_variable_fields_completed(variable: model.Variable) -> int:
+def num_obligatory_variable_fields_completed(
+    variable: all_optional_model.Variable,
+) -> int:
     """Count the number of obligatory fields completed for one variable.
 
     This function calculates the total number of obligatory fields that have
@@ -336,7 +370,28 @@ def num_obligatory_variable_fields_completed(variable: model.Variable) -> int:
     return NUM_OBLIGATORY_VARIABLES_FIELDS - len(missing_metadata)
 
 
-def get_missing_obligatory_dataset_fields(dataset: model.Dataset) -> list:
+def num_obligatory_pseudo_fields_missing(
+    variables: list[all_optional_model.Variable],
+) -> int:
+    """Count the number of obligatory pseudonymization fields that are missing.
+
+    Args:
+        variables: The variables to count obligatory fields for.
+
+    Returns:
+        The number of obligatory pseudonymization fields that are missing.
+    """
+    return sum(
+        getattr(v.pseudonymization, field, None) is None
+        for v in variables
+        if v.pseudonymization is not None
+        for field in OBLIGATORY_VARIABLES_PSEUDONYMIZATION_IDENTIFIERS
+    )
+
+
+def get_missing_obligatory_dataset_fields(
+    dataset: DatasetType,
+) -> list:
     """Identify all obligatory dataset fields that are missing values.
 
     This function checks for obligatory fields that are either directly missing
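
A self-contained sketch of the counting rule in `num_obligatory_pseudo_fields_missing`; the two field names standing in for `OBLIGATORY_VARIABLES_PSEUDONYMIZATION_IDENTIFIERS` are assumptions for illustration, not taken from the constants module:

```python
from dataclasses import dataclass

# Assumed stand-ins for OBLIGATORY_VARIABLES_PSEUDONYMIZATION_IDENTIFIERS.
OBLIGATORY_PSEUDO_FIELDS = ("encryption_algorithm", "encryption_key_reference")


@dataclass
class Pseudonymization:
    encryption_algorithm: str | None = None
    encryption_key_reference: str | None = None


@dataclass
class Variable:
    pseudonymization: Pseudonymization | None = None


variables = [
    Variable(Pseudonymization(encryption_algorithm="TINK-DAEAD")),  # key reference missing
    Variable(None),  # no pseudonymization at all: skipped, not counted
]
missing = sum(
    getattr(v.pseudonymization, field, None) is None
    for v in variables
    if v.pseudonymization is not None
    for field in OBLIGATORY_PSEUDO_FIELDS
)
print(missing)  # 1
```
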
@@ -400,6 +455,40 @@ def get_missing_obligatory_variables_fields(variables: list) -> list[dict]:
     return [item for item in missing_variables_fields if next(iter(item.values()))]
 
 
+def get_missing_obligatory_variables_pseudo_fields(
+    variables: list[all_optional_model.Variable],
+) -> list[dict]:
+    """Identify obligatory variable pseudonymization fields that are missing values for each variable.
+
+    This function checks for obligatory fields that are directly missing
+    (i.e., set to `None`).
+
+    Args:
+        variables: A list of variable objects to check for missing obligatory pseudonymization fields.
+
+    Returns:
+        A list of dictionaries with variable short names as keys and lists of missing
+        obligatory variable pseudonymization fields as values. This includes:
+        - Fields that are directly `None` and are listed as obligatory metadata.
+    """
+    return [
+        {
+            v.short_name: [
+                key
+                for key, value in v.pseudonymization.model_dump().items()
+                if _is_missing_metadata(
+                    key,
+                    value,
+                    OBLIGATORY_VARIABLES_PSEUDONYMIZATION_IDENTIFIERS,
+                    [],
+                )
+            ]
+        }
+        for v in variables
+        if v.pseudonymization is not None
+    ]
+
+
 def running_in_notebook() -> bool:
     """Return True if running in Jupyter Notebook."""
     try:
@@ -412,81 +501,64 @@ def running_in_notebook() -> bool:
     return False
 
 
-def override_dataset_fields(
-    merged_metadata: model.DatadocMetadata,
-    existing_metadata: model.DatadocMetadata,
-) -> None:
-    """Overrides specific fields in the dataset of `merged_metadata` with values from the dataset of `existing_metadata`.
-
-    This function iterates over a predefined list of fields, `DATASET_FIELDS_FROM_EXISTING_METADATA`,
-    and sets the corresponding fields in the `merged_metadata.dataset` object to the values
-    from the `existing_metadata.dataset` object.
-
-    Args:
-        merged_metadata: An instance of `DatadocMetadata` containing the dataset to be updated.
-        existing_metadata: An instance of `DatadocMetadata` containing the dataset whose values are used to update `merged_metadata.dataset`.
-
-    Returns:
-        `None`.
-    """
-    if merged_metadata.dataset and existing_metadata.dataset:
-        # Override the fields as defined
-        for field in DATASET_FIELDS_FROM_EXISTING_METADATA:
-            setattr(
-                merged_metadata.dataset,
-                field,
-                getattr(existing_metadata.dataset, field),
-            )
-
-
-def merge_variables(
-    existing_metadata: model.DatadocMetadata,
-    extracted_metadata: model.DatadocMetadata,
-    merged_metadata: model.DatadocMetadata,
-) -> model.DatadocMetadata:
-    """Merges variables from the extracted metadata into the existing metadata and updates the merged metadata.
-
-    This function compares the variables from `extracted_metadata` with those in `existing_metadata`.
-    For each variable in `extracted_metadata`, it checks if a variable with the same `short_name` exists
-    in `existing_metadata`. If a match is found, it updates the existing variable with information from
-    `extracted_metadata`. If no match is found, the variable from `extracted_metadata` is directly added to `merged_metadata`.
-
-    Args:
-        existing_metadata: The metadata object containing the current state of variables.
-        extracted_metadata: The metadata object containing new or updated variables to merge.
-        merged_metadata: The metadata object that will contain the result of the merge.
-
-    Returns:
-        model.DatadocMetadata: The `merged_metadata` object containing variables from both `existing_metadata`
-        and `extracted_metadata`.
-    """
-    if (
-        existing_metadata.variables is not None
-        and extracted_metadata is not None
-        and extracted_metadata.variables is not None
-        and merged_metadata.variables is not None
-    ):
-        for extracted in extracted_metadata.variables:
-            existing = next(
-                (
-                    existing
-                    for existing in existing_metadata.variables
-                    if existing.short_name == extracted.short_name
-                ),
-                None,
-            )
-            if existing:
-                existing.id = (
-                    None  # Set to None so that it will be assigned a fresh ID later
-                )
-                existing.contains_data_from = (
-                    extracted.contains_data_from or existing.contains_data_from
-                )
-                existing.contains_data_until = (
-                    extracted.contains_data_until or existing.contains_data_until
-                )
-                merged_metadata.variables.append(existing)
-            else:
-                # If there is no existing metadata for this variable, we just use what we have extracted
-                merged_metadata.variables.append(extracted)
-    return merged_metadata
+def _ensure_encryption_parameters(
+    existing: list[dict[str, Any]] | None,
+    required: dict[str, Any],
+) -> list[dict[str, Any]]:
+    """Ensure required key/value pairs exist in the parameters list."""
+    result = list(existing or [])
+
+    # Ensure each required key is present in at least one dict
+    for key, value in required.items():
+        if not any(key in d for d in result):
+            result.append({key: value})
+
+    return result
+
+
+def set_default_values_pseudonymization(
+    variable: VariableType,
+    pseudonymization: PseudonymizationType | None,
+) -> None:
+    """Populate pseudonymization fields with defaults based on the encryption algorithm.
+
+    Updates the encryption key reference and encryption parameters if they are not set,
+    handling both PAPIS and DAEAD algorithms. Leaves unknown algorithms unchanged.
+    """
+    if pseudonymization is None:
+        return
+    if variable.pseudonymization is None:
+        variable.pseudonymization = pseudonymization
+    match pseudonymization.encryption_algorithm:
+        case EncryptionAlgorithm.PAPIS_ENCRYPTION_ALGORITHM.value:
+            if not pseudonymization.encryption_key_reference:
+                pseudonymization.encryption_key_reference = (
+                    PAPIS_ENCRYPTION_KEY_REFERENCE
+                )
+            base_params = {
+                ENCRYPTION_PARAMETER_KEY_ID: PAPIS_ENCRYPTION_KEY_REFERENCE,
+                ENCRYPTION_PARAMETER_STRATEGY: ENCRYPTION_PARAMETER_STRATEGY_SKIP,
+            }
+            if pseudonymization.stable_identifier_type == PAPIS_STABLE_IDENTIFIER_TYPE:
+                base_params[ENCRYPTION_PARAMETER_SNAPSHOT_DATE] = get_current_date()
+            pseudonymization.encryption_algorithm_parameters = (
+                _ensure_encryption_parameters(
+                    pseudonymization.encryption_algorithm_parameters,
+                    base_params,
+                )
+            )
+        case EncryptionAlgorithm.DAEAD_ENCRYPTION_ALGORITHM.value:
+            if not pseudonymization.encryption_key_reference:
+                pseudonymization.encryption_key_reference = (
+                    DAEAD_ENCRYPTION_KEY_REFERENCE
+                )
+            pseudonymization.encryption_algorithm_parameters = (
+                _ensure_encryption_parameters(
+                    pseudonymization.encryption_algorithm_parameters,
+                    {
+                        ENCRYPTION_PARAMETER_KEY_ID: DAEAD_ENCRYPTION_KEY_REFERENCE,
+                    },
+                )
+            )
+        case _:
+            pass
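
The merge rule in `_ensure_encryption_parameters` is append-only: a required key is added as a new single-entry dict only when no existing entry carries that key, so user-supplied values always win over the defaults. A runnable sketch with illustrative key names (the real names are hidden behind the `ENCRYPTION_PARAMETER_*` constants):

```python
from typing import Any


def ensure_encryption_parameters(
    existing: list[dict[str, Any]] | None,
    required: dict[str, Any],
) -> list[dict[str, Any]]:
    """Mirror of the helper above, reproduced for demonstration."""
    result = list(existing or [])
    for key, value in required.items():
        if not any(key in d for d in result):
            result.append({key: value})
    return result


params = ensure_encryption_parameters(
    [{"keyId": "my-custom-key"}],  # user-supplied entry is left untouched
    {"keyId": "papis-common-key", "strategy": "skip"},  # defaults; names illustrative
)
assert params == [{"keyId": "my-custom-key"}, {"strategy": "skip"}]
```
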
dapla_metadata/standards/__init__.py
@@ -0,0 +1,4 @@
+"""Expose information specific to validating SSB standards."""
+
+from .standard_validators import check_naming_standard
+from .standard_validators import generate_validation_report
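
The new `dapla_metadata.standards` package re-exports its two validators at the top level. A hypothetical usage sketch; the argument and return types are assumptions, since `standard_validators.py` itself is not shown in this diff:

```python
from dapla_metadata.standards import check_naming_standard
from dapla_metadata.standards import generate_validation_report

# Hypothetical: assumes check_naming_standard accepts a path-like string and
# returns results that generate_validation_report can render.
results = check_naming_standard(
    "gs://example-bucket/produkt/datadoc/utdata/person_data_p2021_v1.parquet"
)
print(generate_validation_report(results))
```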