dapla-toolbelt-metadata 0.2.1__py3-none-any.whl → 0.9.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dapla-toolbelt-metadata might be problematic.
- dapla_metadata/__init__.py +11 -1
- dapla_metadata/_shared/__init__.py +1 -0
- dapla_metadata/_shared/config.py +109 -0
- dapla_metadata/_shared/enums.py +27 -0
- dapla_metadata/_shared/py.typed +0 -0
- dapla_metadata/dapla/__init__.py +4 -0
- dapla_metadata/dapla/user_info.py +138 -0
- dapla_metadata/datasets/__init__.py +1 -1
- dapla_metadata/datasets/_merge.py +333 -0
- dapla_metadata/datasets/code_list.py +5 -6
- dapla_metadata/datasets/compatibility/__init__.py +10 -0
- dapla_metadata/datasets/compatibility/_handlers.py +363 -0
- dapla_metadata/datasets/compatibility/_utils.py +259 -0
- dapla_metadata/datasets/compatibility/model_backwards_compatibility.py +135 -0
- dapla_metadata/datasets/core.py +136 -182
- dapla_metadata/datasets/dapla_dataset_path_info.py +145 -19
- dapla_metadata/datasets/dataset_parser.py +41 -28
- dapla_metadata/datasets/model_validation.py +29 -20
- dapla_metadata/datasets/statistic_subject_mapping.py +5 -1
- dapla_metadata/datasets/utility/constants.py +22 -15
- dapla_metadata/datasets/utility/enums.py +8 -20
- dapla_metadata/datasets/utility/urn.py +234 -0
- dapla_metadata/datasets/utility/utils.py +183 -111
- dapla_metadata/standards/__init__.py +4 -0
- dapla_metadata/standards/name_validator.py +250 -0
- dapla_metadata/standards/standard_validators.py +98 -0
- dapla_metadata/standards/utils/__init__.py +1 -0
- dapla_metadata/standards/utils/constants.py +49 -0
- dapla_metadata/variable_definitions/__init__.py +11 -0
- dapla_metadata/variable_definitions/_generated/.openapi-generator/FILES +20 -0
- dapla_metadata/variable_definitions/_generated/.openapi-generator/VERSION +1 -0
- dapla_metadata/variable_definitions/_generated/.openapi-generator-ignore +6 -0
- dapla_metadata/variable_definitions/_generated/README.md +148 -0
- dapla_metadata/variable_definitions/_generated/__init__.py +0 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/__init__.py +47 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api/__init__.py +8 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api/data_migration_api.py +766 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api/draft_variable_definitions_api.py +888 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api/patches_api.py +888 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api/validity_periods_api.py +583 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api/variable_definitions_api.py +613 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api_client.py +779 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api_response.py +27 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/configuration.py +474 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/CompleteResponse.md +51 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/Contact.md +30 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/DataMigrationApi.md +90 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/Draft.md +42 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/DraftVariableDefinitionsApi.md +259 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/LanguageStringType.md +31 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/Owner.md +31 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/Patch.md +43 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/PatchesApi.md +249 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/PublicApi.md +218 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/SupportedLanguages.md +15 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/UpdateDraft.md +44 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/ValidityPeriod.md +42 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/ValidityPeriodsApi.md +236 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/VariableDefinitionsApi.md +304 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/VariableStatus.md +17 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/exceptions.py +193 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/__init__.py +31 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/complete_response.py +260 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/contact.py +94 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/draft.py +228 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/get_vardok_vardef_mapping_by_id200_response.py +158 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/language_string_type.py +101 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/owner.py +87 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/patch.py +244 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/problem.py +118 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/update_draft.py +274 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/validity_period.py +225 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/vardok_id_response.py +81 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/vardok_vardef_id_pair_response.py +84 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/variable_status.py +33 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/py.typed +0 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/rest.py +249 -0
- dapla_metadata/variable_definitions/_utils/__init__.py +1 -0
- dapla_metadata/variable_definitions/_utils/_client.py +32 -0
- dapla_metadata/variable_definitions/_utils/config.py +54 -0
- dapla_metadata/variable_definitions/_utils/constants.py +80 -0
- dapla_metadata/variable_definitions/_utils/files.py +309 -0
- dapla_metadata/variable_definitions/_utils/template_files.py +99 -0
- dapla_metadata/variable_definitions/_utils/variable_definition_files.py +143 -0
- dapla_metadata/variable_definitions/exceptions.py +255 -0
- dapla_metadata/variable_definitions/vardef.py +372 -0
- dapla_metadata/variable_definitions/vardok_id.py +48 -0
- dapla_metadata/variable_definitions/vardok_vardef_id_pair.py +47 -0
- dapla_metadata/variable_definitions/variable_definition.py +422 -0
- {dapla_toolbelt_metadata-0.2.1.dist-info → dapla_toolbelt_metadata-0.9.11.dist-info}/METADATA +34 -36
- dapla_toolbelt_metadata-0.9.11.dist-info/RECORD +97 -0
- {dapla_toolbelt_metadata-0.2.1.dist-info → dapla_toolbelt_metadata-0.9.11.dist-info}/WHEEL +1 -1
- dapla_metadata/datasets/config.py +0 -80
- dapla_metadata/datasets/model_backwards_compatibility.py +0 -520
- dapla_metadata/datasets/user_info.py +0 -88
- dapla_toolbelt_metadata-0.2.1.dist-info/RECORD +0 -22
- {dapla_toolbelt_metadata-0.2.1.dist-info → dapla_toolbelt_metadata-0.9.11.dist-info/licenses}/LICENSE +0 -0
dapla_metadata/datasets/utility/urn.py
@@ -0,0 +1,234 @@
+"""Validate, parse and render URNs."""
+
+import logging
+import re
+from collections.abc import Iterable
+from dataclasses import dataclass
+from enum import Enum
+from enum import auto
+from typing import Literal
+
+from pydantic import AnyUrl
+
+from dapla_metadata._shared.config import get_dapla_environment
+from dapla_metadata._shared.enums import DaplaEnvironment
+from dapla_metadata.datasets.utility.utils import VariableListType
+
+logger = logging.getLogger(__name__)
+
+URN_ERROR_MESSAGE_BASE = "The URL is not in a supported format"
+
+URN_ERROR_MESSAGE_TEMPLATE = (
+    URN_ERROR_MESSAGE_BASE
+    + " for field '{field_name}' of variable '{short_name}'. URL: '{value}'. Please contact Team Metadata if this URL should be supported."
+)
+
+
+VARDEF_URL_TEMPLATE = "https://{subdomain}.{domain}/variable-definitions"
+
+
+class SsbNaisDomains(str, Enum):
+    """The available domains on SSBs Nais instance."""
+
+    TEST_EXTERNAL = "test.ssb.no"
+    TEST_INTERNAL = "intern.test.ssb.no"
+    PROD_EXTERNAL = "ssb.no"
+    PROD_INTERNAL = "intern.ssb.no"
+
+
+class ReferenceUrlTypes(Enum):
+    """The general category of the URL.
+
+    This can be useful to refer to when constructing a URL from a URN for a
+    specific context.
+    """
+
+    API = auto()
+    FRONTEND = auto()
+
+
+UrlVisibility = Literal["public", "internal"]
+
+
+@dataclass
+class UrnConverter:
+    """Converts URLs to URNs and vice versa.
+
+    Fields:
+        urn_base: The format for the URN, up to the identifier.
+        id_pattern: A capturing group pattern which matches identifiers for this resource.
+        url_bases: The list of all the different URL representations for a resource. There
+            will typically be a number of URL representations for a particular resource,
+            depending on which system or technology they are accessed through and other
+            technical factors. This list defines which concrete URLs can be considered
+            equivalent to a URN.
+    """
+
+    urn_base: str
+    id_pattern: str
+    url_bases: list[tuple[ReferenceUrlTypes, str]]
+
+    def _extract_id(self, url: str, pattern: re.Pattern[str]) -> str | None:
+        if match := pattern.match(url):
+            return match.group(1)
+        return None
+
+    def _build_pattern(self, url_base: str) -> re.Pattern[str]:
+        return re.compile(f"^{url_base}/{self.id_pattern}")
+
+    def get_urn(self, identifier: str) -> str:
+        """Build a URN for the given identifier."""
+        return f"{self.urn_base}:{identifier}"
+
+    def get_url(
+        self,
+        identifier: str,
+        url_type: ReferenceUrlTypes,
+        visibility: Literal["public", "internal"] = "public",
+    ) -> str | None:
+        """Build concrete URL to reference a resource.
+
+        There are typically multiple URLs used to refer to one resource, this method attempts to support known variations.
+
+        Args:
+            identifier (str): The identifier of the resource the URL refers to.
+            url_type (ReferenceUrlTypes): The representation type of the URL
+            visibility (UrlVisibility, optional): Whether the URL should be that which is publicly available or not. Defaults to "public".
+
+        Returns:
+            str | None: The concrete URL. None if we cannot satisfy the supplied requirements.
+        """
+        candidates = [base[-1] for base in self.url_bases if base[0] == url_type]
+
+        def matches_visibility(url: str, visibility: UrlVisibility):
+            return (".intern." in url) is (visibility == "internal")
+
+        def matches_environment(url: str):
+            current_environment = get_dapla_environment()
+            if current_environment == DaplaEnvironment.TEST:
+                return ".test." in url
+            return ".test." not in url
+
+        if url := next(
+            (
+                url
+                for url in candidates
+                if matches_visibility(url, visibility) and matches_environment(url)
+            ),
+            None,
+        ):
+            return url + "/" + identifier
+        return None
+
+    def get_id(self, urn_or_url: str | AnyUrl) -> str | None:
+        """Get an identifier from a URN or URL.
+
+        Args:
+            urn_or_url (str | AnyUrl): The URN or URL refering to a particular resource
+
+        Returns:
+            str | None: The identifier for the resource, or None if it cannot be extracted.
+        """
+        if str(urn_or_url).startswith(self.urn_base):
+            return str(urn_or_url).removeprefix(self.urn_base + ":")
+        return self._extract_id_from_url(urn_or_url)
+
+    def is_id(self, value: str) -> bool:
+        """Check if the value is an identifier for this URN type.
+
+        Args:
+            value (str): The value to check.
+        """
+        if not isinstance(value, str):
+            # Mypy thinks it's impossible to reach this branch, but there are no guarantees in Python.
+            return False  # type: ignore [unreachable]
+        pattern = re.compile(f"^{self.id_pattern}$")
+        return bool(pattern.match(value))
+
+    def _extract_id_from_url(self, url: str | AnyUrl) -> str | None:
+        patterns = (self._build_pattern(url[-1]) for url in self.url_bases)
+        matches = (self._extract_id(str(url), p) for p in patterns)
+        return next((m for m in matches if m), None)
+
+    def convert_url_to_urn(self, url: str | AnyUrl) -> AnyUrl | None:
+        """Convert a URL to a generalized URN for that same resource.
+
+        Args:
+            url (str | AnyUrl): The URL to convert.
+
+        Returns:
+            str | None: The URN or None if it can't be converted.
+        """
+        if str(url).startswith(self.urn_base):
+            # In this case the value is already in the expected format and nothing needs to be done.
+            return AnyUrl(url)
+        if identifier := self._extract_id_from_url(url):
+            return AnyUrl(self.get_urn(identifier))
+
+        return None
+
+
+vardef_urn_converter = UrnConverter(
+    urn_base="urn:ssb:variable-definition:vardef",
+    id_pattern=r"([a-z0-9]{8})",
+    url_bases=[
+        *[
+            (
+                ReferenceUrlTypes.API,
+                VARDEF_URL_TEMPLATE.format(
+                    subdomain="metadata", domain=nais_domain.value
+                ),
+            )
+            for nais_domain in SsbNaisDomains
+        ],
+        *[
+            (
+                ReferenceUrlTypes.FRONTEND,
+                VARDEF_URL_TEMPLATE.format(
+                    subdomain="catalog", domain=nais_domain.value
+                ),
+            )
+            for nais_domain in SsbNaisDomains
+        ],
+    ],
+)
+
+klass_urn_converter = UrnConverter(
+    urn_base="urn:ssb:classification:klass",
+    id_pattern=r"([0-9]{1,5})",
+    url_bases=[
+        (ReferenceUrlTypes.FRONTEND, "https://www.ssb.no/klass/klassifikasjoner"),
+        (ReferenceUrlTypes.FRONTEND, "https://www.ssb.no/en/klass/klassifikasjoner"),
+        (ReferenceUrlTypes.API, "https://data.ssb.no/api/klass/v1/classifications"),
+    ],
+)
+
+
+def convert_uris_to_urns(
+    variables: VariableListType, field_name: str, converters: Iterable[UrnConverter]
+) -> None:
+    """Where URIs are recognized URLs, convert them to URNs.
+
+    Where the value is not a known URL we preserve the value as it is and log an
+    ERROR level message.
+
+    Args:
+        variables (VariableListType): The list of variables.
+        field_name (str): The name of the field which has URLs to convert to URNs
+        converters (Iterable[UrnConverter]): One or more converters which implement
+            conversion of URLs into one specific URN format. These will typically be
+            specific to an individual metadata reference system.
+    """
+    for v in variables:
+        field = getattr(v, field_name, None)
+        if field:
+            if urn := next((c.convert_url_to_urn(field) for c in converters), None):
+                setattr(v, field_name, urn)
+            else:
+                logger.error(
+                    URN_ERROR_MESSAGE_TEMPLATE.format(
+                        field_name=field_name,
+                        short_name=v.short_name,
+                        value=field,
+                    )
+                )
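For orientation, a minimal usage sketch of the new converters above (editorial illustration, not part of the diff; the classification identifier "123" is hypothetical):

    from dapla_metadata.datasets.utility.urn import klass_urn_converter

    url = "https://www.ssb.no/klass/klassifikasjoner/123"  # hypothetical KLASS id
    urn = klass_urn_converter.convert_url_to_urn(url)
    # Expected to be equivalent to "urn:ssb:classification:klass:123".
    print(urn)
    print(klass_urn_converter.get_id(url))     # extracts "123" from the URL
    print(klass_urn_converter.get_urn("123"))  # "urn:ssb:classification:klass:123"
    print(klass_urn_converter.is_id("123"))    # True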
dapla_metadata/datasets/utility/utils.py
@@ -4,19 +4,25 @@ import datetime  # import is needed in xdoctest
 import logging
 import pathlib
 import uuid
+from typing import Any
+from typing import TypeAlias
 
+import datadoc_model.all_optional.model as all_optional_model
+import datadoc_model.required.model as required_model
+import google.auth
 from cloudpathlib import CloudPath
 from cloudpathlib import GSClient
 from cloudpathlib import GSPath
-from
-from datadoc_model import
-from datadoc_model.model import
-
-from
-
-from dapla_metadata.datasets.utility.constants import
-
-
+from datadoc_model.all_optional.model import Assessment
+from datadoc_model.all_optional.model import DataSetState
+from datadoc_model.all_optional.model import VariableRole
+
+from dapla_metadata.dapla import user_info
+from dapla_metadata.datasets.utility.constants import DAEAD_ENCRYPTION_KEY_REFERENCE
+from dapla_metadata.datasets.utility.constants import ENCRYPTION_PARAMETER_KEY_ID
+from dapla_metadata.datasets.utility.constants import ENCRYPTION_PARAMETER_SNAPSHOT_DATE
+from dapla_metadata.datasets.utility.constants import ENCRYPTION_PARAMETER_STRATEGY
+from dapla_metadata.datasets.utility.constants import ENCRYPTION_PARAMETER_STRATEGY_SKIP
 from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_VARIABLES_FIELDS
 from dapla_metadata.datasets.utility.constants import (
     OBLIGATORY_DATASET_METADATA_IDENTIFIERS,
@@ -30,9 +36,33 @@ from dapla_metadata.datasets.utility.constants import (
 from dapla_metadata.datasets.utility.constants import (
     OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS_MULTILANGUAGE,
 )
+from dapla_metadata.datasets.utility.constants import (
+    OBLIGATORY_VARIABLES_PSEUDONYMIZATION_IDENTIFIERS,
+)
+from dapla_metadata.datasets.utility.constants import PAPIS_ENCRYPTION_KEY_REFERENCE
+from dapla_metadata.datasets.utility.constants import PAPIS_STABLE_IDENTIFIER_TYPE
+from dapla_metadata.datasets.utility.enums import EncryptionAlgorithm
 
 logger = logging.getLogger(__name__)
 
+DatadocMetadataType: TypeAlias = (
+    all_optional_model.DatadocMetadata | required_model.DatadocMetadata
+)
+DatasetType: TypeAlias = all_optional_model.Dataset | required_model.Dataset
+VariableType: TypeAlias = all_optional_model.Variable | required_model.Variable
+PseudonymizationType: TypeAlias = (
+    all_optional_model.Pseudonymization | required_model.Pseudonymization
+)
+VariableListType: TypeAlias = (
+    list[all_optional_model.Variable] | list[required_model.Variable]
+)
+OptionalDatadocMetadataType: TypeAlias = DatadocMetadataType | None
+
+
+def get_current_date() -> str:
+    """Return a current date as str."""
+    return datetime.datetime.now(tz=datetime.timezone.utc).date().isoformat()
+
 
 def get_timestamp_now() -> datetime.datetime:
     """Return a timestamp for the current moment."""
@@ -51,7 +81,7 @@ def normalize_path(path: str) -> pathlib.Path | CloudPath:
         Pathlib compatible object.
     """
     if path.startswith(GSPath.cloud_prefix):
-        client = GSClient(credentials=
+        client = GSClient(credentials=google.auth.default()[0])
         return GSPath(path, client=client)
     return pathlib.Path(path)
 
@@ -78,7 +108,7 @@ def derive_assessment_from_state(state: DataSetState) -> Assessment:
     Returns:
         The derived assessment of the dataset.
     """
-    match
+    match state:
         case (
             DataSetState.INPUT_DATA
             | DataSetState.PROCESSED_DATA
@@ -91,56 +121,67 @@ def derive_assessment_from_state(state: DataSetState) -> Assessment:
             return Assessment.SENSITIVE
 
 
-def set_default_values_variables(variables:
+def set_default_values_variables(variables: VariableListType) -> None:
     """Set default values on variables.
 
     Args:
         variables: A list of variable objects to set default values on.
 
     Example:
-        >>> variables = [
+        >>> variables = [all_optional_model.Variable(short_name="pers",id=None, is_personal_data = None), all_optional_model.Variable(short_name="fnr",id='9662875c-c245-41de-b667-12ad2091a1ee', is_personal_data=True)]
         >>> set_default_values_variables(variables)
         >>> isinstance(variables[0].id, uuid.UUID)
         True
 
-        >>> variables[1].is_personal_data ==
+        >>> variables[1].is_personal_data == True
         True
 
-        >>> variables[0].is_personal_data ==
+        >>> variables[0].is_personal_data == False
         True
     """
     for v in variables:
         if v.id is None:
             v.id = uuid.uuid4()
         if v.is_personal_data is None:
-            v.is_personal_data =
+            v.is_personal_data = False
         if v.variable_role is None:
             v.variable_role = VariableRole.MEASURE
 
 
-def set_default_values_dataset(
+def set_default_values_dataset(
+    dataset: DatasetType,
+) -> None:
     """Set default values on dataset.
 
     Args:
         dataset: The dataset object to set default values on.
 
     Example:
-        >>> dataset =
+        >>> dataset = all_optional_model.Dataset(id=None)
         >>> set_default_values_dataset(dataset)
         >>> dataset.id is not None
         True
-
-        >>> dataset.contains_personal_data == False
-        True
     """
     if not dataset.id:
         dataset.id = uuid.uuid4()
-
-
+
+
+def set_dataset_owner(
+    dataset: DatasetType,
+) -> None:
+    """Sets the owner of the dataset from the DAPLA_GROUP_CONTEXT enviornment variable.
+
+    Args:
+        dataset: The dataset object to set default values on.
+    """
+    try:
+        dataset.owner = user_info.get_user_info_for_current_platform().current_team
+    except OSError:
+        logger.exception("Failed to find environment variable DAPLA_GROUP_CONTEXT")
 
 
 def set_variables_inherit_from_dataset(
-    dataset:
+    dataset: DatasetType,
     variables: list,
 ) -> None:
     """Set specific dataset values on a list of variable objects.
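As a quick orientation (editorial sketch, not part of the diff), the new dataset helpers can be combined as below; it assumes the DAPLA_GROUP_CONTEXT environment variable is set, otherwise set_dataset_owner logs the failure and leaves the owner unchanged:

    import datadoc_model.all_optional.model as all_optional_model

    from dapla_metadata.datasets.utility.utils import (
        set_dataset_owner,
        set_default_values_dataset,
    )

    dataset = all_optional_model.Dataset(short_name="person_data_v1")
    set_default_values_dataset(dataset)  # assigns a uuid4 id when none is set
    set_dataset_owner(dataset)  # owner taken from DAPLA_GROUP_CONTEXT via user_info
    print(dataset.id, dataset.owner)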
@@ -154,14 +195,9 @@ def set_variables_inherit_from_dataset(
         variables: A list of variable objects to update with dataset values.
 
     Example:
-        >>> dataset =
-        >>> variables = [
+        >>> dataset = all_optional_model.Dataset(short_name='person_data_v1', id='9662875c-c245-41de-b667-12ad2091a1ee', contains_data_from="2010-09-05", contains_data_until="2022-09-05")
+        >>> variables = [all_optional_model.Variable(short_name="pers", data_source=None, temporality_type=None, contains_data_from=None, contains_data_until=None)]
         >>> set_variables_inherit_from_dataset(dataset, variables)
-        >>> variables[0].data_source == dataset.data_source
-        True
-
-        >>> variables[0].temporality_type is None
-        False
 
         >>> variables[0].contains_data_from == dataset.contains_data_from
         True
@@ -172,8 +208,6 @@ def set_variables_inherit_from_dataset(
     for v in variables:
         v.contains_data_from = v.contains_data_from or dataset.contains_data_from
         v.contains_data_until = v.contains_data_until or dataset.contains_data_until
-        v.temporality_type = v.temporality_type or dataset.temporality_type
-        v.data_source = v.data_source or dataset.data_source
 
 
 def incorrect_date_order(
@@ -232,10 +266,7 @@ def _is_missing_multilanguage_value(
             len(field_value[0]) > 0
             and not field_value[0]["languageText"]
             and (len(field_value) <= 1 or not field_value[1]["languageText"])
-            and (
-                len(field_value) <= 2  # noqa: PLR2004 approve magic value
-                or not field_value[2]["languageText"]
-            )
+            and (len(field_value) <= 2 or not field_value[2]["languageText"])
         ),
     )
 
@@ -264,8 +295,7 @@ def _is_missing_metadata(
         True if the field doesn't have a value, False otherwise.
     """
     return bool(
-        field_name in obligatory_list
-        and field_value is None
+        (field_name in obligatory_list and field_value is None)
         or _is_missing_multilanguage_value(
             field_name,
             field_value,
@@ -274,7 +304,9 @@ def _is_missing_metadata(
     )
 
 
-def num_obligatory_dataset_fields_completed(
+def num_obligatory_dataset_fields_completed(
+    dataset: DatasetType,
+) -> int:
     """Count the number of completed obligatory dataset fields.
 
     This function returns the total count of obligatory fields in the dataset that
@@ -310,7 +342,9 @@ def num_obligatory_variables_fields_completed(variables: list) -> int:
     return num_completed
 
 
-def num_obligatory_variable_fields_completed(
+def num_obligatory_variable_fields_completed(
+    variable: all_optional_model.Variable,
+) -> int:
     """Count the number of obligatory fields completed for one variable.
 
     This function calculates the total number of obligatory fields that have
@@ -336,7 +370,28 @@ def num_obligatory_variable_fields_completed(variable: model.Variable) -> int:
     return NUM_OBLIGATORY_VARIABLES_FIELDS - len(missing_metadata)
 
 
-def
+def num_obligatory_pseudo_fields_missing(
+    variables: list[all_optional_model.Variable],
+) -> int:
+    """Counts the number of obligatory pseudonymization fields are missing.
+
+    Args:
+        variables: The variables to count obligatory fields for.
+
+    Returns:
+        The number of obligatory pseudonymization fields that are missing.
+    """
+    return sum(
+        getattr(v.pseudonymization, field, None) is None
+        for v in variables
+        if v.pseudonymization is not None
+        for field in OBLIGATORY_VARIABLES_PSEUDONYMIZATION_IDENTIFIERS
+    )
+
+
+def get_missing_obligatory_dataset_fields(
+    dataset: DatasetType,
+) -> list:
     """Identify all obligatory dataset fields that are missing values.
 
     This function checks for obligatory fields that are either directly missing
@@ -400,6 +455,40 @@ def get_missing_obligatory_variables_fields(variables: list) -> list[dict]:
     return [item for item in missing_variables_fields if next(iter(item.values()))]
 
 
+def get_missing_obligatory_variables_pseudo_fields(
+    variables: list[all_optional_model.Variable],
+) -> list[dict]:
+    """Identify obligatory variable pseudonymization fields that are missing values for each variable.
+
+    This function checks for obligatory fields that are directly missing
+    (i.e., set to `None`).
+
+    Args:
+        variables: A list of variable objects to check for missing obligatory pseudonymization fields.
+
+    Returns:
+        A list of dictionaries with variable short names as keys and list of missing
+        obligatory variable pseudonymization fields as values. This includes:
+        - Fields that are directly `None` and are listed as obligatory metadata.
+    """
+    return [
+        {
+            v.short_name: [
+                key
+                for key, value in v.pseudonymization.model_dump().items()
+                if _is_missing_metadata(
+                    key,
+                    value,
+                    OBLIGATORY_VARIABLES_PSEUDONYMIZATION_IDENTIFIERS,
+                    [],
+                )
+            ]
+        }
+        for v in variables
+        if v.pseudonymization is not None
+    ]
+
+
 def running_in_notebook() -> bool:
     """Return True if running in Jupyter Notebook."""
     try:
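The two pseudonymization helpers added above can be exercised as in the sketch below (editorial illustration, not part of the diff). It assumes the all-optional model allows constructing an empty Pseudonymization(); the field names reported come from OBLIGATORY_VARIABLES_PSEUDONYMIZATION_IDENTIFIERS:

    import datadoc_model.all_optional.model as all_optional_model

    from dapla_metadata.datasets.utility.utils import (
        get_missing_obligatory_variables_pseudo_fields,
        num_obligatory_pseudo_fields_missing,
    )

    variables = [
        # No pseudonymization object: the variable is skipped entirely.
        all_optional_model.Variable(short_name="pers"),
        # Empty pseudonymization: every obligatory pseudo field counts as missing.
        all_optional_model.Variable(
            short_name="fnr",
            pseudonymization=all_optional_model.Pseudonymization(),
        ),
    ]

    print(num_obligatory_pseudo_fields_missing(variables))
    print(get_missing_obligatory_variables_pseudo_fields(variables))
    # e.g. [{'fnr': [...names of the missing obligatory pseudo fields...]}]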
@@ -412,81 +501,64 @@ def running_in_notebook() -> bool:
         return False
 
 
-def
-
-
-) ->
-    """
+def _ensure_encryption_parameters(
+    existing: list[dict[str, Any]] | None,
+    required: dict[str, Any],
+) -> list[dict[str, Any]]:
+    """Ensure required key/value pairs exist in parameters list."""
+    result = list(existing or [])
 
-
-
-
-
-    Args:
-        merged_metadata: An instance of `DatadocMetadata` containing the dataset to be updated.
-        existing_metadata: An instance of `DatadocMetadata` containing the dataset whose values are used to update `merged_metadata.dataset`.
-
-    Returns:
-        `None`.
-    """
-    if merged_metadata.dataset and existing_metadata.dataset:
-        # Override the fields as defined
-        for field in DATASET_FIELDS_FROM_EXISTING_METADATA:
-            setattr(
-                merged_metadata.dataset,
-                field,
-                getattr(existing_metadata.dataset, field),
-            )
+    # Ensure each required key is present in at least one dict
+    for key, value in required.items():
+        if not any(key in d for d in result):
+            result.append({key: value})
 
+    return result
 
-def merge_variables(
-    existing_metadata: model.DatadocMetadata,
-    extracted_metadata: model.DatadocMetadata,
-    merged_metadata: model.DatadocMetadata,
-) -> model.DatadocMetadata:
-    """Merges variables from the extracted metadata into the existing metadata and updates the merged metadata.
 
-
-
-
-
-
-    Args:
-        existing_metadata: The metadata object containing the current state of variables.
-        extracted_metadata: The metadata object containing new or updated variables to merge.
-        merged_metadata: The metadata object that will contain the result of the merge.
+def set_default_values_pseudonymization(
+    variable: VariableType,
+    pseudonymization: PseudonymizationType | None,
+) -> None:
+    """Populate pseudonymization fields with defaults based on the encryption algorithm.
 
-
-
-    and `extracted_metadata`.
+    Updates the encryption key reference and encryption parameters if they are not set,
+    handling both PAPIS and DAED algorithms. Leaves unknown algorithms unchanged.
     """
-    if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if pseudonymization is None:
+        return
+    if variable.pseudonymization is None:
+        variable.pseudonymization = pseudonymization
+    match pseudonymization.encryption_algorithm:
+        case EncryptionAlgorithm.PAPIS_ENCRYPTION_ALGORITHM.value:
+            if not pseudonymization.encryption_key_reference:
+                pseudonymization.encryption_key_reference = (
+                    PAPIS_ENCRYPTION_KEY_REFERENCE
+                )
+            base_params = {
+                ENCRYPTION_PARAMETER_KEY_ID: PAPIS_ENCRYPTION_KEY_REFERENCE,
+                ENCRYPTION_PARAMETER_STRATEGY: ENCRYPTION_PARAMETER_STRATEGY_SKIP,
+            }
+            if pseudonymization.stable_identifier_type == PAPIS_STABLE_IDENTIFIER_TYPE:
+                base_params[ENCRYPTION_PARAMETER_SNAPSHOT_DATE] = get_current_date()
+            pseudonymization.encryption_algorithm_parameters = (
+                _ensure_encryption_parameters(
+                    pseudonymization.encryption_algorithm_parameters,
+                    base_params,
                 )
-
-
+            )
+        case EncryptionAlgorithm.DAEAD_ENCRYPTION_ALGORITHM.value:
+            if not pseudonymization.encryption_key_reference:
+                pseudonymization.encryption_key_reference = (
+                    DAEAD_ENCRYPTION_KEY_REFERENCE
                 )
-
-
+            pseudonymization.encryption_algorithm_parameters = (
+                _ensure_encryption_parameters(
+                    pseudonymization.encryption_algorithm_parameters,
+                    {
+                        ENCRYPTION_PARAMETER_KEY_ID: DAEAD_ENCRYPTION_KEY_REFERENCE,
+                    },
                 )
-
-
-
-            merged_metadata.variables.append(extracted)
-    return merged_metadata
+            )
+        case _:
+            pass