dapla-toolbelt-metadata 0.9.6__py3-none-any.whl → 0.9.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dapla-toolbelt-metadata might be problematic. Click here for more details.
- dapla_metadata/datasets/core.py +20 -13
- dapla_metadata/datasets/utility/urn.py +161 -0
- dapla_metadata/datasets/utility/utils.py +1 -1
- {dapla_toolbelt_metadata-0.9.6.dist-info → dapla_toolbelt_metadata-0.9.7.dist-info}/METADATA +5 -1
- {dapla_toolbelt_metadata-0.9.6.dist-info → dapla_toolbelt_metadata-0.9.7.dist-info}/RECORD +7 -6
- {dapla_toolbelt_metadata-0.9.6.dist-info → dapla_toolbelt_metadata-0.9.7.dist-info}/WHEEL +0 -0
- {dapla_toolbelt_metadata-0.9.6.dist-info → dapla_toolbelt_metadata-0.9.7.dist-info}/licenses/LICENSE +0 -0
dapla_metadata/datasets/core.py
CHANGED
|
@@ -33,7 +33,12 @@ from dapla_metadata.datasets.utility.constants import (
|
|
|
33
33
|
from dapla_metadata.datasets.utility.constants import METADATA_DOCUMENT_FILE_SUFFIX
|
|
34
34
|
from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_DATASET_FIELDS
|
|
35
35
|
from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_VARIABLES_FIELDS
|
|
36
|
+
from dapla_metadata.datasets.utility.urn import convert_uris_to_urns
|
|
37
|
+
from dapla_metadata.datasets.utility.urn import klass_urn_converter
|
|
38
|
+
from dapla_metadata.datasets.utility.urn import vardef_urn_converter
|
|
36
39
|
from dapla_metadata.datasets.utility.utils import OptionalDatadocMetadataType
|
|
40
|
+
from dapla_metadata.datasets.utility.utils import VariableListType
|
|
41
|
+
from dapla_metadata.datasets.utility.utils import VariableType
|
|
37
42
|
from dapla_metadata.datasets.utility.utils import calculate_percentage
|
|
38
43
|
from dapla_metadata.datasets.utility.utils import derive_assessment_from_state
|
|
39
44
|
from dapla_metadata.datasets.utility.utils import get_timestamp_now
|
|
@@ -108,8 +113,8 @@ class Datadoc:
|
|
|
108
113
|
self.container: all_optional_model.MetadataContainer | None = None
|
|
109
114
|
self.dataset_path: pathlib.Path | CloudPath | None = None
|
|
110
115
|
self.dataset = all_optional_model.Dataset()
|
|
111
|
-
self.variables:
|
|
112
|
-
self.variables_lookup: dict[str,
|
|
116
|
+
self.variables: VariableListType = []
|
|
117
|
+
self.variables_lookup: dict[str, VariableType] = {}
|
|
113
118
|
self.explicitly_defined_metadata_document = False
|
|
114
119
|
self.dataset_consistency_status: list[DatasetConsistencyStatus] = []
|
|
115
120
|
if metadata_document_path:
|
|
@@ -204,22 +209,24 @@ class Datadoc:
|
|
|
204
209
|
else:
|
|
205
210
|
self._set_metadata(existing_metadata or extracted_metadata)
|
|
206
211
|
|
|
207
|
-
set_default_values_variables(self.variables)
|
|
208
|
-
set_default_values_dataset(cast("all_optional_model.Dataset", self.dataset))
|
|
209
|
-
set_dataset_owner(self.dataset)
|
|
210
|
-
self._create_variables_lookup()
|
|
211
|
-
|
|
212
212
|
def _set_metadata(
|
|
213
213
|
self,
|
|
214
|
-
|
|
214
|
+
metadata: OptionalDatadocMetadataType,
|
|
215
215
|
) -> None:
|
|
216
|
-
if not
|
|
217
|
-
merged_metadata.dataset and merged_metadata.variables
|
|
218
|
-
):
|
|
216
|
+
if not metadata or not (metadata.dataset and metadata.variables):
|
|
219
217
|
msg = "Could not read metadata"
|
|
220
218
|
raise ValueError(msg)
|
|
221
|
-
self.dataset = cast("all_optional_model.Dataset",
|
|
222
|
-
self.variables =
|
|
219
|
+
self.dataset = cast("all_optional_model.Dataset", metadata.dataset)
|
|
220
|
+
self.variables = metadata.variables
|
|
221
|
+
|
|
222
|
+
set_default_values_variables(self.variables)
|
|
223
|
+
set_default_values_dataset(cast("all_optional_model.Dataset", self.dataset))
|
|
224
|
+
set_dataset_owner(self.dataset)
|
|
225
|
+
convert_uris_to_urns(self.variables, "definition_uri", [vardef_urn_converter])
|
|
226
|
+
convert_uris_to_urns(
|
|
227
|
+
self.variables, "classification_uri", [klass_urn_converter]
|
|
228
|
+
)
|
|
229
|
+
self._create_variables_lookup()
|
|
223
230
|
|
|
224
231
|
def _create_variables_lookup(self) -> None:
|
|
225
232
|
self.variables_lookup = {
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
"""Validate, parse and render URNs."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import re
|
|
5
|
+
from collections.abc import Iterable
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from enum import Enum
|
|
8
|
+
from enum import auto
|
|
9
|
+
|
|
10
|
+
from pydantic import AnyUrl
|
|
11
|
+
|
|
12
|
+
from dapla_metadata.datasets.utility.utils import VariableListType
|
|
13
|
+
|
|
14
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Common prefix of every "unsupported URL" message.
URN_ERROR_MESSAGE_BASE = "The URL is not in a supported format"

# Full message; filled via str.format with `field_name`, `short_name` and `value`.
URN_ERROR_MESSAGE_TEMPLATE = URN_ERROR_MESSAGE_BASE + (
    " for field '{field_name}' of variable '{short_name}'."
    " URL: '{value}'."
    " Please contact Team Metadata if this URL should be supported."
)

# URL base of the variable definitions resource; filled via str.format with
# `subdomain` and `domain`.
VARDEF_URL_TEMPLATE = "https://{subdomain}.{domain}/variable-definitions"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class SsbNaisDomains(str, Enum):
    """Domain names available on SSB's Nais instance.

    Members are also plain strings, so they compare equal to their domain value.
    """

    # Test environment.
    TEST_EXTERNAL = "test.ssb.no"
    TEST_INTERNAL = "intern.test.ssb.no"

    # Production environment.
    PROD_EXTERNAL = "ssb.no"
    PROD_INTERNAL = "intern.ssb.no"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class ReferenceUrlTypes(Enum):
    """Broad category a reference URL belongs to.

    Handy when a URL has to be constructed from a URN for a specific context.
    """

    # Values are assigned automatically in declaration order.
    API = auto()
    FRONTEND = auto()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class UrnConverter:
    """Converts URLs for one kind of resource into URNs.

    Fields:
        urn_base: The format for the URN, up to the identifier.
        id_pattern: A capturing group pattern which matches identifiers for this resource.
        url_bases: The list of all the different URL representations for a resource. There
            will typically be a number of URL representations for a particular resource,
            depending on which system or technology they are accessed through and other
            technical factors. This list defines which concrete URLs can be considered
            equivalent to a URN. Each entry is treated as literal text, not as a regular
            expression.
    """

    urn_base: str
    id_pattern: str
    url_bases: list[tuple[ReferenceUrlTypes, str]]

    def _extract_id(self, url: str, pattern: re.Pattern[str]) -> str | None:
        """Return the identifier captured by ``pattern`` at the start of ``url``, else None."""
        if match := pattern.match(url):
            return match.group(1)
        return None

    def _build_pattern(self, url_base: str) -> re.Pattern[str]:
        """Compile a pattern matching ``url_base``, a slash and then an identifier."""
        # The URL base is literal text. Without escaping, '.' in a host name would
        # match any character and look-alike hosts would be accepted.
        return re.compile(f"^{re.escape(url_base)}/{self.id_pattern}")

    def build_urn(self, identifier: str) -> str:
        """Build a URN for the given identifier."""
        return f"{self.urn_base}:{identifier}"

    def convert_to_urn(self, url: str | AnyUrl) -> AnyUrl | None:
        """Convert a URL to a generalized URN for that same resource.

        Args:
            url (str | AnyUrl): The URL to convert.

        Returns:
            AnyUrl | None: The URN, or None if the URL matches none of the
            configured URL bases.
        """
        url_text = str(url)
        if url_text.startswith(self.urn_base):
            # In this case the value is already in the expected format and nothing needs to be done.
            return AnyUrl(url)
        # Try the URL bases in their configured order; the first match wins.
        for entry in self.url_bases:
            identifier = self._extract_id(url_text, self._build_pattern(entry[-1]))
            if identifier:
                return AnyUrl(self.build_urn(identifier))
        return None
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# Converter for variable definition references. Recognizes the "metadata" (API)
# and "catalog" (frontend) subdomains on every Nais domain. Ordering is all API
# URLs first, then all frontend URLs, each in SsbNaisDomains declaration order.
vardef_urn_converter = UrnConverter(
    urn_base="urn:ssb:variable-definition:vardef",
    id_pattern=r"([a-z0-9]{8})",
    url_bases=[
        (
            url_type,
            VARDEF_URL_TEMPLATE.format(subdomain=subdomain, domain=nais_domain.value),
        )
        for url_type, subdomain in (
            (ReferenceUrlTypes.API, "metadata"),
            (ReferenceUrlTypes.FRONTEND, "catalog"),
        )
        for nais_domain in SsbNaisDomains
    ],
)
|
|
122
|
+
|
|
123
|
+
# URL bases treated as equivalent for a classification reference: the Norwegian
# and English frontend pages plus the v1 API endpoint.
_KLASS_URL_BASES = [
    (ReferenceUrlTypes.FRONTEND, "https://www.ssb.no/klass/klassifikasjoner"),
    (ReferenceUrlTypes.FRONTEND, "https://www.ssb.no/en/klass/klassifikasjoner"),
    (ReferenceUrlTypes.API, "https://data.ssb.no/api/klass/v1/classifications"),
]

# Converter for classification references; identifiers are 1 to 5 digits.
klass_urn_converter = UrnConverter(
    urn_base="urn:ssb:classification:klass",
    id_pattern=r"([0-9]{1,5})",
    url_bases=_KLASS_URL_BASES,
)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def convert_uris_to_urns(
    variables: VariableListType, field_name: str, converters: Iterable[UrnConverter]
) -> None:
    """Where URIs are recognized URLs, convert them to URNs.

    The variables are updated in place. Where the value is not a known URL we
    preserve the value as it is and log an ERROR level message. Variables where
    the field is missing or empty are skipped.

    Args:
        variables (VariableListType): The list of variables.
        field_name (str): The name of the field which has URLs to convert to URNs
        converters (Iterable[UrnConverter]): One or more converters which implement
            conversion of URLs into one specific URN format. These will typically be
            specific to an individual metadata reference system. They are tried in
            order and the first successful conversion is used.
    """
    # Materialize once: the converters are consulted for every variable, and a
    # one-shot iterator would be exhausted after the first variable.
    available_converters = tuple(converters)
    for variable in variables:
        value = getattr(variable, field_name, None)
        if not value:
            continue
        # Take the first converter that actually recognizes the value. A converter
        # returning None must not stop the remaining converters from being tried.
        urn = next(
            (
                converted
                for converter in available_converters
                if (converted := converter.convert_to_urn(value))
            ),
            None,
        )
        if urn:
            setattr(variable, field_name, urn)
        else:
            logger.error(
                URN_ERROR_MESSAGE_TEMPLATE.format(
                    field_name=field_name,
                    short_name=variable.short_name,
                    value=value,
                )
            )
|
|
@@ -121,7 +121,7 @@ def derive_assessment_from_state(state: DataSetState) -> Assessment:
|
|
|
121
121
|
return Assessment.SENSITIVE
|
|
122
122
|
|
|
123
123
|
|
|
124
|
-
def set_default_values_variables(variables:
|
|
124
|
+
def set_default_values_variables(variables: VariableListType) -> None:
|
|
125
125
|
"""Set default values on variables.
|
|
126
126
|
|
|
127
127
|
Args:
|
{dapla_toolbelt_metadata-0.9.6.dist-info → dapla_toolbelt_metadata-0.9.7.dist-info}/METADATA
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dapla-toolbelt-metadata
|
|
3
|
-
Version: 0.9.6
|
|
3
|
+
Version: 0.9.7
|
|
4
4
|
Summary: Dapla Toolbelt Metadata
|
|
5
5
|
Project-URL: homepage, https://github.com/statisticsnorway/dapla-toolbelt-metadata
|
|
6
6
|
Project-URL: repository, https://github.com/statisticsnorway/dapla-toolbelt-metadata
|
|
@@ -16,15 +16,19 @@ Requires-Dist: beautifulsoup4>=4.12.3
|
|
|
16
16
|
Requires-Dist: cloudpathlib[gs]>=0.17.0
|
|
17
17
|
Requires-Dist: google-auth>=2.38.0
|
|
18
18
|
Requires-Dist: lxml>=5.3.1
|
|
19
|
+
Requires-Dist: pandas>=2.3.3
|
|
19
20
|
Requires-Dist: pyarrow>=8.0.0
|
|
20
21
|
Requires-Dist: pydantic>=2.5.2
|
|
21
22
|
Requires-Dist: pyjwt>=2.8.0
|
|
23
|
+
Requires-Dist: python-dateutil>=2.9.0.post0
|
|
22
24
|
Requires-Dist: python-dotenv>=1.0.1
|
|
25
|
+
Requires-Dist: pytz>=2025.2
|
|
23
26
|
Requires-Dist: requests>=2.31.0
|
|
24
27
|
Requires-Dist: ruamel-yaml>=0.18.10
|
|
25
28
|
Requires-Dist: ssb-datadoc-model<9.0.0,>=8.0.0
|
|
26
29
|
Requires-Dist: ssb-klass-python>=1.0.1
|
|
27
30
|
Requires-Dist: typing-extensions>=4.12.2
|
|
31
|
+
Requires-Dist: urllib3>=2.5.0
|
|
28
32
|
Description-Content-Type: text/markdown
|
|
29
33
|
|
|
30
34
|
# Dapla Toolbelt Metadata
|
|
@@ -8,7 +8,7 @@ dapla_metadata/dapla/user_info.py,sha256=bENez-ICt9ySR8orYebO68Q3_2LkIW9QTL58DTc
|
|
|
8
8
|
dapla_metadata/datasets/__init__.py,sha256=an-REJgi7N8-S1SCz-MYO_8as6fMe03WvhjRP_hWWkg,293
|
|
9
9
|
dapla_metadata/datasets/_merge.py,sha256=Tk5wQz6xZGr8veUAHZb42O8HARU8ObBJ_E4afvVWdlo,12993
|
|
10
10
|
dapla_metadata/datasets/code_list.py,sha256=JtCE-5Q8grAKvkn0KKjzeGhO-96O7yGsastbuoakreg,9057
|
|
11
|
-
dapla_metadata/datasets/core.py,sha256=
|
|
11
|
+
dapla_metadata/datasets/core.py,sha256=OMMCnKADTZGTqenu8_F4lwjQ1sVg4JSgqv5CNBv8eGk,20902
|
|
12
12
|
dapla_metadata/datasets/dapla_dataset_path_info.py,sha256=WPeV_mwKk2B9sXd14SaP-kTb1bOQ_8W2KtrqOG7sJIY,26867
|
|
13
13
|
dapla_metadata/datasets/dataset_parser.py,sha256=3dtRXNy1C8SfG8zTYWdY26nV4l-dG25IC_0J5t2bYwI,8285
|
|
14
14
|
dapla_metadata/datasets/model_validation.py,sha256=6qqq1ueTWRWBPTwEGJD49Pv7ksMEaq0iDtuOXelaw-s,7223
|
|
@@ -23,7 +23,8 @@ dapla_metadata/datasets/external_sources/external_sources.py,sha256=9eIcOIUbaodN
|
|
|
23
23
|
dapla_metadata/datasets/utility/__init__.py,sha256=pp6tUcgUbo8iq9OPtFKQrTbLuI3uY7NHptwWSTpasOU,33
|
|
24
24
|
dapla_metadata/datasets/utility/constants.py,sha256=YKsn6GfNIkwLoBp0yq209o0TbsEhsA_jGaZLVR984JU,2933
|
|
25
25
|
dapla_metadata/datasets/utility/enums.py,sha256=i6dcxWya5k4LjLdGGIM_H37rRndizug3peaAgoE5UdM,652
|
|
26
|
-
dapla_metadata/datasets/utility/
|
|
26
|
+
dapla_metadata/datasets/utility/urn.py,sha256=Y_4wYwWWaFDffIN3uXjCodi-uUUQ7zkX1qEFSwGlVqs,5317
|
|
27
|
+
dapla_metadata/datasets/utility/utils.py,sha256=q76UJI8W4j2aHSq1jz_AfYnJmLfygEflgUrQpqQEPnY,20157
|
|
27
28
|
dapla_metadata/standards/__init__.py,sha256=n8jnMrudLuScSdfQ4UMJorc-Ptg3Y1-ilT8zAaQnM70,179
|
|
28
29
|
dapla_metadata/standards/name_validator.py,sha256=6-DQE_EKVd6UjL--EXpFcZDQtusVbSFaWaUY-CfOV2c,9184
|
|
29
30
|
dapla_metadata/standards/standard_validators.py,sha256=tcCiCI76wUVtMzXA2oCgdauZc0uGgUi11FKu-t7KGwQ,3767
|
|
@@ -90,7 +91,7 @@ dapla_metadata/variable_definitions/_utils/constants.py,sha256=zr5FNVCEz6TM9PVEr
|
|
|
90
91
|
dapla_metadata/variable_definitions/_utils/files.py,sha256=JbPgPNQ7iA38juMqGEdcg5OjZZUwCb6NQtPL0AEspD0,10933
|
|
91
92
|
dapla_metadata/variable_definitions/_utils/template_files.py,sha256=7fcc7yEHOl5JUZ698kqj4IiikXPHBi3SrAVOk4wqQtw,3308
|
|
92
93
|
dapla_metadata/variable_definitions/_utils/variable_definition_files.py,sha256=sGhcSpckR9NtYGNh2oVkiCd5SI3bbJEBhc1PA2uShs0,4701
|
|
93
|
-
dapla_toolbelt_metadata-0.9.
|
|
94
|
-
dapla_toolbelt_metadata-0.9.
|
|
95
|
-
dapla_toolbelt_metadata-0.9.
|
|
96
|
-
dapla_toolbelt_metadata-0.9.
|
|
94
|
+
dapla_toolbelt_metadata-0.9.7.dist-info/METADATA,sha256=yb6SMXPh6nMfUPfOMyicbrcConcCoWgEtcgsUsl2jJ0,4854
|
|
95
|
+
dapla_toolbelt_metadata-0.9.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
96
|
+
dapla_toolbelt_metadata-0.9.7.dist-info/licenses/LICENSE,sha256=np3IfD5m0ZUofn_kVzDZqliozuiO6wrktw3LRPjyEiI,1073
|
|
97
|
+
dapla_toolbelt_metadata-0.9.7.dist-info/RECORD,,
|
|
File without changes
|
{dapla_toolbelt_metadata-0.9.6.dist-info → dapla_toolbelt_metadata-0.9.7.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|