dapla-toolbelt-metadata 0.9.6.dev1759398171__py3-none-any.whl → 0.9.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dapla-toolbelt-metadata might be problematic. Click here for more details.

@@ -13,9 +13,6 @@ from dotenv import load_dotenv
13
13
  from dapla_metadata._shared.enums import DaplaEnvironment
14
14
  from dapla_metadata._shared.enums import DaplaRegion
15
15
  from dapla_metadata._shared.enums import DaplaService
16
- from dapla_metadata.datasets.utility.constants import (
17
- DATADOC_STATISTICAL_SUBJECT_SOURCE_URL,
18
- )
19
16
 
20
17
  logger = logging.getLogger(__name__)
21
18
 
@@ -28,6 +25,12 @@ DAPLA_SERVICE = "DAPLA_SERVICE"
28
25
  DAPLA_GROUP_CONTEXT = "DAPLA_GROUP_CONTEXT"
29
26
  OIDC_TOKEN = "OIDC_TOKEN" # noqa: S105
30
27
 
28
+
29
+ DATADOC_STATISTICAL_SUBJECT_SOURCE_URL_DEFAULT = (
30
+ "https://www.ssb.no/xp/_/service/mimir/subjectStructurStatistics"
31
+ )
32
+
33
+
31
34
  env_loaded = False
32
35
 
33
36
 
@@ -73,7 +76,7 @@ def get_statistical_subject_source_url() -> str | None:
73
76
  """Get the URL to the statistical subject source."""
74
77
  return (
75
78
  get_config_item("DATADOC_STATISTICAL_SUBJECT_SOURCE_URL")
76
- or DATADOC_STATISTICAL_SUBJECT_SOURCE_URL
79
+ or DATADOC_STATISTICAL_SUBJECT_SOURCE_URL_DEFAULT
77
80
  )
78
81
 
79
82
 
@@ -33,7 +33,12 @@ from dapla_metadata.datasets.utility.constants import (
33
33
  from dapla_metadata.datasets.utility.constants import METADATA_DOCUMENT_FILE_SUFFIX
34
34
  from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_DATASET_FIELDS
35
35
  from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_VARIABLES_FIELDS
36
+ from dapla_metadata.datasets.utility.urn import convert_uris_to_urns
37
+ from dapla_metadata.datasets.utility.urn import klass_urn_converter
38
+ from dapla_metadata.datasets.utility.urn import vardef_urn_converter
36
39
  from dapla_metadata.datasets.utility.utils import OptionalDatadocMetadataType
40
+ from dapla_metadata.datasets.utility.utils import VariableListType
41
+ from dapla_metadata.datasets.utility.utils import VariableType
37
42
  from dapla_metadata.datasets.utility.utils import calculate_percentage
38
43
  from dapla_metadata.datasets.utility.utils import derive_assessment_from_state
39
44
  from dapla_metadata.datasets.utility.utils import get_timestamp_now
@@ -108,8 +113,8 @@ class Datadoc:
108
113
  self.container: all_optional_model.MetadataContainer | None = None
109
114
  self.dataset_path: pathlib.Path | CloudPath | None = None
110
115
  self.dataset = all_optional_model.Dataset()
111
- self.variables: list = []
112
- self.variables_lookup: dict[str, all_optional_model.Variable] = {}
116
+ self.variables: VariableListType = []
117
+ self.variables_lookup: dict[str, VariableType] = {}
113
118
  self.explicitly_defined_metadata_document = False
114
119
  self.dataset_consistency_status: list[DatasetConsistencyStatus] = []
115
120
  if metadata_document_path:
@@ -204,22 +209,24 @@ class Datadoc:
204
209
  else:
205
210
  self._set_metadata(existing_metadata or extracted_metadata)
206
211
 
207
- set_default_values_variables(self.variables)
208
- set_default_values_dataset(cast("all_optional_model.Dataset", self.dataset))
209
- set_dataset_owner(self.dataset)
210
- self._create_variables_lookup()
211
-
212
212
  def _set_metadata(
213
213
  self,
214
- merged_metadata: OptionalDatadocMetadataType,
214
+ metadata: OptionalDatadocMetadataType,
215
215
  ) -> None:
216
- if not merged_metadata or not (
217
- merged_metadata.dataset and merged_metadata.variables
218
- ):
216
+ if not metadata or not (metadata.dataset and metadata.variables):
219
217
  msg = "Could not read metadata"
220
218
  raise ValueError(msg)
221
- self.dataset = cast("all_optional_model.Dataset", merged_metadata.dataset)
222
- self.variables = merged_metadata.variables
219
+ self.dataset = cast("all_optional_model.Dataset", metadata.dataset)
220
+ self.variables = metadata.variables
221
+
222
+ set_default_values_variables(self.variables)
223
+ set_default_values_dataset(cast("all_optional_model.Dataset", self.dataset))
224
+ set_dataset_owner(self.dataset)
225
+ convert_uris_to_urns(self.variables, "definition_uri", [vardef_urn_converter])
226
+ convert_uris_to_urns(
227
+ self.variables, "classification_uri", [klass_urn_converter]
228
+ )
229
+ self._create_variables_lookup()
223
230
 
224
231
  def _create_variables_lookup(self) -> None:
225
232
  self.variables_lookup = {
@@ -94,10 +94,6 @@ DATASET_FIELDS_FROM_EXISTING_METADATA = [
94
94
 
95
95
  METADATA_DOCUMENT_FILE_SUFFIX = "__DOC.json"
96
96
 
97
- DATADOC_STATISTICAL_SUBJECT_SOURCE_URL = (
98
- "https://www.ssb.no/xp/_/service/mimir/subjectStructurStatistics"
99
- )
100
-
101
97
  PAPIS_STABLE_IDENTIFIER_TYPE = "FREG_SNR"
102
98
  PAPIS_ENCRYPTION_KEY_REFERENCE = "papis-common-key-1"
103
99
  DAEAD_ENCRYPTION_KEY_REFERENCE = "ssb-common-key-1"
@@ -0,0 +1,234 @@
1
+ """Validate, parse and render URNs."""
2
+
3
+ import logging
4
+ import re
5
+ from collections.abc import Iterable
6
+ from dataclasses import dataclass
7
+ from enum import Enum
8
+ from enum import auto
9
+ from typing import Literal
10
+
11
+ from pydantic import AnyUrl
12
+
13
+ from dapla_metadata._shared.config import get_dapla_environment
14
+ from dapla_metadata._shared.enums import DaplaEnvironment
15
+ from dapla_metadata.datasets.utility.utils import VariableListType
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ URN_ERROR_MESSAGE_BASE = "The URL is not in a supported format"
20
+
21
+ URN_ERROR_MESSAGE_TEMPLATE = (
22
+ URN_ERROR_MESSAGE_BASE
23
+ + " for field '{field_name}' of variable '{short_name}'. URL: '{value}'. Please contact Team Metadata if this URL should be supported."
24
+ )
25
+
26
+
27
+ VARDEF_URL_TEMPLATE = "https://{subdomain}.{domain}/variable-definitions"
28
+
29
+
30
+ class SsbNaisDomains(str, Enum):
31
+ """The available domains on SSBs Nais instance."""
32
+
33
+ TEST_EXTERNAL = "test.ssb.no"
34
+ TEST_INTERNAL = "intern.test.ssb.no"
35
+ PROD_EXTERNAL = "ssb.no"
36
+ PROD_INTERNAL = "intern.ssb.no"
37
+
38
+
39
+ class ReferenceUrlTypes(Enum):
40
+ """The general category of the URL.
41
+
42
+ This can be useful to refer to when constructing a URL from a URN for a
43
+ specific context.
44
+ """
45
+
46
+ API = auto()
47
+ FRONTEND = auto()
48
+
49
+
50
+ UrlVisibility = Literal["public", "internal"]
51
+
52
+
53
@dataclass
class UrnConverter:
    """Converts URLs to URNs and vice versa.

    Fields:
        urn_base: The format for the URN, up to the identifier.
        id_pattern: A capturing group pattern which matches identifiers for this resource.
        url_bases: The list of all the different URL representations for a resource. There
            will typically be a number of URL representations for a particular resource,
            depending on which system or technology they are accessed through and other
            technical factors. This list defines which concrete URLs can be considered
            equivalent to a URN.
    """

    urn_base: str
    id_pattern: str
    url_bases: list[tuple[ReferenceUrlTypes, str]]

    def _extract_id(self, url: str, pattern: re.Pattern[str]) -> str | None:
        """Return the identifier captured by ``pattern`` in ``url``, or None on no match."""
        if match := pattern.match(url):
            return match.group(1)
        return None

    def _build_pattern(self, url_base: str) -> re.Pattern[str]:
        """Compile a pattern matching ``url_base`` followed by ``/`` and an identifier.

        The URL base is escaped so that it is matched literally; without escaping,
        each "." in a host name would match any character and look-alike hosts
        would be treated as equivalent. Escaping adds no groups, so group 1 is
        still the one supplied by ``id_pattern``.
        """
        return re.compile(f"^{re.escape(url_base)}/{self.id_pattern}")

    def get_urn(self, identifier: str) -> str:
        """Build a URN for the given identifier."""
        return f"{self.urn_base}:{identifier}"

    def get_url(
        self,
        identifier: str,
        url_type: ReferenceUrlTypes,
        visibility: Literal["public", "internal"] = "public",
    ) -> str | None:
        """Build concrete URL to reference a resource.

        There are typically multiple URLs used to refer to one resource, this method attempts to support known variations.

        Args:
            identifier (str): The identifier of the resource the URL refers to.
            url_type (ReferenceUrlTypes): The representation type of the URL
            visibility (UrlVisibility, optional): Whether the URL should be that which is publicly available or not. Defaults to "public".

        Returns:
            str | None: The concrete URL. None if we cannot satisfy the supplied requirements.
        """
        candidates = [base[-1] for base in self.url_bases if base[0] == url_type]

        def matches_visibility(url: str, visibility: UrlVisibility) -> bool:
            # Internal URLs are recognized by an ".intern." host component.
            return (".intern." in url) is (visibility == "internal")

        def matches_environment(url: str) -> bool:
            # Test URLs are recognized by a ".test." host component; every
            # environment other than TEST uses the URLs without it.
            current_environment = get_dapla_environment()
            if current_environment == DaplaEnvironment.TEST:
                return ".test." in url
            return ".test." not in url

        if url := next(
            (
                url
                for url in candidates
                if matches_visibility(url, visibility) and matches_environment(url)
            ),
            None,
        ):
            return url + "/" + identifier
        return None

    def get_id(self, urn_or_url: str | AnyUrl) -> str | None:
        """Get an identifier from a URN or URL.

        Args:
            urn_or_url (str | AnyUrl): The URN or URL referring to a particular resource

        Returns:
            str | None: The identifier for the resource, or None if it cannot be extracted.
        """
        value = str(urn_or_url)
        # Include the ":" separator in the check; a value which merely begins with
        # the same characters as the URN base carries no identifier for this type.
        urn_prefix = self.urn_base + ":"
        if value.startswith(urn_prefix):
            return value.removeprefix(urn_prefix) or None
        return self._extract_id_from_url(value)

    def is_id(self, value: str) -> bool:
        """Check if the value is an identifier for this URN type.

        Args:
            value (str): The value to check.

        Returns:
            bool: True if the whole value matches ``id_pattern``, otherwise False.
        """
        if not isinstance(value, str):
            # Mypy thinks it's impossible to reach this branch, but there are no guarantees in Python.
            return False  # type: ignore [unreachable]
        # fullmatch rather than "^...$" with match, since "$" also matches just
        # before a trailing newline.
        return re.fullmatch(self.id_pattern, value) is not None

    def _extract_id_from_url(self, url: str | AnyUrl) -> str | None:
        """Return the identifier from the first URL base which matches ``url``, else None."""
        url_text = str(url)
        for base in self.url_bases:
            if identifier := self._extract_id(url_text, self._build_pattern(base[-1])):
                return identifier
        return None

    def convert_url_to_urn(self, url: str | AnyUrl) -> AnyUrl | None:
        """Convert a URL to a generalized URN for that same resource.

        Args:
            url (str | AnyUrl): The URL to convert.

        Returns:
            AnyUrl | None: The URN or None if it can't be converted.
        """
        url_text = str(url)
        if url_text.startswith(self.urn_base + ":"):
            # In this case the value is already in the expected format and nothing needs to be done.
            return AnyUrl(url_text)
        if identifier := self._extract_id_from_url(url_text):
            return AnyUrl(self.get_urn(identifier))

        return None
169
+
170
+
171
# Variable definitions: the API is served from the "metadata" subdomain and the
# frontend from the "catalog" subdomain, each on every SSB Nais domain. All API
# bases are listed before the frontend bases.
vardef_urn_converter = UrnConverter(
    urn_base="urn:ssb:variable-definition:vardef",
    id_pattern=r"([a-z0-9]{8})",
    url_bases=[
        (
            url_type,
            VARDEF_URL_TEMPLATE.format(subdomain=subdomain, domain=nais_domain.value),
        )
        for url_type, subdomain in (
            (ReferenceUrlTypes.API, "metadata"),
            (ReferenceUrlTypes.FRONTEND, "catalog"),
        )
        for nais_domain in SsbNaisDomains
    ],
)

# KLASS classifications: two frontend URL bases and one API URL base, identified
# by a numeric id of one to five digits.
klass_urn_converter = UrnConverter(
    urn_base="urn:ssb:classification:klass",
    id_pattern=r"([0-9]{1,5})",
    url_bases=[
        (ReferenceUrlTypes.FRONTEND, "https://www.ssb.no/klass/klassifikasjoner"),
        (ReferenceUrlTypes.FRONTEND, "https://www.ssb.no/en/klass/klassifikasjoner"),
        (ReferenceUrlTypes.API, "https://data.ssb.no/api/klass/v1/classifications"),
    ],
)
205
+
206
+
207
def convert_uris_to_urns(
    variables: VariableListType, field_name: str, converters: Iterable[UrnConverter]
) -> None:
    """Where URIs are recognized URLs, convert them to URNs.

    Each converter is tried in order and the first one which recognizes the value
    supplies the URN. Where the value is not a known URL we preserve the value as
    it is and log an ERROR level message. Variables where the field is missing or
    empty are left untouched.

    Args:
        variables (VariableListType): The list of variables. Modified in place.
        field_name (str): The name of the field which has URLs to convert to URNs
        converters (Iterable[UrnConverter]): One or more converters which implement
            conversion of URLs into one specific URN format. These will typically be
            specific to an individual metadata reference system.
    """
    # Materialize once: every variable must be offered to every converter, which
    # a one-shot iterable (e.g. a generator) would not allow.
    available_converters = tuple(converters)
    for v in variables:
        field = getattr(v, field_name, None)
        if not field:
            continue

        # Keep trying until a converter recognizes the value; stopping at the
        # first converter would ignore all the others when it does not match.
        urn = None
        for converter in available_converters:
            urn = converter.convert_url_to_urn(field)
            if urn:
                break

        if urn:
            setattr(v, field_name, urn)
        else:
            logger.error(
                URN_ERROR_MESSAGE_TEMPLATE.format(
                    field_name=field_name,
                    short_name=v.short_name,
                    value=field,
                )
            )
@@ -121,7 +121,7 @@ def derive_assessment_from_state(state: DataSetState) -> Assessment:
121
121
  return Assessment.SENSITIVE
122
122
 
123
123
 
124
- def set_default_values_variables(variables: list) -> None:
124
+ def set_default_values_variables(variables: VariableListType) -> None:
125
125
  """Set default values on variables.
126
126
 
127
127
  Args:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dapla-toolbelt-metadata
3
- Version: 0.9.6.dev1759398171
3
+ Version: 0.9.8
4
4
  Summary: Dapla Toolbelt Metadata
5
5
  Project-URL: homepage, https://github.com/statisticsnorway/dapla-toolbelt-metadata
6
6
  Project-URL: repository, https://github.com/statisticsnorway/dapla-toolbelt-metadata
@@ -16,15 +16,19 @@ Requires-Dist: beautifulsoup4>=4.12.3
16
16
  Requires-Dist: cloudpathlib[gs]>=0.17.0
17
17
  Requires-Dist: google-auth>=2.38.0
18
18
  Requires-Dist: lxml>=5.3.1
19
+ Requires-Dist: pandas>=2.3.3
19
20
  Requires-Dist: pyarrow>=8.0.0
20
21
  Requires-Dist: pydantic>=2.5.2
21
22
  Requires-Dist: pyjwt>=2.8.0
23
+ Requires-Dist: python-dateutil>=2.9.0.post0
22
24
  Requires-Dist: python-dotenv>=1.0.1
25
+ Requires-Dist: pytz>=2025.2
23
26
  Requires-Dist: requests>=2.31.0
24
27
  Requires-Dist: ruamel-yaml>=0.18.10
25
28
  Requires-Dist: ssb-datadoc-model<9.0.0,>=8.0.0
26
29
  Requires-Dist: ssb-klass-python>=1.0.1
27
30
  Requires-Dist: typing-extensions>=4.12.2
31
+ Requires-Dist: urllib3>=2.5.0
28
32
  Description-Content-Type: text/markdown
29
33
 
30
34
  # Dapla Toolbelt Metadata
@@ -1,6 +1,6 @@
1
1
  dapla_metadata/__init__.py,sha256=37yh9XWYQoLIVIS_fDdwNN8OXzbYY-6kMYwvjQrLMJQ,428
2
2
  dapla_metadata/_shared/__init__.py,sha256=qUFgnVhBVlPRQP0ePmY76c8FvWRrJ-9c5GvzibwERnQ,103
3
- dapla_metadata/_shared/config.py,sha256=QqXcmP66AfXF8wi6FMsa7et7kH2k4EJPOF4IELKuQig,3213
3
+ dapla_metadata/_shared/config.py,sha256=xRAXDULS85lAi5XyAirh1acDbf7_0xPhElssXvQQk3Y,3245
4
4
  dapla_metadata/_shared/enums.py,sha256=WHkH1d8xw41gOly6au_izZB1_-6XTcKu5rhBWUImjp8,509
5
5
  dapla_metadata/_shared/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  dapla_metadata/dapla/__init__.py,sha256=tkapF-YwmruPPrKvN3pEoCZqb7xvJx_ogBM8XyGMuJI,130
@@ -8,7 +8,7 @@ dapla_metadata/dapla/user_info.py,sha256=bENez-ICt9ySR8orYebO68Q3_2LkIW9QTL58DTc
8
8
  dapla_metadata/datasets/__init__.py,sha256=an-REJgi7N8-S1SCz-MYO_8as6fMe03WvhjRP_hWWkg,293
9
9
  dapla_metadata/datasets/_merge.py,sha256=Tk5wQz6xZGr8veUAHZb42O8HARU8ObBJ_E4afvVWdlo,12993
10
10
  dapla_metadata/datasets/code_list.py,sha256=JtCE-5Q8grAKvkn0KKjzeGhO-96O7yGsastbuoakreg,9057
11
- dapla_metadata/datasets/core.py,sha256=p-2OJsAEWCUqBlzn0YIYkK-pAgtvMROdoxXvCyjfWYs,20434
11
+ dapla_metadata/datasets/core.py,sha256=OMMCnKADTZGTqenu8_F4lwjQ1sVg4JSgqv5CNBv8eGk,20902
12
12
  dapla_metadata/datasets/dapla_dataset_path_info.py,sha256=WPeV_mwKk2B9sXd14SaP-kTb1bOQ_8W2KtrqOG7sJIY,26867
13
13
  dapla_metadata/datasets/dataset_parser.py,sha256=3dtRXNy1C8SfG8zTYWdY26nV4l-dG25IC_0J5t2bYwI,8285
14
14
  dapla_metadata/datasets/model_validation.py,sha256=6qqq1ueTWRWBPTwEGJD49Pv7ksMEaq0iDtuOXelaw-s,7223
@@ -21,9 +21,10 @@ dapla_metadata/datasets/compatibility/model_backwards_compatibility.py,sha256=W5
21
21
  dapla_metadata/datasets/external_sources/__init__.py,sha256=qvIdXwqyEmXNUCB94ZtZXRzifdW4hiXASFFPtC70f6E,83
22
22
  dapla_metadata/datasets/external_sources/external_sources.py,sha256=9eIcOIUbaodNX1w9Tj2wl4U4wUmr5kF1R0i01fKUzGs,2974
23
23
  dapla_metadata/datasets/utility/__init__.py,sha256=pp6tUcgUbo8iq9OPtFKQrTbLuI3uY7NHptwWSTpasOU,33
24
- dapla_metadata/datasets/utility/constants.py,sha256=YKsn6GfNIkwLoBp0yq209o0TbsEhsA_jGaZLVR984JU,2933
24
+ dapla_metadata/datasets/utility/constants.py,sha256=4ixDvz5nErQwXa3BEtaGZb2AFUUUldJtNZV46SKwUBc,2817
25
25
  dapla_metadata/datasets/utility/enums.py,sha256=i6dcxWya5k4LjLdGGIM_H37rRndizug3peaAgoE5UdM,652
26
- dapla_metadata/datasets/utility/utils.py,sha256=Enlmhj1BA7C9Im1ju3EwS6_kV1cpT53wb2cCtBGs_lI,20145
26
+ dapla_metadata/datasets/utility/urn.py,sha256=1NtM9OkTUe4_T5Iy8cR-ofwuR0gUlrtA7oQqEmgWKOc,8141
27
+ dapla_metadata/datasets/utility/utils.py,sha256=q76UJI8W4j2aHSq1jz_AfYnJmLfygEflgUrQpqQEPnY,20157
27
28
  dapla_metadata/standards/__init__.py,sha256=n8jnMrudLuScSdfQ4UMJorc-Ptg3Y1-ilT8zAaQnM70,179
28
29
  dapla_metadata/standards/name_validator.py,sha256=6-DQE_EKVd6UjL--EXpFcZDQtusVbSFaWaUY-CfOV2c,9184
29
30
  dapla_metadata/standards/standard_validators.py,sha256=tcCiCI76wUVtMzXA2oCgdauZc0uGgUi11FKu-t7KGwQ,3767
@@ -90,7 +91,7 @@ dapla_metadata/variable_definitions/_utils/constants.py,sha256=zr5FNVCEz6TM9PVEr
90
91
  dapla_metadata/variable_definitions/_utils/files.py,sha256=JbPgPNQ7iA38juMqGEdcg5OjZZUwCb6NQtPL0AEspD0,10933
91
92
  dapla_metadata/variable_definitions/_utils/template_files.py,sha256=7fcc7yEHOl5JUZ698kqj4IiikXPHBi3SrAVOk4wqQtw,3308
92
93
  dapla_metadata/variable_definitions/_utils/variable_definition_files.py,sha256=sGhcSpckR9NtYGNh2oVkiCd5SI3bbJEBhc1PA2uShs0,4701
93
- dapla_toolbelt_metadata-0.9.6.dev1759398171.dist-info/METADATA,sha256=kFTC8HmlIehHQghGFGBO0mXum-RNqrAm4jw3Tq9tIUE,4737
94
- dapla_toolbelt_metadata-0.9.6.dev1759398171.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
95
- dapla_toolbelt_metadata-0.9.6.dev1759398171.dist-info/licenses/LICENSE,sha256=np3IfD5m0ZUofn_kVzDZqliozuiO6wrktw3LRPjyEiI,1073
96
- dapla_toolbelt_metadata-0.9.6.dev1759398171.dist-info/RECORD,,
94
+ dapla_toolbelt_metadata-0.9.8.dist-info/METADATA,sha256=jBwnT3Vs5w-JIXTjzcMDmzhDr8XPlkkdU24zxTNaz4w,4854
95
+ dapla_toolbelt_metadata-0.9.8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
96
+ dapla_toolbelt_metadata-0.9.8.dist-info/licenses/LICENSE,sha256=np3IfD5m0ZUofn_kVzDZqliozuiO6wrktw3LRPjyEiI,1073
97
+ dapla_toolbelt_metadata-0.9.8.dist-info/RECORD,,