dapla-toolbelt-metadata 0.9.3-py3-none-any.whl → 0.9.5-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.

This version of dapla-toolbelt-metadata has been flagged as potentially problematic.

@@ -33,7 +33,10 @@ from dapla_metadata.datasets.utility.constants import (
 from dapla_metadata.datasets.utility.constants import METADATA_DOCUMENT_FILE_SUFFIX
 from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_DATASET_FIELDS
 from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_VARIABLES_FIELDS
+from dapla_metadata.datasets.utility.urn import convert_definition_uris_to_urns
 from dapla_metadata.datasets.utility.utils import OptionalDatadocMetadataType
+from dapla_metadata.datasets.utility.utils import VariableListType
+from dapla_metadata.datasets.utility.utils import VariableType
 from dapla_metadata.datasets.utility.utils import calculate_percentage
 from dapla_metadata.datasets.utility.utils import derive_assessment_from_state
 from dapla_metadata.datasets.utility.utils import get_timestamp_now
@@ -108,8 +111,8 @@ class Datadoc:
         self.container: all_optional_model.MetadataContainer | None = None
         self.dataset_path: pathlib.Path | CloudPath | None = None
         self.dataset = all_optional_model.Dataset()
-        self.variables: list = []
-        self.variables_lookup: dict[str, all_optional_model.Variable] = {}
+        self.variables: VariableListType = []
+        self.variables_lookup: dict[str, VariableType] = {}
         self.explicitly_defined_metadata_document = False
         self.dataset_consistency_status: list[DatasetConsistencyStatus] = []
         if metadata_document_path:
@@ -204,22 +207,21 @@ class Datadoc:
         else:
             self._set_metadata(existing_metadata or extracted_metadata)

-        set_default_values_variables(self.variables)
-        set_default_values_dataset(cast("all_optional_model.Dataset", self.dataset))
-        set_dataset_owner(self.dataset)
-        self._create_variables_lookup()
-
     def _set_metadata(
         self,
-        merged_metadata: OptionalDatadocMetadataType,
+        metadata: OptionalDatadocMetadataType,
     ) -> None:
-        if not merged_metadata or not (
-            merged_metadata.dataset and merged_metadata.variables
-        ):
+        if not metadata or not (metadata.dataset and metadata.variables):
             msg = "Could not read metadata"
             raise ValueError(msg)
-        self.dataset = cast("all_optional_model.Dataset", merged_metadata.dataset)
-        self.variables = merged_metadata.variables
+        self.dataset = cast("all_optional_model.Dataset", metadata.dataset)
+        self.variables = metadata.variables
+
+        set_default_values_variables(self.variables)
+        set_default_values_dataset(cast("all_optional_model.Dataset", self.dataset))
+        set_dataset_owner(self.dataset)
+        convert_definition_uris_to_urns(self.variables)
+        self._create_variables_lookup()

     def _create_variables_lookup(self) -> None:
         self.variables_lookup = {
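For context, the net effect of the Datadoc changes above is that default values, dataset owner, and URN conversion now run inside _set_metadata, so definition URIs are normalized before the variables lookup is built. A minimal usage sketch (the path is hypothetical, and the keyword name is assumed from the __init__ body shown above rather than taken from documented API):

from dapla_metadata.datasets.core import Datadoc

# Hypothetical dataset path; any dataset Datadoc can read will do.
meta = Datadoc(dataset_path="gs://my-bucket/klargjorte_data/person_data_p2023_v1.parquet")

# As of 0.9.5, definition URIs pointing at recognised Vardef URLs have
# already been rewritten to URNs by convert_definition_uris_to_urns().
for variable in meta.variables:
    print(variable.short_name, variable.definition_uri)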
@@ -13,20 +13,19 @@ from typing_extensions import Self

 from dapla_metadata.datasets.utility.constants import DATE_VALIDATION_MESSAGE
 from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_DATASET_FIELDS
-from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_VARIABLES_FIELDS
 from dapla_metadata.datasets.utility.constants import OBLIGATORY_METADATA_WARNING
 from dapla_metadata.datasets.utility.utils import get_missing_obligatory_dataset_fields
 from dapla_metadata.datasets.utility.utils import (
     get_missing_obligatory_variables_fields,
 )
+from dapla_metadata.datasets.utility.utils import (
+    get_missing_obligatory_variables_pseudo_fields,
+)
 from dapla_metadata.datasets.utility.utils import get_timestamp_now
 from dapla_metadata.datasets.utility.utils import incorrect_date_order
 from dapla_metadata.datasets.utility.utils import (
     num_obligatory_dataset_fields_completed,
 )
-from dapla_metadata.datasets.utility.utils import (
-    num_obligatory_variables_fields_completed,
-)
 from dapla_metadata.datasets.utility.utils import set_variables_inherit_from_dataset

 if TYPE_CHECKING:
@@ -146,21 +145,31 @@ class ValidateDatadocMetadata(model.DatadocMetadata):
             ObligatoryVariableWarning: If not all obligatory variable metadata fields
                 are filled in.
         """
-        if self.variables is not None and num_obligatory_variables_fields_completed(
-            self.variables,
-        ) != (NUM_OBLIGATORY_VARIABLES_FIELDS * len(self.variables)):
-            warnings.warn(
-                f"{OBLIGATORY_METADATA_WARNING} {get_missing_obligatory_variables_fields(self.variables)}",
-                ObligatoryVariableWarning,
-                stacklevel=2,
-            )
-            logger.warning(
-                "Type warning: %s.%s %s",
-                ObligatoryVariableWarning,
-                OBLIGATORY_METADATA_WARNING,
-                get_missing_obligatory_variables_fields(self.variables),
-            )
-
+        if self.variables is not None:
+            missing_fields_dict = {}
+            for d in get_missing_obligatory_variables_fields(self.variables):
+                for var, fields in d.items():
+                    missing_fields_dict[var] = fields.copy()
+
+            for d in get_missing_obligatory_variables_pseudo_fields(self.variables):
+                for var, fields in d.items():
+                    if var in missing_fields_dict:
+                        missing_fields_dict[var].extend(fields)
+                    else:
+                        missing_fields_dict[var] = fields.copy()
+
+            missing_fields = [
+                {var: fields} for var, fields in missing_fields_dict.items()
+            ]
+            if missing_fields:
+                message = f"{OBLIGATORY_METADATA_WARNING} {missing_fields}"
+                warnings.warn(message, ObligatoryVariableWarning, stacklevel=2)
+                logger.warning(
+                    "Type warning: %s.%s %s",
+                    ObligatoryVariableWarning,
+                    OBLIGATORY_METADATA_WARNING,
+                    missing_fields,
+                )
         return self

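The validation change above merges the ordinary missing-field report with the pseudonymization report per variable before emitting a single ObligatoryVariableWarning. A standalone sketch of just that merge step, with invented variable names and field lists:

# Toy inputs shaped like the return values of
# get_missing_obligatory_variables_fields and
# get_missing_obligatory_variables_pseudo_fields.
regular = [{"pers_id": ["data_source"]}, {"tidspunkt": ["temporality_type"]}]
pseudo = [{"pers_id": ["encryption_algorithm", "encryption_key_reference"]}]

missing_fields_dict: dict[str, list[str]] = {}
for d in regular:
    for var, fields in d.items():
        missing_fields_dict[var] = fields.copy()
for d in pseudo:
    for var, fields in d.items():
        if var in missing_fields_dict:
            missing_fields_dict[var].extend(fields)
        else:
            missing_fields_dict[var] = fields.copy()

missing_fields = [{var: fields} for var, fields in missing_fields_dict.items()]
# [{'pers_id': ['data_source', 'encryption_algorithm', 'encryption_key_reference']},
#  {'tidspunkt': ['temporality_type']}]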
@@ -47,9 +47,9 @@ OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS = [
     "temporality_type",
 ]

-OBLIGATORY_VARIABLES_PESUODONYMIZATION_IDENTIFIERS = [
+OBLIGATORY_VARIABLES_PSEUDONYMIZATION_IDENTIFIERS = [
     "encryption_algorithm",
-    "encryption_key_refrence",
+    "encryption_key_reference",
 ]

@@ -0,0 +1,129 @@
+"""Validate, parse and render URNs."""
+
+import logging
+import re
+from dataclasses import dataclass
+from enum import Enum
+from enum import auto
+
+from pydantic import AnyUrl
+
+from dapla_metadata.datasets.utility.utils import VariableListType
+
+logger = logging.getLogger(__name__)
+
+
+VARDEF_URL_TEMPLATE = "https://{subdomain}.{domain}/variable-definitions"
+
+
+class SsbNaisDomains(str, Enum):
+    """The available domains on SSBs Nais instance."""
+
+    TEST_EXTERNAL = "test.ssb.no"
+    TEST_INTERNAL = "intern.test.ssb.no"
+    PROD_EXTERNAL = "ssb.no"
+    PROD_INTERNAL = "intern.ssb.no"
+
+
+class ReferenceUrlTypes(Enum):
+    """The general category of the URL.
+
+    This can be useful to refer to when constructing a URL from a URN for a
+    specific context.
+    """
+
+    API = auto()
+    FRONTEND = auto()
+
+
+@dataclass
+class UrnConverter:
+    """Converts URLs to URNs and vice versa.
+
+    Fields:
+        urn_base: The format for the URN, up to the identifier.
+        id_pattern: A capturing group pattern which matches identifiers for this resource.
+        url_bases: The list of all the different URL representations for a resource. There
+            will typically be a number of URL representations for a particular resource,
+            depending on which system or technology they are accessed through and other
+            technical factors. This list defines which concrete URLs can be considered
+            equivalent to a URN.
+    """
+
+    urn_base: str
+    id_pattern: str
+    url_bases: list[tuple[ReferenceUrlTypes, str]]
+
+    def _extract_id(self, url: str, pattern: re.Pattern[str]) -> str | None:
+        if match := pattern.match(url):
+            return match.group(1)
+        return None
+
+    def _build_pattern(self, url_base: str) -> re.Pattern[str]:
+        return re.compile(f"^{url_base}/{self.id_pattern}")
+
+    def build_urn(self, identifier: str) -> str:
+        """Build a URN for the given identifier."""
+        return f"{self.urn_base}:{identifier}"
+
+    def convert_to_urn(self, url: str | AnyUrl | None) -> str | None:
+        """Convert a URL to a generalized URN for that same resource.
+
+        Args:
+            url (str | None): The URL to convert.
+
+        Returns:
+            str | None: The URN or None if it can't be converted.
+        """
+        if not url:
+            return None
+
+        patterns = (self._build_pattern(url[-1]) for url in self.url_bases)
+        matches = (self._extract_id(str(url), p) for p in patterns)
+        identifier = next((m for m in matches if m), None)
+        if identifier:
+            return self.build_urn(identifier)
+
+        return None
+
+
+vardef_urn_converter = UrnConverter(
+    urn_base="urn:ssb:variable-definition:vardef",
+    id_pattern=r"([a-z0-9]{8})",
+    url_bases=[
+        *[
+            (
+                ReferenceUrlTypes.API,
+                VARDEF_URL_TEMPLATE.format(
+                    subdomain="metadata", domain=nais_domain.value
+                ),
+            )
+            for nais_domain in SsbNaisDomains
+        ],
+        *[
+            (
+                ReferenceUrlTypes.FRONTEND,
+                VARDEF_URL_TEMPLATE.format(
+                    subdomain="catalog", domain=nais_domain.value
+                ),
+            )
+            for nais_domain in SsbNaisDomains
+        ],
+    ],
+)
+
+
+def convert_definition_uris_to_urns(variables: VariableListType) -> None:
+    """Where definition URIs are recognized URLs, convert them to URNs.
+
+    Where the value is not a known URL we preserve the value as it is and log an
+    ERROR level message.
+
+    Args:
+        variables (VariableListType): The list of variables.
+    """
+    for v in variables:
+        if urn := vardef_urn_converter.convert_to_urn(v.definition_uri):
+            v.definition_uri = urn  # type: ignore [assignment]
+        else:
+            logger.error("Could not convert value to URN: %s", v.definition_uri)
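A short usage sketch for the new dapla_metadata/datasets/utility/urn.py module, assuming version 0.9.5 is installed; the eight-character identifier is made up:

from dapla_metadata.datasets.utility.urn import vardef_urn_converter

# Any of the API or catalog hosts listed in vardef_urn_converter.url_bases
# is reduced to the same stable URN.
url = "https://metadata.intern.ssb.no/variable-definitions/abc12345"
print(vardef_urn_converter.convert_to_urn(url))
# urn:ssb:variable-definition:vardef:abc12345

# Unrecognised values yield None from convert_to_urn; at the call site,
# convert_definition_uris_to_urns() keeps the original value and logs an error.
print(vardef_urn_converter.convert_to_urn("https://example.com/abc12345"))
# None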
@@ -36,6 +36,9 @@ from dapla_metadata.datasets.utility.constants import (
 from dapla_metadata.datasets.utility.constants import (
     OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS_MULTILANGUAGE,
 )
+from dapla_metadata.datasets.utility.constants import (
+    OBLIGATORY_VARIABLES_PSEUDONYMIZATION_IDENTIFIERS,
+)
 from dapla_metadata.datasets.utility.constants import PAPIS_ENCRYPTION_KEY_REFERENCE
 from dapla_metadata.datasets.utility.constants import PAPIS_STABLE_IDENTIFIER_TYPE
 from dapla_metadata.datasets.utility.enums import EncryptionAlgorithm
@@ -118,7 +121,7 @@ def derive_assessment_from_state(state: DataSetState) -> Assessment:
     return Assessment.SENSITIVE


-def set_default_values_variables(variables: list) -> None:
+def set_default_values_variables(variables: VariableListType) -> None:
     """Set default values on variables.

     Args:
@@ -367,6 +370,25 @@ def num_obligatory_variable_fields_completed(
     return NUM_OBLIGATORY_VARIABLES_FIELDS - len(missing_metadata)


+def num_obligatory_pseudo_fields_missing(
+    variables: list[all_optional_model.Variable],
+) -> int:
+    """Counts the number of obligatory pseudonymization fields are missing.
+
+    Args:
+        variables: The variables to count obligatory fields for.
+
+    Returns:
+        The number of obligatory pseudonymization fields that are missing.
+    """
+    return sum(
+        getattr(v.pseudonymization, field, None) is None
+        for v in variables
+        if v.pseudonymization is not None
+        for field in OBLIGATORY_VARIABLES_PSEUDONYMIZATION_IDENTIFIERS
+    )
+
+
 def get_missing_obligatory_dataset_fields(
     dataset: DatasetType,
 ) -> list:
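A toy illustration of the counting logic in num_obligatory_pseudo_fields_missing, using SimpleNamespace stand-ins rather than the real pydantic models:

from types import SimpleNamespace

# Two stand-in variables: one pseudonymized with a missing key reference,
# one without a pseudonymization block (skipped entirely by the sum).
variables = [
    SimpleNamespace(
        pseudonymization=SimpleNamespace(
            encryption_algorithm="example-algorithm",
            encryption_key_reference=None,
        )
    ),
    SimpleNamespace(pseudonymization=None),
]

OBLIGATORY = ["encryption_algorithm", "encryption_key_reference"]
missing = sum(
    getattr(v.pseudonymization, field, None) is None
    for v in variables
    if v.pseudonymization is not None
    for field in OBLIGATORY
)
print(missing)  # 1 -> only encryption_key_reference is unset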
@@ -433,6 +455,40 @@ def get_missing_obligatory_variables_fields(variables: list) -> list[dict]:
     return [item for item in missing_variables_fields if next(iter(item.values()))]


+def get_missing_obligatory_variables_pseudo_fields(
+    variables: list[all_optional_model.Variable],
+) -> list[dict]:
+    """Identify obligatory variable pseudonymization fields that are missing values for each variable.
+
+    This function checks for obligatory fields that are directly missing
+    (i.e., set to `None`).
+
+    Args:
+        variables: A list of variable objects to check for missing obligatory pseudonymization fields.
+
+    Returns:
+        A list of dictionaries with variable short names as keys and list of missing
+        obligatory variable pseudonymization fields as values. This includes:
+        - Fields that are directly `None` and are listed as obligatory metadata.
+    """
+    return [
+        {
+            v.short_name: [
+                key
+                for key, value in v.pseudonymization.model_dump().items()
+                if _is_missing_metadata(
+                    key,
+                    value,
+                    OBLIGATORY_VARIABLES_PSEUDONYMIZATION_IDENTIFIERS,
+                    [],
+                )
+            ]
+        }
+        for v in variables
+        if v.pseudonymization is not None
+    ]
+
+
 def running_in_notebook() -> bool:
     """Return True if running in Jupyter Notebook."""
     try:
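For reference, get_missing_obligatory_variables_pseudo_fields returns one dict per pseudonymized variable, keyed by short name; unlike the non-pseudo variant above, entries with an empty list are not filtered out, and variables without a pseudonymization block are omitted. A sketch of a possible return value, with invented short names:

# Hypothetical result: 'pers_id' is missing its key reference,
# 'konto_id' has all obligatory pseudonymization fields filled in.
expected_shape = [
    {"pers_id": ["encryption_key_reference"]},
    {"konto_id": []},
]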
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dapla-toolbelt-metadata
-Version: 0.9.3
+Version: 0.9.5
 Summary: Dapla Toolbelt Metadata
 Project-URL: homepage, https://github.com/statisticsnorway/dapla-toolbelt-metadata
 Project-URL: repository, https://github.com/statisticsnorway/dapla-toolbelt-metadata
@@ -8,10 +8,10 @@ dapla_metadata/dapla/user_info.py,sha256=bENez-ICt9ySR8orYebO68Q3_2LkIW9QTL58DTc
 dapla_metadata/datasets/__init__.py,sha256=an-REJgi7N8-S1SCz-MYO_8as6fMe03WvhjRP_hWWkg,293
 dapla_metadata/datasets/_merge.py,sha256=Tk5wQz6xZGr8veUAHZb42O8HARU8ObBJ_E4afvVWdlo,12993
 dapla_metadata/datasets/code_list.py,sha256=JtCE-5Q8grAKvkn0KKjzeGhO-96O7yGsastbuoakreg,9057
-dapla_metadata/datasets/core.py,sha256=p-2OJsAEWCUqBlzn0YIYkK-pAgtvMROdoxXvCyjfWYs,20434
+dapla_metadata/datasets/core.py,sha256=GVDnzLZagTpaXin6eC1xB62U-LFinDuWVeZmyx-9pas,20633
 dapla_metadata/datasets/dapla_dataset_path_info.py,sha256=WPeV_mwKk2B9sXd14SaP-kTb1bOQ_8W2KtrqOG7sJIY,26867
 dapla_metadata/datasets/dataset_parser.py,sha256=3dtRXNy1C8SfG8zTYWdY26nV4l-dG25IC_0J5t2bYwI,8285
-dapla_metadata/datasets/model_validation.py,sha256=pGT-jqaQQY4z7jz-7UQd0BQoTWDxDWPYAnDoRC2vd_c,6818
+dapla_metadata/datasets/model_validation.py,sha256=6qqq1ueTWRWBPTwEGJD49Pv7ksMEaq0iDtuOXelaw-s,7223
 dapla_metadata/datasets/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dapla_metadata/datasets/statistic_subject_mapping.py,sha256=ovT-bZv6eGPD3L0UIs5nIw4AjJrfZn0hyWyD72JBmhs,6395
 dapla_metadata/datasets/compatibility/__init__.py,sha256=hKoLOIhF-BMS8EZQUaAI_S-rf6QXufyI0tr9LB3ly74,400
@@ -21,9 +21,10 @@ dapla_metadata/datasets/compatibility/model_backwards_compatibility.py,sha256=W5
 dapla_metadata/datasets/external_sources/__init__.py,sha256=qvIdXwqyEmXNUCB94ZtZXRzifdW4hiXASFFPtC70f6E,83
 dapla_metadata/datasets/external_sources/external_sources.py,sha256=9eIcOIUbaodNX1w9Tj2wl4U4wUmr5kF1R0i01fKUzGs,2974
 dapla_metadata/datasets/utility/__init__.py,sha256=pp6tUcgUbo8iq9OPtFKQrTbLuI3uY7NHptwWSTpasOU,33
-dapla_metadata/datasets/utility/constants.py,sha256=94nGISL96rHvAndjHyaQEaJXNBnPAiRJN1slUaB03gM,2933
+dapla_metadata/datasets/utility/constants.py,sha256=YKsn6GfNIkwLoBp0yq209o0TbsEhsA_jGaZLVR984JU,2933
 dapla_metadata/datasets/utility/enums.py,sha256=i6dcxWya5k4LjLdGGIM_H37rRndizug3peaAgoE5UdM,652
-dapla_metadata/datasets/utility/utils.py,sha256=85Ms6jEcUuQUm-RRosscDVpvA5W4TOqiOZo2LAnXjFA,18301
+dapla_metadata/datasets/utility/urn.py,sha256=UfiQZf49zbrqRwBzD9yUgfwi6sXRl3KhS9seKhhYuUs,3896
+dapla_metadata/datasets/utility/utils.py,sha256=q76UJI8W4j2aHSq1jz_AfYnJmLfygEflgUrQpqQEPnY,20157
 dapla_metadata/standards/__init__.py,sha256=n8jnMrudLuScSdfQ4UMJorc-Ptg3Y1-ilT8zAaQnM70,179
 dapla_metadata/standards/name_validator.py,sha256=6-DQE_EKVd6UjL--EXpFcZDQtusVbSFaWaUY-CfOV2c,9184
 dapla_metadata/standards/standard_validators.py,sha256=tcCiCI76wUVtMzXA2oCgdauZc0uGgUi11FKu-t7KGwQ,3767
@@ -90,7 +91,7 @@ dapla_metadata/variable_definitions/_utils/constants.py,sha256=zr5FNVCEz6TM9PVEr
 dapla_metadata/variable_definitions/_utils/files.py,sha256=JbPgPNQ7iA38juMqGEdcg5OjZZUwCb6NQtPL0AEspD0,10933
 dapla_metadata/variable_definitions/_utils/template_files.py,sha256=7fcc7yEHOl5JUZ698kqj4IiikXPHBi3SrAVOk4wqQtw,3308
 dapla_metadata/variable_definitions/_utils/variable_definition_files.py,sha256=sGhcSpckR9NtYGNh2oVkiCd5SI3bbJEBhc1PA2uShs0,4701
-dapla_toolbelt_metadata-0.9.3.dist-info/METADATA,sha256=aCJzU5NSK7_yY9lu5R9KnO-cE6P6o_-gc_32CS6qeKU,4723
-dapla_toolbelt_metadata-0.9.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-dapla_toolbelt_metadata-0.9.3.dist-info/licenses/LICENSE,sha256=np3IfD5m0ZUofn_kVzDZqliozuiO6wrktw3LRPjyEiI,1073
-dapla_toolbelt_metadata-0.9.3.dist-info/RECORD,,
+dapla_toolbelt_metadata-0.9.5.dist-info/METADATA,sha256=tS5t2BEedPZXNfVIp1fkBPF6hCEub3w2G8nv8UsDoY4,4723
+dapla_toolbelt_metadata-0.9.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+dapla_toolbelt_metadata-0.9.5.dist-info/licenses/LICENSE,sha256=np3IfD5m0ZUofn_kVzDZqliozuiO6wrktw3LRPjyEiI,1073
+dapla_toolbelt_metadata-0.9.5.dist-info/RECORD,,