dapla-toolbelt-metadata 0.6.6__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dapla-toolbelt-metadata might be problematic.
- dapla_metadata/__init__.py +1 -1
- dapla_metadata/datasets/core.py +109 -43
- dapla_metadata/datasets/dapla_dataset_path_info.py +1 -1
- dapla_metadata/datasets/dataset_parser.py +4 -4
- dapla_metadata/datasets/statistic_subject_mapping.py +5 -1
- dapla_metadata/datasets/utility/constants.py +2 -2
- dapla_metadata/datasets/utility/utils.py +44 -17
- {dapla_toolbelt_metadata-0.6.6.dist-info → dapla_toolbelt_metadata-0.7.1.dist-info}/METADATA +2 -2
- {dapla_toolbelt_metadata-0.6.6.dist-info → dapla_toolbelt_metadata-0.7.1.dist-info}/RECORD +11 -11
- {dapla_toolbelt_metadata-0.6.6.dist-info → dapla_toolbelt_metadata-0.7.1.dist-info}/LICENSE +0 -0
- {dapla_toolbelt_metadata-0.6.6.dist-info → dapla_toolbelt_metadata-0.7.1.dist-info}/WHEEL +0 -0
dapla_metadata/__init__.py
CHANGED
@@ -7,7 +7,7 @@ warnings.filterwarnings(
     message="As the c extension couldn't be imported, `google-crc32c` is using a pure python implementation that is significantly slower.",
 )
 
-import datadoc_model.model as datadoc_model
+import datadoc_model.all_optional.model as datadoc_model
 
 from . import dapla
 from . import datasets
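The theme of this release: ssb-datadoc-model now ships two generated variants of the same Pydantic models, and every import of the flat `datadoc_model.model` path in this package is rewritten. A minimal sketch of the migration for downstream imports (alias names are arbitrary):

# Before: the single, flat model module.
# import datadoc_model.model as datadoc_model

# After: pick a variant explicitly.
import datadoc_model.all_optional.model as datadoc_model  # every field optional
import datadoc_model.required.model as required_model  # obligatory fields enforced

# The all-optional variant accepts an empty instance; the required variant
# is used later in core.py to validate existing documents strictly.
dataset = datadoc_model.Dataset()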
dapla_metadata/datasets/core.py
CHANGED
@@ -9,9 +9,11 @@ import warnings
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 from typing import TYPE_CHECKING
+from typing import cast
 
-
-
+import datadoc_model.all_optional.model as all_optional_model
+import datadoc_model.required.model as required_model
+from datadoc_model.all_optional.model import DataSetStatus
 
 from dapla_metadata._shared import config
 from dapla_metadata.dapla import user_info
@@ -30,6 +32,8 @@ from dapla_metadata.datasets.utility.constants import INCONSISTENCIES_MESSAGE
 from dapla_metadata.datasets.utility.constants import METADATA_DOCUMENT_FILE_SUFFIX
 from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_DATASET_FIELDS
 from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_VARIABLES_FIELDS
+from dapla_metadata.datasets.utility.utils import ExistingPseudonymizationMetadataType
+from dapla_metadata.datasets.utility.utils import OptionalDatadocMetadataType
 from dapla_metadata.datasets.utility.utils import calculate_percentage
 from dapla_metadata.datasets.utility.utils import derive_assessment_from_state
 from dapla_metadata.datasets.utility.utils import get_timestamp_now
@@ -84,8 +88,8 @@ class Datadoc:
         dataset_path: str | None = None,
         metadata_document_path: str | None = None,
         statistic_subject_mapping: StatisticSubjectMapping | None = None,
-        *,
         errors_as_warnings: bool = False,
+        validate_required_fields_on_existing_metadata: bool = False,
     ) -> None:
         """Initialize the Datadoc instance.
 
@@ -101,17 +105,23 @@ class Datadoc:
                 Defaults to None
             errors_as_warnings: Disable raising exceptions if inconsistencies
                 are found between existing and extracted metadata.
+            validate_required_fields_on_existing_metadata: Use a Pydantic model
+                which validates whether required fields are present when reading
+                in an existing metadata file.
         """
         self._statistic_subject_mapping = statistic_subject_mapping
         self.errors_as_warnings = errors_as_warnings
+        self.validate_required_fields_on_existing_metadata = (
+            validate_required_fields_on_existing_metadata
+        )
         self.metadata_document: pathlib.Path | CloudPath | None = None
-        self.container:
+        self.container: all_optional_model.MetadataContainer | None = None
         self.dataset_path: pathlib.Path | CloudPath | None = None
-        self.dataset =
+        self.dataset = all_optional_model.Dataset()
         self.variables: list = []
-        self.pseudo_variables: list[
-        self.variables_lookup: dict[str,
-        self.pseudo_variables_lookup: dict[str,
+        self.pseudo_variables: list[all_optional_model.PseudoVariable] = []
+        self.variables_lookup: dict[str, all_optional_model.Variable] = {}
+        self.pseudo_variables_lookup: dict[str, all_optional_model.PseudoVariable] = {}
         self.explicitly_defined_metadata_document = False
         self.dataset_consistency_status: list = []
         if metadata_document_path:
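A usage sketch of the new constructor flag; the path is hypothetical, and this assumes Datadoc is importable from dapla_metadata.datasets as in earlier releases:

from dapla_metadata.datasets import Datadoc

meta = Datadoc(
    dataset_path="gs://ssb-bucket/produkt/person_data_p2021_v1.parquet",  # hypothetical path
    # Opt in to the strict (required) model when reading an existing
    # metadata document; per the Raises sections below this surfaces
    # missing obligatory fields as pydantic.ValidationError.
    validate_required_fields_on_existing_metadata=True,
)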
@@ -149,9 +159,9 @@ class Datadoc:
         - The 'contains_personal_data' attribute is set to False if not specified.
         - A lookup dictionary for variables is created based on their short names.
         """
-        extracted_metadata:
-        existing_metadata:
-        existing_pseudonymization:
+        extracted_metadata: all_optional_model.DatadocMetadata | None = None
+        existing_metadata: OptionalDatadocMetadataType = None
+        existing_pseudonymization: ExistingPseudonymizationMetadataType = None
 
         if self.metadata_document and self.metadata_document.exists():
             existing_metadata = self._extract_metadata_from_existing_document(
@@ -166,7 +176,7 @@ class Datadoc:
 
         if (
             self.dataset_path is not None
-            and self.dataset ==
+            and self.dataset == all_optional_model.Dataset()
             and len(self.variables) == 0
         ):
             extracted_metadata = self._extract_metadata_from_dataset(self.dataset_path)
@@ -215,14 +225,14 @@ class Datadoc:
         self._set_pseudonymization_metadata(existing_pseudonymization)
 
         set_default_values_variables(self.variables)
-        set_default_values_dataset(self.dataset)
+        set_default_values_dataset(cast("all_optional_model.Dataset", self.dataset))
         set_dataset_owner(self.dataset)
         self._create_variables_lookup()
         self._create_pseudo_variables_lookup()
 
     def _get_existing_file_path(
         self,
-        extracted_metadata:
+        extracted_metadata: all_optional_model.DatadocMetadata | None,
     ) -> str:
         if (
             extracted_metadata is not None
@@ -235,19 +245,19 @@ class Datadoc:
 
     def _set_metadata(
         self,
-        merged_metadata:
+        merged_metadata: OptionalDatadocMetadataType,
     ) -> None:
         if not merged_metadata or not (
             merged_metadata.dataset and merged_metadata.variables
         ):
             msg = "Could not read metadata"
             raise ValueError(msg)
-        self.dataset = merged_metadata.dataset
+        self.dataset = cast("all_optional_model.Dataset", merged_metadata.dataset)
         self.variables = merged_metadata.variables
 
     def _set_pseudonymization_metadata(
         self,
-        existing_pseudonymization:
+        existing_pseudonymization: ExistingPseudonymizationMetadataType,
     ) -> None:
         if not existing_pseudonymization or not (
             existing_pseudonymization.pseudo_variables is not None
@@ -255,7 +265,10 @@ class Datadoc:
             msg = "Error reading pseudonymization metadata"
             logger.error(msg)
             return
-        self.pseudo_variables =
+        self.pseudo_variables = cast(
+            "list[all_optional_model.PseudoVariable]",
+            existing_pseudonymization.pseudo_variables,
+        )
 
     def _create_variables_lookup(self) -> None:
         self.variables_lookup = {
@@ -272,8 +285,8 @@ class Datadoc:
     def _check_dataset_consistency(
         new_dataset_path: Path | CloudPath,
         existing_dataset_path: Path,
-        extracted_metadata:
-        existing_metadata:
+        extracted_metadata: all_optional_model.DatadocMetadata,
+        existing_metadata: OptionalDatadocMetadataType,
     ) -> list[dict[str, object]]:
         """Run consistency tests.
 
@@ -320,14 +333,16 @@ class Datadoc:
             {
                 "name": "Variable names",
                 "success": (
-                    {v.short_name for v in extracted_metadata.variables or []}
+                    existing_metadata is not None
+                    and {v.short_name for v in extracted_metadata.variables or []}
                     == {v.short_name for v in existing_metadata.variables or []}
                 ),
             },
             {
                 "name": "Variable datatypes",
                 "success": (
-                    [v.data_type for v in extracted_metadata.variables or []]
+                    existing_metadata is not None
+                    and [v.data_type for v in extracted_metadata.variables or []]
                     == [v.data_type for v in existing_metadata.variables or []]
                 ),
             },
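Note the asymmetry in these two checks: variable names are compared as sets (order does not matter), while datatypes are compared as lists (position matters, column by column). A tiny illustration with made-up values:

extracted_names = {"fnr", "sivilstand"}
existing_names = {"sivilstand", "fnr"}
print(extracted_names == existing_names)  # True: ordering is irrelevant for names

extracted_types = ["STRING", "INTEGER"]
existing_types = ["INTEGER", "STRING"]
print(extracted_types == existing_types)  # False: datatypes must line up per column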
@@ -361,27 +376,29 @@ class Datadoc:
 
     @staticmethod
     def _merge_metadata(
-        extracted_metadata:
-        existing_metadata:
-    ) ->
+        extracted_metadata: all_optional_model.DatadocMetadata | None,
+        existing_metadata: OptionalDatadocMetadataType,
+    ) -> all_optional_model.DatadocMetadata:
         if not existing_metadata:
             logger.warning(
                 "No existing metadata found, no merge to perform. Continuing with extracted metadata.",
             )
-            return extracted_metadata or
+            return extracted_metadata or all_optional_model.DatadocMetadata()
 
         if not extracted_metadata:
-            return existing_metadata
+            return cast("all_optional_model.DatadocMetadata", existing_metadata)
 
         # Use the extracted metadata as a base
-        merged_metadata =
+        merged_metadata = all_optional_model.DatadocMetadata(
             dataset=copy.deepcopy(extracted_metadata.dataset),
             variables=[],
         )
 
         override_dataset_fields(
             merged_metadata=merged_metadata,
-            existing_metadata=
+            existing_metadata=cast(
+                "all_optional_model.DatadocMetadata", existing_metadata
+            ),
         )
 
         # Merge variables.
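The merge strategy, in short: start from a deep copy of the freshly extracted metadata (machine-derived fields such as paths and schema win), let override_dataset_fields copy selected user-maintained dataset fields back in from the existing document, then reconcile variables one by one. A schematic with plain dicts, not the package's models; the real field list lives in override_dataset_fields:

import copy

extracted = {"short_name": "person_data", "file_path": "/new/path.parquet", "name": None}
existing = {"short_name": "person_data", "file_path": "/old/path.parquet", "name": "Person data"}

merged = copy.deepcopy(extracted)  # extracted values form the base
for field in ("name",):  # hypothetical "user-maintained" fields
    if existing.get(field) is not None:
        merged[field] = existing[field]  # the existing document overrides these

print(merged)  # {'short_name': 'person_data', 'file_path': '/new/path.parquet', 'name': 'Person data'}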
@@ -395,7 +412,7 @@ class Datadoc:
     def _extract_metadata_from_existing_document(
         self,
         document: pathlib.Path | CloudPath,
-    ) ->
+    ) -> OptionalDatadocMetadataType:
         """Read metadata from an existing metadata document.
 
         If an existing metadata document is available, this method reads and
@@ -410,7 +427,13 @@ class Datadoc:
 
         Raises:
             json.JSONDecodeError: If the metadata document cannot be parsed.
+            pydantic.ValidationError: If the data does not successfully validate.
         """
+        metadata_model = (
+            required_model
+            if self.validate_required_fields_on_existing_metadata
+            else all_optional_model
+        )
         fresh_metadata = {}
         try:
             with document.open(mode="r", encoding="utf-8") as file:
@@ -420,7 +443,7 @@ class Datadoc:
                 fresh_metadata,
             )
             if is_metadata_in_container_structure(fresh_metadata):
-                self.container =
+                self.container = metadata_model.MetadataContainer.model_validate_json(
                     json.dumps(fresh_metadata),
                 )
                 datadoc_metadata = fresh_metadata["datadoc"]
@@ -428,7 +451,7 @@ class Datadoc:
                 datadoc_metadata = fresh_metadata
             if datadoc_metadata is None:
                 return None
-            return
+            return metadata_model.DatadocMetadata.model_validate_json(
                 json.dumps(datadoc_metadata),
             )
         except json.JSONDecodeError:
@@ -443,7 +466,11 @@ class Datadoc:
     def _extract_pseudonymization_from_existing_document(
         self,
         document: pathlib.Path | CloudPath,
-    ) ->
+    ) -> (
+        all_optional_model.PseudonymizationMetadata
+        | required_model.PseudonymizationMetadata
+        | None
+    ):
         """Read pseudo metadata from an existing metadata document.
 
         If there is pseudo metadata in the document supplied, the method validates and returns the pseudonymization structure.
@@ -453,7 +480,14 @@ class Datadoc:
 
         Raises:
             json.JSONDecodeError: If the metadata document cannot be parsed.
+            pydantic.ValidationError: If the data does not successfully validate.
         """
+        metadata_model = (
+            required_model
+            if self.validate_required_fields_on_existing_metadata
+            else all_optional_model
+        )
+
         try:
             with document.open(mode="r", encoding="utf-8") as file:
                 fresh_metadata = json.load(file)
@@ -472,7 +506,7 @@ class Datadoc:
             if pseudonymization_metadata is None:
                 return None
 
-            return
+            return metadata_model.PseudonymizationMetadata.model_validate_json(
                 json.dumps(pseudonymization_metadata),
            )
 
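Both extraction methods use the same dispatch: treat the two generated model packages as interchangeable namespaces and pick one at read time. A minimal sketch of the pattern, assuming both packages expose identical class names (which the diff relies on); model_validate_json is standard pydantic v2 and raises pydantic.ValidationError on failure:

import json

import datadoc_model.all_optional.model as all_optional_model
import datadoc_model.required.model as required_model

def parse_datadoc(raw: dict, strict: bool):
    # Same class name in both packages; strictness comes from which module we pick.
    metadata_model = required_model if strict else all_optional_model
    return metadata_model.DatadocMetadata.model_validate_json(json.dumps(raw))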
@@ -508,7 +542,7 @@ class Datadoc:
     def _extract_metadata_from_dataset(
         self,
         dataset: pathlib.Path | CloudPath,
-    ) ->
+    ) -> all_optional_model.DatadocMetadata:
         """Obtain what metadata we can from the dataset itself.
 
         This makes it easier for the user by 'pre-filling' certain fields.
@@ -528,9 +562,9 @@ class Datadoc:
         - variables: A list of fields extracted from the dataset schema.
         """
         dapla_dataset_path_info = DaplaDatasetPathInfo(dataset)
-        metadata =
+        metadata = all_optional_model.DatadocMetadata()
 
-        metadata.dataset =
+        metadata.dataset = all_optional_model.Dataset(
             short_name=dapla_dataset_path_info.dataset_short_name,
             dataset_state=dapla_dataset_path_info.dataset_state,
             dataset_status=DataSetStatus.DRAFT,
@@ -594,12 +628,14 @@ class Datadoc:
         if self.container:
             self.container.datadoc = datadoc
             if not self.container.pseudonymization:
-                self.container.pseudonymization =
-
+                self.container.pseudonymization = (
+                    all_optional_model.PseudonymizationMetadata(
+                        pseudo_dataset=all_optional_model.PseudoDataset()
+                    )
                 )
             self.container.pseudonymization.pseudo_variables = self.pseudo_variables
         else:
-            self.container =
+            self.container = all_optional_model.MetadataContainer(datadoc=datadoc)
         if self.metadata_document:
             content = self.container.model_dump_json(indent=4)
             self.metadata_document.write_text(content)
@@ -629,14 +665,44 @@ class Datadoc:
         return calculate_percentage(num_set_fields, num_all_fields)
 
     def add_pseudo_variable(self, variable_short_name: str) -> None:
-        """Adds a new pseudo variable to the list of pseudonymized variables.
+        """Adds a new pseudo variable to the list of pseudonymized variables.
+
+        Sets is_personal_data to pseudonymized encrypted personal data.
+        """
         if self.variables_lookup[variable_short_name] is not None:
-            pseudo_variable =
+            pseudo_variable = all_optional_model.PseudoVariable(
+                short_name=variable_short_name
+            )
             self.pseudo_variables.append(pseudo_variable)
             self.pseudo_variables_lookup[variable_short_name] = pseudo_variable
+            self.variables_lookup[
+                variable_short_name
+            ].is_personal_data = (
+                all_optional_model.IsPersonalData.PSEUDONYMISED_ENCRYPTED_PERSONAL_DATA
+            )
+
+    def remove_pseudo_variable(self, variable_short_name: str) -> None:
+        """Removes a pseudo variable by using the shortname.
+
+        Updates the pseudo variable lookup by creating a new one.
+        Sets is_personal_data to non pseudonymized encrypted personal data.
+        """
+        if self.pseudo_variables_lookup[variable_short_name] is not None:
+            pseudo_variable = self.get_pseudo_variable(variable_short_name)
+
+            if pseudo_variable is not None:
+                self.pseudo_variables = [
+                    pseudo_variable
+                    for pseudo_variable in self.pseudo_variables
+                    if pseudo_variable.short_name != variable_short_name
+                ]
+                self._create_pseudo_variables_lookup()
+                self.variables_lookup[
+                    variable_short_name
+                ].is_personal_data = all_optional_model.IsPersonalData.NON_PSEUDONYMISED_ENCRYPTED_PERSONAL_DATA
 
     def get_pseudo_variable(
         self, variable_short_name: str
-    ) ->
+    ) -> all_optional_model.PseudoVariable | None:
         """Finds a pseudo variable by shortname."""
         return self.pseudo_variables_lookup.get(variable_short_name)
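These two methods keep three structures in sync: the variable lookup (where is_personal_data is flipped), the pseudo-variable list, and the pseudo-variable lookup. A hypothetical session, assuming `meta` is a Datadoc instance whose dataset has a variable "fnr":

meta.add_pseudo_variable("fnr")
print(meta.get_pseudo_variable("fnr").short_name)  # "fnr"
print(meta.variables_lookup["fnr"].is_personal_data)  # PSEUDONYMISED_ENCRYPTED_PERSONAL_DATA

meta.remove_pseudo_variable("fnr")
print(meta.get_pseudo_variable("fnr"))  # None: the lookup was rebuilt
print(meta.variables_lookup["fnr"].is_personal_data)  # NON_PSEUDONYMISED_ENCRYPTED_PERSONAL_DATA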
dapla_metadata/datasets/dataset_parser.py
CHANGED

@@ -12,10 +12,10 @@ from abc import abstractmethod
 from typing import TYPE_CHECKING
 
 import pandas as pd
-from datadoc_model.model import DataType
-from datadoc_model.model import LanguageStringType
-from datadoc_model.model import LanguageStringTypeItem
-from datadoc_model.model import Variable
+from datadoc_model.all_optional.model import DataType
+from datadoc_model.all_optional.model import LanguageStringType
+from datadoc_model.all_optional.model import LanguageStringTypeItem
+from datadoc_model.all_optional.model import Variable
 from pyarrow import parquet as pq
 
 from dapla_metadata.datasets.utility.enums import SupportedLanguages
dapla_metadata/datasets/statistic_subject_mapping.py
CHANGED

@@ -140,7 +140,11 @@ class StatisticSubjectMapping(GetExternalSource):
             SecondarySubject(
                 self._extract_titles(s.titler),
                 s["emnekode"],
-                [
+                [
+                    statistikk["kortnavn"]
+                    for statistikk in s.find_all("Statistikk")
+                    if statistikk["isPrimaerPlassering"] == "true"
+                ],
             )
             for s in p.find_all("delemne")
         ]
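The new comprehension keeps only statistics whose XML element is flagged as the primary placement for that sub-subject. A self-contained sketch of the same filter over a toy snippet (BeautifulSoup's html.parser lowercases tag and attribute names, so the sketch uses lowercase; the real code parses SSB's subject-structure XML, and the shortnames here are made up):

from bs4 import BeautifulSoup

xml = """
<delemne>
  <statistikk kortnavn="stat_a" isprimaerplassering="true"/>
  <statistikk kortnavn="stat_b" isprimaerplassering="false"/>
</delemne>
"""
soup = BeautifulSoup(xml, "html.parser")

primary = [
    statistikk["kortnavn"]
    for statistikk in soup.find_all("statistikk")
    if statistikk["isprimaerplassering"] == "true"
]
print(primary)  # ['stat_a']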
dapla_metadata/datasets/utility/constants.py
CHANGED

@@ -1,7 +1,7 @@
 """Repository for constant values in Datadoc backend."""
 
-from datadoc_model.model import LanguageStringType
-from datadoc_model.model import LanguageStringTypeItem
+from datadoc_model.all_optional.model import LanguageStringType
+from datadoc_model.all_optional.model import LanguageStringTypeItem
 
 VALIDATION_ERROR = "Validation error: "
 
dapla_metadata/datasets/utility/utils.py
CHANGED

@@ -4,15 +4,19 @@ import datetime  # import is needed in xdoctest
 import logging
 import pathlib
 import uuid
+from typing import cast
 
+import datadoc_model
+import datadoc_model.all_optional.model as all_optional_model
+import datadoc_model.required.model as required_model
 import google.auth
 from cloudpathlib import CloudPath
 from cloudpathlib import GSClient
 from cloudpathlib import GSPath
 from datadoc_model import model
-from datadoc_model.model import Assessment
-from datadoc_model.model import DataSetState
-from datadoc_model.model import VariableRole
+from datadoc_model.all_optional.model import Assessment
+from datadoc_model.all_optional.model import DataSetState
+from datadoc_model.all_optional.model import VariableRole
 
 from dapla_metadata.dapla import user_info
 from dapla_metadata.datasets.utility.constants import (
@@ -34,6 +38,17 @@ from dapla_metadata.datasets.utility.constants import (
 
 logger = logging.getLogger(__name__)
 
+DatadocMetadataType = (
+    all_optional_model.DatadocMetadata | required_model.DatadocMetadata
+)
+DatasetType = all_optional_model.Dataset | required_model.Dataset
+OptionalDatadocMetadataType = DatadocMetadataType | None
+ExistingPseudonymizationMetadataType = (
+    all_optional_model.PseudonymizationMetadata
+    | required_model.PseudonymizationMetadata
+    | None
+)
+
 
 def get_timestamp_now() -> datetime.datetime:
     """Return a timestamp for the current moment."""
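These module-level aliases are plain `X | Y` unions (Python 3.10+ syntax), so the rest of the package can annotate "either model variant" without caring which package a value came from. A small sketch of how such an alias is used, with a hypothetical helper name not present in the package:

import datadoc_model.all_optional.model as all_optional_model
import datadoc_model.required.model as required_model

DatasetType = all_optional_model.Dataset | required_model.Dataset

def describe(dataset: DatasetType) -> str:  # hypothetical helper
    # Both variants share field names, so attribute access is uniform.
    return f"short_name={dataset.short_name!r}"

print(describe(all_optional_model.Dataset(short_name="person_data")))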
@@ -119,7 +134,9 @@ def set_default_values_variables(variables: list) -> None:
             v.variable_role = VariableRole.MEASURE
 
 
-def set_default_values_dataset(dataset: model.Dataset) -> None:
+def set_default_values_dataset(
+    dataset: DatasetType,
+) -> None:
     """Set default values on dataset.
 
     Args:
@@ -140,7 +157,9 @@ def set_default_values_dataset(dataset: model.Dataset) -> None:
         dataset.contains_personal_data = False
 
 
-def set_dataset_owner(dataset: model.Dataset) -> None:
+def set_dataset_owner(
+    dataset: DatasetType,
+) -> None:
     """Sets the owner of the dataset from the DAPLA_GROUP_CONTEXT enviornment variable.
 
     Args:
@@ -153,7 +172,7 @@ def set_dataset_owner(dataset: model.Dataset) -> None:
 
 
 def set_variables_inherit_from_dataset(
-    dataset:
+    dataset: DatasetType,
     variables: list,
 ) -> None:
     """Set specific dataset values on a list of variable objects.
@@ -283,7 +302,9 @@ def _is_missing_metadata(
     )
 
 
-def num_obligatory_dataset_fields_completed(
+def num_obligatory_dataset_fields_completed(
+    dataset: DatasetType,
+) -> int:
     """Count the number of completed obligatory dataset fields.
 
     This function returns the total count of obligatory fields in the dataset that
@@ -345,7 +366,9 @@ def num_obligatory_variable_fields_completed(variable: model.Variable) -> int:
     return NUM_OBLIGATORY_VARIABLES_FIELDS - len(missing_metadata)
 
 
-def get_missing_obligatory_dataset_fields(
+def get_missing_obligatory_dataset_fields(
+    dataset: DatasetType,
+) -> list:
     """Identify all obligatory dataset fields that are missing values.
 
     This function checks for obligatory fields that are either directly missing
@@ -422,8 +445,9 @@ def running_in_notebook() -> bool:
 
 
 def override_dataset_fields(
-    merged_metadata:
-    existing_metadata:
+    merged_metadata: all_optional_model.DatadocMetadata,
+    existing_metadata: all_optional_model.DatadocMetadata
+    | required_model.DatadocMetadata,
 ) -> None:
     """Overrides specific fields in the dataset of `merged_metadata` with values from the dataset of `existing_metadata`.
 
@@ -449,10 +473,10 @@ def override_dataset_fields(
 
 
 def merge_variables(
-    existing_metadata:
-    extracted_metadata:
-    merged_metadata:
-) ->
+    existing_metadata: OptionalDatadocMetadataType,
+    extracted_metadata: all_optional_model.DatadocMetadata,
+    merged_metadata: all_optional_model.DatadocMetadata,
+) -> all_optional_model.DatadocMetadata:
     """Merges variables from the extracted metadata into the existing metadata and updates the merged metadata.
 
     This function compares the variables from `extracted_metadata` with those in `existing_metadata`.
@@ -466,11 +490,12 @@ def merge_variables(
         merged_metadata: The metadata object that will contain the result of the merge.
 
     Returns:
-
+        all_optional_model.DatadocMetadata: The `merged_metadata` object containing variables from both `existing_metadata`
         and `extracted_metadata`.
     """
     if (
-        existing_metadata
+        existing_metadata is not None
+        and existing_metadata.variables is not None
        and extracted_metadata is not None
        and extracted_metadata.variables is not None
        and merged_metadata.variables is not None
@@ -494,7 +519,9 @@ def merge_variables(
             existing.contains_data_until = (
                 extracted.contains_data_until or existing.contains_data_until
             )
-            merged_metadata.variables.append(
+            merged_metadata.variables.append(
+                cast("datadoc_model.all_optional.model.Variable", existing)
+            )
         else:
             # If there is no existing metadata for this variable, we just use what we have extracted
             merged_metadata.variables.append(extracted)
{dapla_toolbelt_metadata-0.6.6.dist-info → dapla_toolbelt_metadata-0.7.1.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: dapla-toolbelt-metadata
-Version: 0.6.6
+Version: 0.7.1
 Summary: Dapla Toolbelt Metadata
 License: MIT
 Author: Team Metadata
@@ -24,7 +24,7 @@ Requires-Dist: pyjwt (>=2.8.0)
 Requires-Dist: python-dotenv (>=1.0.1)
 Requires-Dist: requests (>=2.31.0)
 Requires-Dist: ruamel-yaml (>=0.18.10)
-Requires-Dist: ssb-datadoc-model (==6.
+Requires-Dist: ssb-datadoc-model (==6.1.0)
 Requires-Dist: ssb-klass-python (>=1.0.1)
 Requires-Dist: typing-extensions (>=4.12.2)
 Project-URL: Changelog, https://github.com/statisticsnorway/dapla-toolbelt-metadata/releases
{dapla_toolbelt_metadata-0.6.6.dist-info → dapla_toolbelt_metadata-0.7.1.dist-info}/RECORD
RENAMED

@@ -1,4 +1,4 @@
-dapla_metadata/__init__.py,sha256=
+dapla_metadata/__init__.py,sha256=37yh9XWYQoLIVIS_fDdwNN8OXzbYY-6kMYwvjQrLMJQ,428
 dapla_metadata/_shared/__init__.py,sha256=qUFgnVhBVlPRQP0ePmY76c8FvWRrJ-9c5GvzibwERnQ,103
 dapla_metadata/_shared/config.py,sha256=QqXcmP66AfXF8wi6FMsa7et7kH2k4EJPOF4IELKuQig,3213
 dapla_metadata/_shared/enums.py,sha256=WHkH1d8xw41gOly6au_izZB1_-6XTcKu5rhBWUImjp8,509
@@ -7,19 +7,19 @@ dapla_metadata/dapla/__init__.py,sha256=tkapF-YwmruPPrKvN3pEoCZqb7xvJx_ogBM8XyGM
 dapla_metadata/dapla/user_info.py,sha256=bENez-ICt9ySR8orYebO68Q3_2LkIW9QTL58DTctmEQ,4833
 dapla_metadata/datasets/__init__.py,sha256=TvzskpdFC6hGcC9_55URT5jr5wNAPzXuISd2UjJWM_8,280
 dapla_metadata/datasets/code_list.py,sha256=kp1O6sUiUAP9WKlWY8IgHWx_1IOzJA63WveHqolgKmg,9082
-dapla_metadata/datasets/core.py,sha256=
-dapla_metadata/datasets/dapla_dataset_path_info.py,sha256=
-dapla_metadata/datasets/dataset_parser.py,sha256=
+dapla_metadata/datasets/core.py,sha256=4fB9q_pUDmFfBN9RDjhUg1ney66juL1B_bDgtwijTwM,29329
+dapla_metadata/datasets/dapla_dataset_path_info.py,sha256=WPeV_mwKk2B9sXd14SaP-kTb1bOQ_8W2KtrqOG7sJIY,26867
+dapla_metadata/datasets/dataset_parser.py,sha256=3dtRXNy1C8SfG8zTYWdY26nV4l-dG25IC_0J5t2bYwI,8285
 dapla_metadata/datasets/external_sources/__init__.py,sha256=qvIdXwqyEmXNUCB94ZtZXRzifdW4hiXASFFPtC70f6E,83
 dapla_metadata/datasets/external_sources/external_sources.py,sha256=9eIcOIUbaodNX1w9Tj2wl4U4wUmr5kF1R0i01fKUzGs,2974
 dapla_metadata/datasets/model_backwards_compatibility.py,sha256=RKhi6cjqmPKW8lTYQ0mIXTAwhMGo_X-QMad4Y5tvq_0,19136
 dapla_metadata/datasets/model_validation.py,sha256=pGT-jqaQQY4z7jz-7UQd0BQoTWDxDWPYAnDoRC2vd_c,6818
 dapla_metadata/datasets/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-dapla_metadata/datasets/statistic_subject_mapping.py,sha256=
+dapla_metadata/datasets/statistic_subject_mapping.py,sha256=ovT-bZv6eGPD3L0UIs5nIw4AjJrfZn0hyWyD72JBmhs,6395
 dapla_metadata/datasets/utility/__init__.py,sha256=pp6tUcgUbo8iq9OPtFKQrTbLuI3uY7NHptwWSTpasOU,33
-dapla_metadata/datasets/utility/constants.py,sha256=
+dapla_metadata/datasets/utility/constants.py,sha256=YEs2ECLNJMM1SSORPTDnzNep_Qut5YbJ5JJx_oP3ios,2470
 dapla_metadata/datasets/utility/enums.py,sha256=SpV4xlmP1YMaJPbmX03hqRLHUOhXIk5gquTeJ8G_5OE,432
-dapla_metadata/datasets/utility/utils.py,sha256=
+dapla_metadata/datasets/utility/utils.py,sha256=JpJuvYEXmNUXTgaxPhUg24aiiZS201wRNnAOWyH_DO0,19210
 dapla_metadata/standards/__init__.py,sha256=n8jnMrudLuScSdfQ4UMJorc-Ptg3Y1-ilT8zAaQnM70,179
 dapla_metadata/standards/name_validator.py,sha256=6-DQE_EKVd6UjL--EXpFcZDQtusVbSFaWaUY-CfOV2c,9184
 dapla_metadata/standards/standard_validators.py,sha256=tcCiCI76wUVtMzXA2oCgdauZc0uGgUi11FKu-t7KGwQ,3767
@@ -81,7 +81,7 @@ dapla_metadata/variable_definitions/_utils/variable_definition_files.py,sha256=P
 dapla_metadata/variable_definitions/exceptions.py,sha256=z6Gtd84FboDu7vWjC3wathIF7I0gF0imtRhwMkr16lY,7851
 dapla_metadata/variable_definitions/vardef.py,sha256=KYd31nCGhxuzC0hpKR6foQjO39Tlb3vu9IDqUoMvTeY,11352
 dapla_metadata/variable_definitions/variable_definition.py,sha256=sj49uot0e4UJW4QJ3dEJGgjY4yfCHOkxS2NdD2t60b8,14883
-dapla_toolbelt_metadata-0.6.6.dist-info/LICENSE,sha256=np3IfD5m0ZUofn_kVzDZqliozuiO6wrktw3LRPjyEiI,1073
-dapla_toolbelt_metadata-0.6.6.dist-info/METADATA,sha256=
-dapla_toolbelt_metadata-0.6.6.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
-dapla_toolbelt_metadata-0.6.6.dist-info/RECORD,,
+dapla_toolbelt_metadata-0.7.1.dist-info/LICENSE,sha256=np3IfD5m0ZUofn_kVzDZqliozuiO6wrktw3LRPjyEiI,1073
+dapla_toolbelt_metadata-0.7.1.dist-info/METADATA,sha256=-fHBhP0AR_dZpEO98V4CFS97b3maRScHuO8NxzmWc8M,4905
+dapla_toolbelt_metadata-0.7.1.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+dapla_toolbelt_metadata-0.7.1.dist-info/RECORD,,
{dapla_toolbelt_metadata-0.6.6.dist-info → dapla_toolbelt_metadata-0.7.1.dist-info}/LICENSE
File without changes

{dapla_toolbelt_metadata-0.6.6.dist-info → dapla_toolbelt_metadata-0.7.1.dist-info}/WHEEL
File without changes