dapla-toolbelt-metadata 0.2.1__py3-none-any.whl → 0.9.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dapla_metadata/__init__.py +11 -1
- dapla_metadata/_shared/__init__.py +1 -0
- dapla_metadata/_shared/config.py +109 -0
- dapla_metadata/_shared/enums.py +27 -0
- dapla_metadata/_shared/py.typed +0 -0
- dapla_metadata/dapla/__init__.py +4 -0
- dapla_metadata/dapla/user_info.py +138 -0
- dapla_metadata/datasets/__init__.py +1 -1
- dapla_metadata/datasets/_merge.py +333 -0
- dapla_metadata/datasets/code_list.py +5 -6
- dapla_metadata/datasets/compatibility/__init__.py +10 -0
- dapla_metadata/datasets/compatibility/_handlers.py +363 -0
- dapla_metadata/datasets/compatibility/_utils.py +259 -0
- dapla_metadata/datasets/compatibility/model_backwards_compatibility.py +135 -0
- dapla_metadata/datasets/core.py +136 -182
- dapla_metadata/datasets/dapla_dataset_path_info.py +145 -19
- dapla_metadata/datasets/dataset_parser.py +41 -28
- dapla_metadata/datasets/model_validation.py +29 -20
- dapla_metadata/datasets/statistic_subject_mapping.py +5 -1
- dapla_metadata/datasets/utility/constants.py +22 -15
- dapla_metadata/datasets/utility/enums.py +8 -20
- dapla_metadata/datasets/utility/urn.py +234 -0
- dapla_metadata/datasets/utility/utils.py +183 -111
- dapla_metadata/standards/__init__.py +4 -0
- dapla_metadata/standards/name_validator.py +250 -0
- dapla_metadata/standards/standard_validators.py +98 -0
- dapla_metadata/standards/utils/__init__.py +1 -0
- dapla_metadata/standards/utils/constants.py +49 -0
- dapla_metadata/variable_definitions/__init__.py +11 -0
- dapla_metadata/variable_definitions/_generated/.openapi-generator/FILES +20 -0
- dapla_metadata/variable_definitions/_generated/.openapi-generator/VERSION +1 -0
- dapla_metadata/variable_definitions/_generated/.openapi-generator-ignore +6 -0
- dapla_metadata/variable_definitions/_generated/README.md +148 -0
- dapla_metadata/variable_definitions/_generated/__init__.py +0 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/__init__.py +47 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api/__init__.py +8 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api/data_migration_api.py +766 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api/draft_variable_definitions_api.py +888 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api/patches_api.py +888 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api/validity_periods_api.py +583 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api/variable_definitions_api.py +613 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api_client.py +779 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api_response.py +27 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/configuration.py +474 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/CompleteResponse.md +51 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/Contact.md +30 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/DataMigrationApi.md +90 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/Draft.md +42 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/DraftVariableDefinitionsApi.md +259 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/LanguageStringType.md +31 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/Owner.md +31 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/Patch.md +43 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/PatchesApi.md +249 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/PublicApi.md +218 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/SupportedLanguages.md +15 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/UpdateDraft.md +44 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/ValidityPeriod.md +42 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/ValidityPeriodsApi.md +236 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/VariableDefinitionsApi.md +304 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/VariableStatus.md +17 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/exceptions.py +193 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/__init__.py +31 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/complete_response.py +260 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/contact.py +94 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/draft.py +228 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/get_vardok_vardef_mapping_by_id200_response.py +158 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/language_string_type.py +101 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/owner.py +87 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/patch.py +244 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/problem.py +118 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/update_draft.py +274 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/validity_period.py +225 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/vardok_id_response.py +81 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/vardok_vardef_id_pair_response.py +84 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/variable_status.py +33 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/py.typed +0 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/rest.py +249 -0
- dapla_metadata/variable_definitions/_utils/__init__.py +1 -0
- dapla_metadata/variable_definitions/_utils/_client.py +32 -0
- dapla_metadata/variable_definitions/_utils/config.py +54 -0
- dapla_metadata/variable_definitions/_utils/constants.py +80 -0
- dapla_metadata/variable_definitions/_utils/files.py +309 -0
- dapla_metadata/variable_definitions/_utils/template_files.py +99 -0
- dapla_metadata/variable_definitions/_utils/variable_definition_files.py +143 -0
- dapla_metadata/variable_definitions/exceptions.py +255 -0
- dapla_metadata/variable_definitions/vardef.py +372 -0
- dapla_metadata/variable_definitions/vardok_id.py +48 -0
- dapla_metadata/variable_definitions/vardok_vardef_id_pair.py +47 -0
- dapla_metadata/variable_definitions/variable_definition.py +422 -0
- {dapla_toolbelt_metadata-0.2.1.dist-info → dapla_toolbelt_metadata-0.9.11.dist-info}/METADATA +34 -36
- dapla_toolbelt_metadata-0.9.11.dist-info/RECORD +97 -0
- {dapla_toolbelt_metadata-0.2.1.dist-info → dapla_toolbelt_metadata-0.9.11.dist-info}/WHEEL +1 -1
- dapla_metadata/datasets/config.py +0 -80
- dapla_metadata/datasets/model_backwards_compatibility.py +0 -520
- dapla_metadata/datasets/user_info.py +0 -88
- dapla_toolbelt_metadata-0.2.1.dist-info/RECORD +0 -22
- {dapla_toolbelt_metadata-0.2.1.dist-info → dapla_toolbelt_metadata-0.9.11.dist-info/licenses}/LICENSE +0 -0
dapla_metadata/datasets/compatibility/model_backwards_compatibility.py
ADDED
@@ -0,0 +1,135 @@
+"""Upgrade old metadata files to be compatible with new versions.
+
+An important principle of Datadoc is that we ALWAYS guarantee backwards
+compatibility of existing metadata documents. This means that we guarantee
+that a user will never lose data, even if their document is decades old.
+
+For each document version we release with breaking changes, we implement a
+handler and register the version by defining a BackwardsCompatibleVersion
+instance. These documents will then be upgraded when they're opened in Datadoc.
+
+A test must also be implemented for each new version.
+"""
+
+from __future__ import annotations
+
+import logging
+from collections import OrderedDict
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+from typing import Any
+
+from dapla_metadata.datasets.compatibility._handlers import handle_current_version
+from dapla_metadata.datasets.compatibility._handlers import handle_version_0_1_1
+from dapla_metadata.datasets.compatibility._handlers import handle_version_1_0_0
+from dapla_metadata.datasets.compatibility._handlers import handle_version_2_1_0
+from dapla_metadata.datasets.compatibility._handlers import handle_version_2_2_0
+from dapla_metadata.datasets.compatibility._handlers import handle_version_3_1_0
+from dapla_metadata.datasets.compatibility._handlers import handle_version_3_2_0
+from dapla_metadata.datasets.compatibility._handlers import handle_version_3_3_0
+from dapla_metadata.datasets.compatibility._handlers import handle_version_4_0_0
+from dapla_metadata.datasets.compatibility._handlers import handle_version_5_0_1
+from dapla_metadata.datasets.compatibility._handlers import handle_version_6_0_0
+from dapla_metadata.datasets.compatibility._utils import DATADOC_KEY
+from dapla_metadata.datasets.compatibility._utils import DOCUMENT_VERSION_KEY
+from dapla_metadata.datasets.compatibility._utils import UnknownModelVersionError
+from dapla_metadata.datasets.compatibility._utils import (
+    is_metadata_in_container_structure,
+)
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+SUPPORTED_VERSIONS: OrderedDict[str, BackwardsCompatibleVersion] = OrderedDict()
+
+
+@dataclass()
+class BackwardsCompatibleVersion:
+    """A version which we support with backwards compatibility.
+
+    This class registers a version and its corresponding handler function
+    for backwards compatibility.
+    """
+
+    version: str
+    handler: Callable[[dict[str, Any]], dict[str, Any]]
+
+    def __post_init__(self) -> None:
+        """Register this version in the supported versions map.
+
+        This method adds the instance to the `SUPPORTED_VERSIONS` dictionary
+        using the version as the key.
+        """
+        SUPPORTED_VERSIONS[self.version] = self
+
+    def upgrade(self, metadata: dict[str, Any]) -> dict[str, Any]:
+        """Upgrade metadata from the format of the previous version to the format of this version.
+
+        This method handles bumping the Document Version field so it's not necessary to do this in
+        the individual handler functions.
+
+        Args:
+            metadata (dict[str, Any]): Metadata in the format of the previous version, to be upgraded.
+
+        Returns:
+            dict[str, Any]: The metadata upgraded to the version specified
+        """
+        metadata = self.handler(metadata)
+        if is_metadata_in_container_structure(metadata):
+            metadata[DATADOC_KEY][DOCUMENT_VERSION_KEY] = self.version
+        else:
+            metadata[DOCUMENT_VERSION_KEY] = self.version
+        return metadata
+
+
+# Register all the supported versions and their handlers.
+BackwardsCompatibleVersion(version="0.1.1", handler=handle_version_0_1_1)
+BackwardsCompatibleVersion(version="1.0.0", handler=handle_version_1_0_0)
+BackwardsCompatibleVersion(version="2.1.0", handler=handle_version_2_1_0)
+BackwardsCompatibleVersion(version="2.2.0", handler=handle_version_2_2_0)
+BackwardsCompatibleVersion(version="3.1.0", handler=handle_version_3_1_0)
+BackwardsCompatibleVersion(version="3.2.0", handler=handle_version_3_2_0)
+BackwardsCompatibleVersion(version="3.3.0", handler=handle_version_3_3_0)
+BackwardsCompatibleVersion(version="4.0.0", handler=handle_version_4_0_0)
+BackwardsCompatibleVersion(version="5.0.1", handler=handle_version_5_0_1)
+BackwardsCompatibleVersion(version="6.0.0", handler=handle_version_6_0_0)
+BackwardsCompatibleVersion(version="6.1.0", handler=handle_current_version)
+
+
+def upgrade_metadata(fresh_metadata: dict[str, Any]) -> dict[str, Any]:
+    """Upgrade the metadata to the latest version using registered handlers.
+
+    This function checks the version of the provided metadata and applies a series
+    of upgrade handlers to migrate the metadata to the latest version.
+    It starts from the provided version and applies all subsequent handlers in
+    sequence. If the metadata is already in the latest version or the version
+    cannot be determined, appropriate actions are taken.
+
+    Args:
+        fresh_metadata: The metadata dictionary to be upgraded. This dictionary
+            must include version information that determines which handlers to apply.
+
+    Returns:
+        The upgraded metadata dictionary, after applying all necessary handlers.
+
+    Raises:
+        UnknownModelVersionError: If the metadata's version is unknown or unsupported.
+    """
+    if is_metadata_in_container_structure(fresh_metadata):
+        if fresh_metadata[DATADOC_KEY] is None:
+            return fresh_metadata
+        supplied_version = fresh_metadata[DATADOC_KEY][DOCUMENT_VERSION_KEY]
+    else:
+        supplied_version = fresh_metadata[DOCUMENT_VERSION_KEY]
+    start_running_handlers = False
+    # Run all the handlers in order from the supplied version onwards
+    for k, v in SUPPORTED_VERSIONS.items():
+        if k == supplied_version:
+            start_running_handlers = True
+        if start_running_handlers:
+            fresh_metadata = v.upgrade(fresh_metadata)
+    if not start_running_handlers:
+        raise UnknownModelVersionError(supplied_version)
+    return fresh_metadata
dapla_metadata/datasets/core.py
CHANGED
@@ -5,35 +5,42 @@ from __future__ import annotations
 import copy
 import json
 import logging
-import warnings
 from concurrent.futures import ThreadPoolExecutor
-from pathlib import Path
 from typing import TYPE_CHECKING
-
-
-
-
-from
-
+from typing import cast
+
+import datadoc_model.all_optional.model as all_optional_model
+import datadoc_model.required.model as required_model
+from datadoc_model.all_optional.model import DataSetStatus
+
+from dapla_metadata._shared import config
+from dapla_metadata.dapla import user_info
+from dapla_metadata.datasets._merge import DatasetConsistencyStatus
+from dapla_metadata.datasets._merge import check_dataset_consistency
+from dapla_metadata.datasets._merge import check_ready_to_merge
+from dapla_metadata.datasets._merge import check_variables_consistency
+from dapla_metadata.datasets._merge import merge_metadata
+from dapla_metadata.datasets.compatibility import is_metadata_in_container_structure
+from dapla_metadata.datasets.compatibility import upgrade_metadata
 from dapla_metadata.datasets.dapla_dataset_path_info import DaplaDatasetPathInfo
 from dapla_metadata.datasets.dataset_parser import DatasetParser
-from dapla_metadata.datasets.model_backwards_compatibility import (
-    is_metadata_in_container_structure,
-)
-from dapla_metadata.datasets.model_backwards_compatibility import upgrade_metadata
 from dapla_metadata.datasets.model_validation import ValidateDatadocMetadata
 from dapla_metadata.datasets.statistic_subject_mapping import StatisticSubjectMapping
 from dapla_metadata.datasets.utility.constants import (
     DEFAULT_SPATIAL_COVERAGE_DESCRIPTION,
 )
-from dapla_metadata.datasets.utility.constants import INCONSISTENCIES_MESSAGE
 from dapla_metadata.datasets.utility.constants import METADATA_DOCUMENT_FILE_SUFFIX
 from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_DATASET_FIELDS
 from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_VARIABLES_FIELDS
+from dapla_metadata.datasets.utility.urn import convert_uris_to_urns
+from dapla_metadata.datasets.utility.urn import klass_urn_converter
+from dapla_metadata.datasets.utility.urn import vardef_urn_converter
+from dapla_metadata.datasets.utility.utils import OptionalDatadocMetadataType
+from dapla_metadata.datasets.utility.utils import VariableListType
+from dapla_metadata.datasets.utility.utils import VariableType
 from dapla_metadata.datasets.utility.utils import calculate_percentage
 from dapla_metadata.datasets.utility.utils import derive_assessment_from_state
 from dapla_metadata.datasets.utility.utils import get_timestamp_now
-from dapla_metadata.datasets.utility.utils import merge_variables
 from dapla_metadata.datasets.utility.utils import normalize_path
 from dapla_metadata.datasets.utility.utils import (
     num_obligatory_dataset_fields_completed,
@@ -41,8 +48,9 @@ from dapla_metadata.datasets.utility.utils import (
 from dapla_metadata.datasets.utility.utils import (
     num_obligatory_variables_fields_completed,
 )
-from dapla_metadata.datasets.utility.utils import
+from dapla_metadata.datasets.utility.utils import set_dataset_owner
 from dapla_metadata.datasets.utility.utils import set_default_values_dataset
+from dapla_metadata.datasets.utility.utils import set_default_values_pseudonymization
 from dapla_metadata.datasets.utility.utils import set_default_values_variables
 
 if TYPE_CHECKING:
@@ -51,18 +59,9 @@ if TYPE_CHECKING:
 
     from cloudpathlib import CloudPath
 
-
 logger = logging.getLogger(__name__)
 
 
-class InconsistentDatasetsWarning(UserWarning):
-    """Existing and new datasets differ significantly from one another."""
-
-
-class InconsistentDatasetsError(ValueError):
-    """Existing and new datasets differ significantly from one another."""
-
-
 class Datadoc:
     """Handle reading, updating and writing of metadata.
 
@@ -83,8 +82,8 @@ class Datadoc:
         dataset_path: str | None = None,
         metadata_document_path: str | None = None,
         statistic_subject_mapping: StatisticSubjectMapping | None = None,
-        *,
         errors_as_warnings: bool = False,
+        validate_required_fields_on_existing_metadata: bool = False,
     ) -> None:
         """Initialize the Datadoc instance.
 
@@ -100,16 +99,23 @@
                 Defaults to None
             errors_as_warnings: Disable raising exceptions if inconsistencies
                 are found between existing and extracted metadata.
+            validate_required_fields_on_existing_metadata: Use a Pydantic model
+                which validates whether required fields are present when reading
+                in an existing metadata file.
         """
         self._statistic_subject_mapping = statistic_subject_mapping
         self.errors_as_warnings = errors_as_warnings
+        self.validate_required_fields_on_existing_metadata = (
+            validate_required_fields_on_existing_metadata
+        )
         self.metadata_document: pathlib.Path | CloudPath | None = None
-        self.container:
+        self.container: all_optional_model.MetadataContainer | None = None
         self.dataset_path: pathlib.Path | CloudPath | None = None
-        self.dataset =
-        self.variables:
-        self.variables_lookup: dict[str,
+        self.dataset = all_optional_model.Dataset()
+        self.variables: VariableListType = []
+        self.variables_lookup: dict[str, VariableType] = {}
         self.explicitly_defined_metadata_document = False
+        self.dataset_consistency_status: list[DatasetConsistencyStatus] = []
         if metadata_document_path:
             self.metadata_document = normalize_path(metadata_document_path)
             self.explicitly_defined_metadata_document = True
@@ -145,8 +151,9 @@ class Datadoc:
         - The 'contains_personal_data' attribute is set to False if not specified.
         - A lookup dictionary for variables is created based on their short names.
         """
-        extracted_metadata:
-        existing_metadata:
+        extracted_metadata: all_optional_model.DatadocMetadata | None = None
+        existing_metadata: OptionalDatadocMetadataType = None
+
         if self.metadata_document and self.metadata_document.exists():
             existing_metadata = self._extract_metadata_from_existing_document(
                 self.metadata_document,
@@ -154,11 +161,28 @@
 
         if (
             self.dataset_path is not None
-            and self.dataset ==
+            and self.dataset == all_optional_model.Dataset()
             and len(self.variables) == 0
         ):
             extracted_metadata = self._extract_metadata_from_dataset(self.dataset_path)
 
+        if (
+            self.dataset_path
+            and self.metadata_document
+            and extracted_metadata
+            and existing_metadata
+        ):
+            self.dataset_consistency_status = check_dataset_consistency(
+                self.dataset_path,
+                self.metadata_document,
+            )
+            self.dataset_consistency_status.extend(
+                check_variables_consistency(
+                    extracted_metadata.variables or [],
+                    existing_metadata.variables or [],
+                )
+            )
+
         if (
             self.dataset_path
             and self.explicitly_defined_metadata_document
@@ -167,15 +191,11 @@
             and extracted_metadata is not None
             and existing_metadata is not None
         ):
-
-
-                self.dataset_path,
-                Path(existing_file_path),
-                extracted_metadata,
-                existing_metadata,
+            check_ready_to_merge(
+                self.dataset_consistency_status,
                 errors_as_warnings=self.errors_as_warnings,
             )
-            merged_metadata =
+            merged_metadata = merge_metadata(
                 extracted_metadata,
                 existing_metadata,
             )
@@ -187,157 +207,35 @@
             self._set_metadata(merged_metadata)
         else:
             self._set_metadata(existing_metadata or extracted_metadata)
-        set_default_values_variables(self.variables)
-        set_default_values_dataset(self.dataset)
-        self._create_variables_lookup()
-
-    def _get_existing_file_path(
-        self,
-        extracted_metadata: model.DatadocMetadata | None,
-    ) -> str:
-        if (
-            extracted_metadata is not None
-            and extracted_metadata.dataset is not None
-            and extracted_metadata.dataset.file_path is not None
-        ):
-            return extracted_metadata.dataset.file_path
-        msg = "Could not access existing dataset file path"
-        raise ValueError(msg)
 
     def _set_metadata(
         self,
-
+        metadata: OptionalDatadocMetadataType,
     ) -> None:
-        if not
-            merged_metadata.dataset and merged_metadata.variables
-        ):
+        if not metadata or not (metadata.dataset and metadata.variables):
             msg = "Could not read metadata"
             raise ValueError(msg)
-        self.dataset =
-        self.variables =
+        self.dataset = cast("all_optional_model.Dataset", metadata.dataset)
+        self.variables = metadata.variables
+
+        set_default_values_variables(self.variables)
+        set_default_values_dataset(cast("all_optional_model.Dataset", self.dataset))
+        set_dataset_owner(self.dataset)
+        convert_uris_to_urns(self.variables, "definition_uri", [vardef_urn_converter])
+        convert_uris_to_urns(
+            self.variables, "classification_uri", [klass_urn_converter]
+        )
+        self._create_variables_lookup()
 
     def _create_variables_lookup(self) -> None:
         self.variables_lookup = {
             v.short_name: v for v in self.variables if v.short_name
         }
 
-    @staticmethod
-    def _check_ready_to_merge(
-        new_dataset_path: Path | CloudPath,
-        existing_dataset_path: Path,
-        extracted_metadata: model.DatadocMetadata,
-        existing_metadata: model.DatadocMetadata,
-        *,
-        errors_as_warnings: bool,
-    ) -> None:
-        """Check if the datasets are consistent enough to make a successful merge of metadata.
-
-        Args:
-            new_dataset_path: Path to the dataset to be documented.
-            existing_dataset_path: Path stored in the existing metadata.
-            extracted_metadata: Metadata extracted from a physical dataset.
-            existing_metadata: Metadata from a previously created metadata document.
-            errors_as_warnings: True if failing checks should be raised as warnings, not errors.
-
-        Raises:
-            InconsistentDatasetsError: If inconsistencies are found and `errors_as_warnings == False`
-        """
-        new_dataset_path_info = DaplaDatasetPathInfo(new_dataset_path)
-        existing_dataset_path_info = DaplaDatasetPathInfo(existing_dataset_path)
-        results = [
-            {
-                "name": "Bucket name",
-                "success": (
-                    new_dataset_path_info.bucket_name
-                    == existing_dataset_path_info.bucket_name
-                ),
-            },
-            {
-                "name": "Data product name",
-                "success": (
-                    new_dataset_path_info.statistic_short_name
-                    == existing_dataset_path_info.statistic_short_name
-                ),
-            },
-            {
-                "name": "Dataset state",
-                "success": (
-                    new_dataset_path_info.dataset_state
-                    == existing_dataset_path_info.dataset_state
-                ),
-            },
-            {
-                "name": "Dataset short name",
-                "success": (
-                    new_dataset_path_info.dataset_short_name
-                    == existing_dataset_path_info.dataset_short_name
-                ),
-            },
-            {
-                "name": "Variable names",
-                "success": (
-                    {v.short_name for v in extracted_metadata.variables or []}
-                    == {v.short_name for v in existing_metadata.variables or []}
-                ),
-            },
-            {
-                "name": "Variable datatypes",
-                "success": (
-                    [v.data_type for v in extracted_metadata.variables or []]
-                    == [v.data_type for v in existing_metadata.variables or []]
-                ),
-            },
-        ]
-        if failures := [result for result in results if not result["success"]]:
-            msg = f"{INCONSISTENCIES_MESSAGE} {', '.join(str(f['name']) for f in failures)}"
-            if errors_as_warnings:
-                warnings.warn(
-                    message=msg,
-                    category=InconsistentDatasetsWarning,
-                    stacklevel=2,
-                )
-            else:
-                raise InconsistentDatasetsError(
-                    msg,
-                )
-
-    @staticmethod
-    def _merge_metadata(
-        extracted_metadata: model.DatadocMetadata | None,
-        existing_metadata: model.DatadocMetadata | None,
-    ) -> model.DatadocMetadata:
-        if not existing_metadata:
-            logger.warning(
-                "No existing metadata found, no merge to perform. Continuing with extracted metadata.",
-            )
-            return extracted_metadata or model.DatadocMetadata()
-
-        if not extracted_metadata:
-            return existing_metadata
-
-        # Use the extracted metadata as a base
-        merged_metadata = model.DatadocMetadata(
-            dataset=copy.deepcopy(extracted_metadata.dataset),
-            variables=[],
-        )
-
-        override_dataset_fields(
-            merged_metadata=merged_metadata,
-            existing_metadata=existing_metadata,
-        )
-
-        # Merge variables.
-        # For each extracted variable, copy existing metadata into the merged metadata
-        return merge_variables(
-            existing_metadata=existing_metadata,
-            extracted_metadata=extracted_metadata,
-            merged_metadata=merged_metadata,
-        )
-
     def _extract_metadata_from_existing_document(
         self,
         document: pathlib.Path | CloudPath,
-    ) ->
+    ) -> OptionalDatadocMetadataType:
         """Read metadata from an existing metadata document.
 
         If an existing metadata document is available, this method reads and
@@ -352,7 +250,13 @@ class Datadoc:
 
         Raises:
             json.JSONDecodeError: If the metadata document cannot be parsed.
+            pydantic.ValidationError: If the data does not successfully validate.
         """
+        metadata_model = (
+            required_model
+            if self.validate_required_fields_on_existing_metadata
+            else all_optional_model
+        )
         fresh_metadata = {}
         try:
             with document.open(mode="r", encoding="utf-8") as file:
@@ -362,7 +266,7 @@
                 fresh_metadata,
             )
             if is_metadata_in_container_structure(fresh_metadata):
-                self.container =
+                self.container = metadata_model.MetadataContainer.model_validate_json(
                     json.dumps(fresh_metadata),
                 )
                 datadoc_metadata = fresh_metadata["datadoc"]
@@ -370,7 +274,7 @@
                 datadoc_metadata = fresh_metadata
             if datadoc_metadata is None:
                 return None
-            return
+            return metadata_model.DatadocMetadata.model_validate_json(
                 json.dumps(datadoc_metadata),
             )
         except json.JSONDecodeError:
@@ -414,7 +318,7 @@
     def _extract_metadata_from_dataset(
         self,
         dataset: pathlib.Path | CloudPath,
-    ) ->
+    ) -> all_optional_model.DatadocMetadata:
        """Obtain what metadata we can from the dataset itself.
 
        This makes it easier for the user by 'pre-filling' certain fields.
@@ -434,9 +338,9 @@
             - variables: A list of fields extracted from the dataset schema.
        """
        dapla_dataset_path_info = DaplaDatasetPathInfo(dataset)
-        metadata =
+        metadata = all_optional_model.DatadocMetadata()
 
-        metadata.dataset =
+        metadata.dataset = all_optional_model.Dataset(
            short_name=dapla_dataset_path_info.dataset_short_name,
            dataset_state=dapla_dataset_path_info.dataset_state,
            dataset_status=DataSetStatus.DRAFT,
@@ -471,6 +375,19 @@
        """
        return dataset_path.parent / (dataset_path.stem + METADATA_DOCUMENT_FILE_SUFFIX)
 
+    def datadoc_model(self) -> all_optional_model.MetadataContainer:
+        """Return the underlying datadoc model."""
+        datadoc: ValidateDatadocMetadata = ValidateDatadocMetadata(
+            percentage_complete=self.percent_complete,
+            dataset=self.dataset,
+            variables=self.variables,
+        )
+        if self.container:
+            res = copy.deepcopy(self.container)
+            res.datadoc = datadoc
+            return res
+        return all_optional_model.MetadataContainer(datadoc=datadoc)
+
     def write_metadata_document(self) -> None:
         """Write all currently known metadata to file.
 
@@ -500,12 +417,15 @@
         if self.container:
             self.container.datadoc = datadoc
         else:
-            self.container =
+            self.container = all_optional_model.MetadataContainer(datadoc=datadoc)
         if self.metadata_document:
             content = self.container.model_dump_json(indent=4)
             self.metadata_document.write_text(content)
             logger.info("Saved metadata document %s", self.metadata_document)
-            logger.info(
+            logger.info(
+                "Metadata content",
+                extra={"metadata_content": json.loads(content)},
+            )
         else:
             msg = "No metadata document to save"
             raise ValueError(msg)
@@ -525,3 +445,37 @@
             self.dataset,
         ) + num_obligatory_variables_fields_completed(self.variables)
         return calculate_percentage(num_set_fields, num_all_fields)
+
+    def add_pseudonymization(
+        self,
+        variable_short_name: str,
+        pseudonymization: all_optional_model.Pseudonymization | None = None,
+    ) -> None:
+        """Adds a new pseudo variable to the list of pseudonymized variables.
+
+        If `pseudonymization` is not supplied, an empty Pseudonymization structure
+        will be created and assigned to the variable.
+        If an encryption algorithm is recognized (one of the standard Dapla algorithms), default values are filled
+        for any missing fields.
+
+        Args:
+            variable_short_name: The short name for the variable that one wants to update the pseudo for.
+            pseudonymization: The updated pseudonymization.
+
+        """
+        variable = self.variables_lookup[variable_short_name]
+        if pseudonymization:
+            set_default_values_pseudonymization(variable, pseudonymization)
+        else:
+            variable.pseudonymization = all_optional_model.Pseudonymization()
+
+    def remove_pseudonymization(self, variable_short_name: str) -> None:
+        """Removes a pseudo variable by using the shortname.
+
+        Updates the pseudo variable lookup by creating a new one.
+
+        Args:
+            variable_short_name: The short name for the variable that one wants to remove the pseudo for.
+        """
+        if self.variables_lookup[variable_short_name].pseudonymization is not None:
+            self.variables_lookup[variable_short_name].pseudonymization = None