dapla-toolbelt-metadata 0.2.1__py3-none-any.whl → 0.9.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: the registry flags this as a potentially problematic release of dapla-toolbelt-metadata.

Files changed (97)
  1. dapla_metadata/__init__.py +11 -1
  2. dapla_metadata/_shared/__init__.py +1 -0
  3. dapla_metadata/_shared/config.py +109 -0
  4. dapla_metadata/_shared/enums.py +27 -0
  5. dapla_metadata/_shared/py.typed +0 -0
  6. dapla_metadata/dapla/__init__.py +4 -0
  7. dapla_metadata/dapla/user_info.py +138 -0
  8. dapla_metadata/datasets/__init__.py +1 -1
  9. dapla_metadata/datasets/_merge.py +333 -0
  10. dapla_metadata/datasets/code_list.py +5 -6
  11. dapla_metadata/datasets/compatibility/__init__.py +10 -0
  12. dapla_metadata/datasets/compatibility/_handlers.py +363 -0
  13. dapla_metadata/datasets/compatibility/_utils.py +259 -0
  14. dapla_metadata/datasets/compatibility/model_backwards_compatibility.py +135 -0
  15. dapla_metadata/datasets/core.py +136 -182
  16. dapla_metadata/datasets/dapla_dataset_path_info.py +145 -19
  17. dapla_metadata/datasets/dataset_parser.py +41 -28
  18. dapla_metadata/datasets/model_validation.py +29 -20
  19. dapla_metadata/datasets/statistic_subject_mapping.py +5 -1
  20. dapla_metadata/datasets/utility/constants.py +22 -15
  21. dapla_metadata/datasets/utility/enums.py +8 -20
  22. dapla_metadata/datasets/utility/urn.py +234 -0
  23. dapla_metadata/datasets/utility/utils.py +183 -111
  24. dapla_metadata/standards/__init__.py +4 -0
  25. dapla_metadata/standards/name_validator.py +250 -0
  26. dapla_metadata/standards/standard_validators.py +98 -0
  27. dapla_metadata/standards/utils/__init__.py +1 -0
  28. dapla_metadata/standards/utils/constants.py +49 -0
  29. dapla_metadata/variable_definitions/__init__.py +11 -0
  30. dapla_metadata/variable_definitions/_generated/.openapi-generator/FILES +20 -0
  31. dapla_metadata/variable_definitions/_generated/.openapi-generator/VERSION +1 -0
  32. dapla_metadata/variable_definitions/_generated/.openapi-generator-ignore +6 -0
  33. dapla_metadata/variable_definitions/_generated/README.md +148 -0
  34. dapla_metadata/variable_definitions/_generated/__init__.py +0 -0
  35. dapla_metadata/variable_definitions/_generated/vardef_client/__init__.py +47 -0
  36. dapla_metadata/variable_definitions/_generated/vardef_client/api/__init__.py +8 -0
  37. dapla_metadata/variable_definitions/_generated/vardef_client/api/data_migration_api.py +766 -0
  38. dapla_metadata/variable_definitions/_generated/vardef_client/api/draft_variable_definitions_api.py +888 -0
  39. dapla_metadata/variable_definitions/_generated/vardef_client/api/patches_api.py +888 -0
  40. dapla_metadata/variable_definitions/_generated/vardef_client/api/validity_periods_api.py +583 -0
  41. dapla_metadata/variable_definitions/_generated/vardef_client/api/variable_definitions_api.py +613 -0
  42. dapla_metadata/variable_definitions/_generated/vardef_client/api_client.py +779 -0
  43. dapla_metadata/variable_definitions/_generated/vardef_client/api_response.py +27 -0
  44. dapla_metadata/variable_definitions/_generated/vardef_client/configuration.py +474 -0
  45. dapla_metadata/variable_definitions/_generated/vardef_client/docs/CompleteResponse.md +51 -0
  46. dapla_metadata/variable_definitions/_generated/vardef_client/docs/Contact.md +30 -0
  47. dapla_metadata/variable_definitions/_generated/vardef_client/docs/DataMigrationApi.md +90 -0
  48. dapla_metadata/variable_definitions/_generated/vardef_client/docs/Draft.md +42 -0
  49. dapla_metadata/variable_definitions/_generated/vardef_client/docs/DraftVariableDefinitionsApi.md +259 -0
  50. dapla_metadata/variable_definitions/_generated/vardef_client/docs/LanguageStringType.md +31 -0
  51. dapla_metadata/variable_definitions/_generated/vardef_client/docs/Owner.md +31 -0
  52. dapla_metadata/variable_definitions/_generated/vardef_client/docs/Patch.md +43 -0
  53. dapla_metadata/variable_definitions/_generated/vardef_client/docs/PatchesApi.md +249 -0
  54. dapla_metadata/variable_definitions/_generated/vardef_client/docs/PublicApi.md +218 -0
  55. dapla_metadata/variable_definitions/_generated/vardef_client/docs/SupportedLanguages.md +15 -0
  56. dapla_metadata/variable_definitions/_generated/vardef_client/docs/UpdateDraft.md +44 -0
  57. dapla_metadata/variable_definitions/_generated/vardef_client/docs/ValidityPeriod.md +42 -0
  58. dapla_metadata/variable_definitions/_generated/vardef_client/docs/ValidityPeriodsApi.md +236 -0
  59. dapla_metadata/variable_definitions/_generated/vardef_client/docs/VariableDefinitionsApi.md +304 -0
  60. dapla_metadata/variable_definitions/_generated/vardef_client/docs/VariableStatus.md +17 -0
  61. dapla_metadata/variable_definitions/_generated/vardef_client/exceptions.py +193 -0
  62. dapla_metadata/variable_definitions/_generated/vardef_client/models/__init__.py +31 -0
  63. dapla_metadata/variable_definitions/_generated/vardef_client/models/complete_response.py +260 -0
  64. dapla_metadata/variable_definitions/_generated/vardef_client/models/contact.py +94 -0
  65. dapla_metadata/variable_definitions/_generated/vardef_client/models/draft.py +228 -0
  66. dapla_metadata/variable_definitions/_generated/vardef_client/models/get_vardok_vardef_mapping_by_id200_response.py +158 -0
  67. dapla_metadata/variable_definitions/_generated/vardef_client/models/language_string_type.py +101 -0
  68. dapla_metadata/variable_definitions/_generated/vardef_client/models/owner.py +87 -0
  69. dapla_metadata/variable_definitions/_generated/vardef_client/models/patch.py +244 -0
  70. dapla_metadata/variable_definitions/_generated/vardef_client/models/problem.py +118 -0
  71. dapla_metadata/variable_definitions/_generated/vardef_client/models/update_draft.py +274 -0
  72. dapla_metadata/variable_definitions/_generated/vardef_client/models/validity_period.py +225 -0
  73. dapla_metadata/variable_definitions/_generated/vardef_client/models/vardok_id_response.py +81 -0
  74. dapla_metadata/variable_definitions/_generated/vardef_client/models/vardok_vardef_id_pair_response.py +84 -0
  75. dapla_metadata/variable_definitions/_generated/vardef_client/models/variable_status.py +33 -0
  76. dapla_metadata/variable_definitions/_generated/vardef_client/py.typed +0 -0
  77. dapla_metadata/variable_definitions/_generated/vardef_client/rest.py +249 -0
  78. dapla_metadata/variable_definitions/_utils/__init__.py +1 -0
  79. dapla_metadata/variable_definitions/_utils/_client.py +32 -0
  80. dapla_metadata/variable_definitions/_utils/config.py +54 -0
  81. dapla_metadata/variable_definitions/_utils/constants.py +80 -0
  82. dapla_metadata/variable_definitions/_utils/files.py +309 -0
  83. dapla_metadata/variable_definitions/_utils/template_files.py +99 -0
  84. dapla_metadata/variable_definitions/_utils/variable_definition_files.py +143 -0
  85. dapla_metadata/variable_definitions/exceptions.py +255 -0
  86. dapla_metadata/variable_definitions/vardef.py +372 -0
  87. dapla_metadata/variable_definitions/vardok_id.py +48 -0
  88. dapla_metadata/variable_definitions/vardok_vardef_id_pair.py +47 -0
  89. dapla_metadata/variable_definitions/variable_definition.py +422 -0
  90. {dapla_toolbelt_metadata-0.2.1.dist-info → dapla_toolbelt_metadata-0.9.11.dist-info}/METADATA +34 -36
  91. dapla_toolbelt_metadata-0.9.11.dist-info/RECORD +97 -0
  92. {dapla_toolbelt_metadata-0.2.1.dist-info → dapla_toolbelt_metadata-0.9.11.dist-info}/WHEEL +1 -1
  93. dapla_metadata/datasets/config.py +0 -80
  94. dapla_metadata/datasets/model_backwards_compatibility.py +0 -520
  95. dapla_metadata/datasets/user_info.py +0 -88
  96. dapla_toolbelt_metadata-0.2.1.dist-info/RECORD +0 -22
  97. {dapla_toolbelt_metadata-0.2.1.dist-info → dapla_toolbelt_metadata-0.9.11.dist-info/licenses}/LICENSE +0 -0
dapla_metadata/datasets/compatibility/model_backwards_compatibility.py (new file)
@@ -0,0 +1,135 @@
+ """Upgrade old metadata files to be compatible with new versions.
+
+ An important principle of Datadoc is that we ALWAYS guarantee backwards
+ compatibility of existing metadata documents. This means that we guarantee
+ that a user will never lose data, even if their document is decades old.
+
+ For each document version we release with breaking changes, we implement a
+ handler and register the version by defining a BackwardsCompatibleVersion
+ instance. These documents will then be upgraded when they're opened in Datadoc.
+
+ A test must also be implemented for each new version.
+ """
+
+ from __future__ import annotations
+
+ import logging
+ from collections import OrderedDict
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING
+ from typing import Any
+
+ from dapla_metadata.datasets.compatibility._handlers import handle_current_version
+ from dapla_metadata.datasets.compatibility._handlers import handle_version_0_1_1
+ from dapla_metadata.datasets.compatibility._handlers import handle_version_1_0_0
+ from dapla_metadata.datasets.compatibility._handlers import handle_version_2_1_0
+ from dapla_metadata.datasets.compatibility._handlers import handle_version_2_2_0
+ from dapla_metadata.datasets.compatibility._handlers import handle_version_3_1_0
+ from dapla_metadata.datasets.compatibility._handlers import handle_version_3_2_0
+ from dapla_metadata.datasets.compatibility._handlers import handle_version_3_3_0
+ from dapla_metadata.datasets.compatibility._handlers import handle_version_4_0_0
+ from dapla_metadata.datasets.compatibility._handlers import handle_version_5_0_1
+ from dapla_metadata.datasets.compatibility._handlers import handle_version_6_0_0
+ from dapla_metadata.datasets.compatibility._utils import DATADOC_KEY
+ from dapla_metadata.datasets.compatibility._utils import DOCUMENT_VERSION_KEY
+ from dapla_metadata.datasets.compatibility._utils import UnknownModelVersionError
+ from dapla_metadata.datasets.compatibility._utils import (
+     is_metadata_in_container_structure,
+ )
+
+ logger = logging.getLogger(__name__)
+
+ if TYPE_CHECKING:
+     from collections.abc import Callable
+
+ SUPPORTED_VERSIONS: OrderedDict[str, BackwardsCompatibleVersion] = OrderedDict()
+
+
+ @dataclass()
+ class BackwardsCompatibleVersion:
+     """A version which we support with backwards compatibility.
+
+     This class registers a version and its corresponding handler function
+     for backwards compatibility.
+     """
+
+     version: str
+     handler: Callable[[dict[str, Any]], dict[str, Any]]
+
+     def __post_init__(self) -> None:
+         """Register this version in the supported versions map.
+
+         This method adds the instance to the `SUPPORTED_VERSIONS` dictionary
+         using the version as the key.
+         """
+         SUPPORTED_VERSIONS[self.version] = self
+
+     def upgrade(self, metadata: dict[str, Any]) -> dict[str, Any]:
+         """Upgrade metadata from the format of the previous version to the format of this version.
+
+         This method handles bumping the Document Version field so it's not necessary to do this in
+         the individual handler functions.
+
+         Args:
+             metadata (dict[str, Any]): Metadata in the format of the previous version, to be upgraded.
+
+         Returns:
+             dict[str, Any]: The metadata upgraded to the version specified
+         """
+         metadata = self.handler(metadata)
+         if is_metadata_in_container_structure(metadata):
+             metadata[DATADOC_KEY][DOCUMENT_VERSION_KEY] = self.version
+         else:
+             metadata[DOCUMENT_VERSION_KEY] = self.version
+         return metadata
+
+
+ # Register all the supported versions and their handlers.
+ BackwardsCompatibleVersion(version="0.1.1", handler=handle_version_0_1_1)
+ BackwardsCompatibleVersion(version="1.0.0", handler=handle_version_1_0_0)
+ BackwardsCompatibleVersion(version="2.1.0", handler=handle_version_2_1_0)
+ BackwardsCompatibleVersion(version="2.2.0", handler=handle_version_2_2_0)
+ BackwardsCompatibleVersion(version="3.1.0", handler=handle_version_3_1_0)
+ BackwardsCompatibleVersion(version="3.2.0", handler=handle_version_3_2_0)
+ BackwardsCompatibleVersion(version="3.3.0", handler=handle_version_3_3_0)
+ BackwardsCompatibleVersion(version="4.0.0", handler=handle_version_4_0_0)
+ BackwardsCompatibleVersion(version="5.0.1", handler=handle_version_5_0_1)
+ BackwardsCompatibleVersion(version="6.0.0", handler=handle_version_6_0_0)
+ BackwardsCompatibleVersion(version="6.1.0", handler=handle_current_version)
+
+
+ def upgrade_metadata(fresh_metadata: dict[str, Any]) -> dict[str, Any]:
+     """Upgrade the metadata to the latest version using registered handlers.
+
+     This function checks the version of the provided metadata and applies a series
+     of upgrade handlers to migrate the metadata to the latest version.
+     It starts from the provided version and applies all subsequent handlers in
+     sequence. If the metadata is already in the latest version or the version
+     cannot be determined, appropriate actions are taken.
+
+     Args:
+         fresh_metadata: The metadata dictionary to be upgraded. This dictionary
+             must include version information that determines which handlers to apply.
+
+     Returns:
+         The upgraded metadata dictionary, after applying all necessary handlers.
+
+     Raises:
+         UnknownModelVersionError: If the metadata's version is unknown or unsupported.
+     """
+     if is_metadata_in_container_structure(fresh_metadata):
+         if fresh_metadata[DATADOC_KEY] is None:
+             return fresh_metadata
+         supplied_version = fresh_metadata[DATADOC_KEY][DOCUMENT_VERSION_KEY]
+     else:
+         supplied_version = fresh_metadata[DOCUMENT_VERSION_KEY]
+     start_running_handlers = False
+     # Run all the handlers in order from the supplied version onwards
+     for k, v in SUPPORTED_VERSIONS.items():
+         if k == supplied_version:
+             start_running_handlers = True
+         if start_running_handlers:
+             fresh_metadata = v.upgrade(fresh_metadata)
+     if not start_running_handlers:
+         raise UnknownModelVersionError(supplied_version)
+     return fresh_metadata
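
With the registration pattern above, callers never invoke the individual version handlers directly; they hand the raw dictionary to upgrade_metadata and get back a document in the newest format. A minimal usage sketch (the file name is a placeholder, not taken from the diff):

    import json
    from pathlib import Path

    from dapla_metadata.datasets.compatibility import upgrade_metadata

    # Hypothetical metadata document produced by an older Datadoc version.
    document = Path("person_data_p2021_v1__DOC.json")
    old_metadata = json.loads(document.read_text(encoding="utf-8"))

    # Runs every registered handler from the document's version up to 6.1.0;
    # an unrecognised version raises UnknownModelVersionError.
    upgraded = upgrade_metadata(old_metadata)
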
dapla_metadata/datasets/core.py
@@ -5,35 +5,42 @@ from __future__ import annotations
  import copy
  import json
  import logging
- import warnings
  from concurrent.futures import ThreadPoolExecutor
- from pathlib import Path
  from typing import TYPE_CHECKING
-
- from datadoc_model import model
- from datadoc_model.model import DataSetStatus
-
- from dapla_metadata.datasets import config
- from dapla_metadata.datasets import user_info
+ from typing import cast
+
+ import datadoc_model.all_optional.model as all_optional_model
+ import datadoc_model.required.model as required_model
+ from datadoc_model.all_optional.model import DataSetStatus
+
+ from dapla_metadata._shared import config
+ from dapla_metadata.dapla import user_info
+ from dapla_metadata.datasets._merge import DatasetConsistencyStatus
+ from dapla_metadata.datasets._merge import check_dataset_consistency
+ from dapla_metadata.datasets._merge import check_ready_to_merge
+ from dapla_metadata.datasets._merge import check_variables_consistency
+ from dapla_metadata.datasets._merge import merge_metadata
+ from dapla_metadata.datasets.compatibility import is_metadata_in_container_structure
+ from dapla_metadata.datasets.compatibility import upgrade_metadata
  from dapla_metadata.datasets.dapla_dataset_path_info import DaplaDatasetPathInfo
  from dapla_metadata.datasets.dataset_parser import DatasetParser
- from dapla_metadata.datasets.model_backwards_compatibility import (
-     is_metadata_in_container_structure,
- )
- from dapla_metadata.datasets.model_backwards_compatibility import upgrade_metadata
  from dapla_metadata.datasets.model_validation import ValidateDatadocMetadata
  from dapla_metadata.datasets.statistic_subject_mapping import StatisticSubjectMapping
  from dapla_metadata.datasets.utility.constants import (
      DEFAULT_SPATIAL_COVERAGE_DESCRIPTION,
  )
- from dapla_metadata.datasets.utility.constants import INCONSISTENCIES_MESSAGE
  from dapla_metadata.datasets.utility.constants import METADATA_DOCUMENT_FILE_SUFFIX
  from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_DATASET_FIELDS
  from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_VARIABLES_FIELDS
+ from dapla_metadata.datasets.utility.urn import convert_uris_to_urns
+ from dapla_metadata.datasets.utility.urn import klass_urn_converter
+ from dapla_metadata.datasets.utility.urn import vardef_urn_converter
+ from dapla_metadata.datasets.utility.utils import OptionalDatadocMetadataType
+ from dapla_metadata.datasets.utility.utils import VariableListType
+ from dapla_metadata.datasets.utility.utils import VariableType
  from dapla_metadata.datasets.utility.utils import calculate_percentage
  from dapla_metadata.datasets.utility.utils import derive_assessment_from_state
  from dapla_metadata.datasets.utility.utils import get_timestamp_now
- from dapla_metadata.datasets.utility.utils import merge_variables
  from dapla_metadata.datasets.utility.utils import normalize_path
  from dapla_metadata.datasets.utility.utils import (
      num_obligatory_dataset_fields_completed,
@@ -41,8 +48,9 @@ from dapla_metadata.datasets.utility.utils import (
  from dapla_metadata.datasets.utility.utils import (
      num_obligatory_variables_fields_completed,
  )
- from dapla_metadata.datasets.utility.utils import override_dataset_fields
+ from dapla_metadata.datasets.utility.utils import set_dataset_owner
  from dapla_metadata.datasets.utility.utils import set_default_values_dataset
+ from dapla_metadata.datasets.utility.utils import set_default_values_pseudonymization
  from dapla_metadata.datasets.utility.utils import set_default_values_variables

  if TYPE_CHECKING:
@@ -51,18 +59,9 @@ if TYPE_CHECKING:

      from cloudpathlib import CloudPath

-
  logger = logging.getLogger(__name__)


- class InconsistentDatasetsWarning(UserWarning):
-     """Existing and new datasets differ significantly from one another."""
-
-
- class InconsistentDatasetsError(ValueError):
-     """Existing and new datasets differ significantly from one another."""
-
-
  class Datadoc:
      """Handle reading, updating and writing of metadata.

@@ -83,8 +82,8 @@ class Datadoc:
          dataset_path: str | None = None,
          metadata_document_path: str | None = None,
          statistic_subject_mapping: StatisticSubjectMapping | None = None,
-         *,
          errors_as_warnings: bool = False,
+         validate_required_fields_on_existing_metadata: bool = False,
      ) -> None:
          """Initialize the Datadoc instance.

@@ -100,16 +99,23 @@
                  Defaults to None
              errors_as_warnings: Disable raising exceptions if inconsistencies
                  are found between existing and extracted metadata.
+             validate_required_fields_on_existing_metadata: Use a Pydantic model
+                 which validates whether required fields are present when reading
+                 in an existing metadata file.
          """
          self._statistic_subject_mapping = statistic_subject_mapping
          self.errors_as_warnings = errors_as_warnings
+         self.validate_required_fields_on_existing_metadata = (
+             validate_required_fields_on_existing_metadata
+         )
          self.metadata_document: pathlib.Path | CloudPath | None = None
-         self.container: model.MetadataContainer | None = None
+         self.container: all_optional_model.MetadataContainer | None = None
          self.dataset_path: pathlib.Path | CloudPath | None = None
-         self.dataset = model.Dataset()
-         self.variables: list = []
-         self.variables_lookup: dict[str, model.Variable] = {}
+         self.dataset = all_optional_model.Dataset()
+         self.variables: VariableListType = []
+         self.variables_lookup: dict[str, VariableType] = {}
          self.explicitly_defined_metadata_document = False
+         self.dataset_consistency_status: list[DatasetConsistencyStatus] = []
          if metadata_document_path:
              self.metadata_document = normalize_path(metadata_document_path)
              self.explicitly_defined_metadata_document = True
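
The constructor hunk above adds validate_required_fields_on_existing_metadata alongside the existing keyword arguments, so opting in to strict validation of an existing document is a one-line change. A minimal sketch, importing Datadoc from the module shown in this diff; the bucket paths are placeholders:

    from dapla_metadata.datasets.core import Datadoc

    # Placeholder paths, not taken from the diff.
    meta = Datadoc(
        dataset_path="gs://my-bucket/produkt/person_data_p2021_v1.parquet",
        metadata_document_path="gs://my-bucket/produkt/person_data_p2021_v1__DOC.json",
        errors_as_warnings=True,
        # Added in this diff: read the existing document with the required-fields model.
        validate_required_fields_on_existing_metadata=True,
    )
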
@@ -145,8 +151,9 @@ class Datadoc:
          - The 'contains_personal_data' attribute is set to False if not specified.
          - A lookup dictionary for variables is created based on their short names.
          """
-         extracted_metadata: model.DatadocMetadata | None = None
-         existing_metadata: model.DatadocMetadata | None = None
+         extracted_metadata: all_optional_model.DatadocMetadata | None = None
+         existing_metadata: OptionalDatadocMetadataType = None
+
          if self.metadata_document and self.metadata_document.exists():
              existing_metadata = self._extract_metadata_from_existing_document(
                  self.metadata_document,
@@ -154,11 +161,28 @@

          if (
              self.dataset_path is not None
-             and self.dataset == model.Dataset()
+             and self.dataset == all_optional_model.Dataset()
              and len(self.variables) == 0
          ):
              extracted_metadata = self._extract_metadata_from_dataset(self.dataset_path)

+         if (
+             self.dataset_path
+             and self.metadata_document
+             and extracted_metadata
+             and existing_metadata
+         ):
+             self.dataset_consistency_status = check_dataset_consistency(
+                 self.dataset_path,
+                 self.metadata_document,
+             )
+             self.dataset_consistency_status.extend(
+                 check_variables_consistency(
+                     extracted_metadata.variables or [],
+                     existing_metadata.variables or [],
+                 )
+             )
+
          if (
              self.dataset_path
              and self.explicitly_defined_metadata_document
@@ -167,15 +191,11 @@
              and extracted_metadata is not None
              and existing_metadata is not None
          ):
-             existing_file_path = self._get_existing_file_path(extracted_metadata)
-             self._check_ready_to_merge(
-                 self.dataset_path,
-                 Path(existing_file_path),
-                 extracted_metadata,
-                 existing_metadata,
+             check_ready_to_merge(
+                 self.dataset_consistency_status,
                  errors_as_warnings=self.errors_as_warnings,
              )
-             merged_metadata = self._merge_metadata(
+             merged_metadata = merge_metadata(
                  extracted_metadata,
                  existing_metadata,
              )
@@ -187,157 +207,35 @@
              self._set_metadata(merged_metadata)
          else:
              self._set_metadata(existing_metadata or extracted_metadata)
-         set_default_values_variables(self.variables)
-         set_default_values_dataset(self.dataset)
-         self._create_variables_lookup()
-
-     def _get_existing_file_path(
-         self,
-         extracted_metadata: model.DatadocMetadata | None,
-     ) -> str:
-         if (
-             extracted_metadata is not None
-             and extracted_metadata.dataset is not None
-             and extracted_metadata.dataset.file_path is not None
-         ):
-             return extracted_metadata.dataset.file_path
-         msg = "Could not access existing dataset file path"
-         raise ValueError(msg)

      def _set_metadata(
          self,
-         merged_metadata: model.DatadocMetadata | None,
+         metadata: OptionalDatadocMetadataType,
      ) -> None:
-         if not merged_metadata or not (
-             merged_metadata.dataset and merged_metadata.variables
-         ):
+         if not metadata or not (metadata.dataset and metadata.variables):
              msg = "Could not read metadata"
              raise ValueError(msg)
-         self.dataset = merged_metadata.dataset
-         self.variables = merged_metadata.variables
+         self.dataset = cast("all_optional_model.Dataset", metadata.dataset)
+         self.variables = metadata.variables
+
+         set_default_values_variables(self.variables)
+         set_default_values_dataset(cast("all_optional_model.Dataset", self.dataset))
+         set_dataset_owner(self.dataset)
+         convert_uris_to_urns(self.variables, "definition_uri", [vardef_urn_converter])
+         convert_uris_to_urns(
+             self.variables, "classification_uri", [klass_urn_converter]
+         )
+         self._create_variables_lookup()

      def _create_variables_lookup(self) -> None:
          self.variables_lookup = {
              v.short_name: v for v in self.variables if v.short_name
          }

-     @staticmethod
-     def _check_ready_to_merge(
-         new_dataset_path: Path | CloudPath,
-         existing_dataset_path: Path,
-         extracted_metadata: model.DatadocMetadata,
-         existing_metadata: model.DatadocMetadata,
-         *,
-         errors_as_warnings: bool,
-     ) -> None:
-         """Check if the datasets are consistent enough to make a successful merge of metadata.
-
-         Args:
-             new_dataset_path: Path to the dataset to be documented.
-             existing_dataset_path: Path stored in the existing metadata.
-             extracted_metadata: Metadata extracted from a physical dataset.
-             existing_metadata: Metadata from a previously created metadata document.
-             errors_as_warnings: True if failing checks should be raised as warnings, not errors.
-
-         Raises:
-             InconsistentDatasetsError: If inconsistencies are found and `errors_as_warnings == False`
-         """
-         new_dataset_path_info = DaplaDatasetPathInfo(new_dataset_path)
-         existing_dataset_path_info = DaplaDatasetPathInfo(existing_dataset_path)
-         results = [
-             {
-                 "name": "Bucket name",
-                 "success": (
-                     new_dataset_path_info.bucket_name
-                     == existing_dataset_path_info.bucket_name
-                 ),
-             },
-             {
-                 "name": "Data product name",
-                 "success": (
-                     new_dataset_path_info.statistic_short_name
-                     == existing_dataset_path_info.statistic_short_name
-                 ),
-             },
-             {
-                 "name": "Dataset state",
-                 "success": (
-                     new_dataset_path_info.dataset_state
-                     == existing_dataset_path_info.dataset_state
-                 ),
-             },
-             {
-                 "name": "Dataset short name",
-                 "success": (
-                     new_dataset_path_info.dataset_short_name
-                     == existing_dataset_path_info.dataset_short_name
-                 ),
-             },
-             {
-                 "name": "Variable names",
-                 "success": (
-                     {v.short_name for v in extracted_metadata.variables or []}
-                     == {v.short_name for v in existing_metadata.variables or []}
-                 ),
-             },
-             {
-                 "name": "Variable datatypes",
-                 "success": (
-                     [v.data_type for v in extracted_metadata.variables or []]
-                     == [v.data_type for v in existing_metadata.variables or []]
-                 ),
-             },
-         ]
-         if failures := [result for result in results if not result["success"]]:
-             msg = f"{INCONSISTENCIES_MESSAGE} {', '.join(str(f['name']) for f in failures)}"
-             if errors_as_warnings:
-                 warnings.warn(
-                     message=msg,
-                     category=InconsistentDatasetsWarning,
-                     stacklevel=2,
-                 )
-             else:
-                 raise InconsistentDatasetsError(
-                     msg,
-                 )
-
-     @staticmethod
-     def _merge_metadata(
-         extracted_metadata: model.DatadocMetadata | None,
-         existing_metadata: model.DatadocMetadata | None,
-     ) -> model.DatadocMetadata:
-         if not existing_metadata:
-             logger.warning(
-                 "No existing metadata found, no merge to perform. Continuing with extracted metadata.",
-             )
-             return extracted_metadata or model.DatadocMetadata()
-
-         if not extracted_metadata:
-             return existing_metadata
-
-         # Use the extracted metadata as a base
-         merged_metadata = model.DatadocMetadata(
-             dataset=copy.deepcopy(extracted_metadata.dataset),
-             variables=[],
-         )
-
-         override_dataset_fields(
-             merged_metadata=merged_metadata,
-             existing_metadata=existing_metadata,
-         )
-
-         # Merge variables.
-         # For each extracted variable, copy existing metadata into the merged metadata
-         return merge_variables(
-             existing_metadata=existing_metadata,
-             extracted_metadata=extracted_metadata,
-             merged_metadata=merged_metadata,
-         )
-
      def _extract_metadata_from_existing_document(
          self,
          document: pathlib.Path | CloudPath,
-     ) -> model.DatadocMetadata | None:
+     ) -> OptionalDatadocMetadataType:
          """Read metadata from an existing metadata document.

@@ -352,7 +250,13 @@ class Datadoc:

          Raises:
              json.JSONDecodeError: If the metadata document cannot be parsed.
+             pydantic.ValidationError: If the data does not successfully validate.
          """
+         metadata_model = (
+             required_model
+             if self.validate_required_fields_on_existing_metadata
+             else all_optional_model
+         )
          fresh_metadata = {}
          try:
              with document.open(mode="r", encoding="utf-8") as file:
@@ -362,7 +266,7 @@
                  fresh_metadata,
              )
              if is_metadata_in_container_structure(fresh_metadata):
-                 self.container = model.MetadataContainer.model_validate_json(
+                 self.container = metadata_model.MetadataContainer.model_validate_json(
                      json.dumps(fresh_metadata),
                  )
                  datadoc_metadata = fresh_metadata["datadoc"]
@@ -370,7 +274,7 @@
                  datadoc_metadata = fresh_metadata
              if datadoc_metadata is None:
                  return None
-             return model.DatadocMetadata.model_validate_json(
+             return metadata_model.DatadocMetadata.model_validate_json(
                  json.dumps(datadoc_metadata),
              )
          except json.JSONDecodeError:
@@ -414,7 +318,7 @@
      def _extract_metadata_from_dataset(
          self,
          dataset: pathlib.Path | CloudPath,
-     ) -> model.DatadocMetadata:
+     ) -> all_optional_model.DatadocMetadata:
          """Obtain what metadata we can from the dataset itself.

          This makes it easier for the user by 'pre-filling' certain fields.
@@ -434,9 +338,9 @@
          - variables: A list of fields extracted from the dataset schema.
          """
          dapla_dataset_path_info = DaplaDatasetPathInfo(dataset)
-         metadata = model.DatadocMetadata()
+         metadata = all_optional_model.DatadocMetadata()

-         metadata.dataset = model.Dataset(
+         metadata.dataset = all_optional_model.Dataset(
              short_name=dapla_dataset_path_info.dataset_short_name,
              dataset_state=dapla_dataset_path_info.dataset_state,
              dataset_status=DataSetStatus.DRAFT,
@@ -471,6 +375,19 @@
          """
          return dataset_path.parent / (dataset_path.stem + METADATA_DOCUMENT_FILE_SUFFIX)

+     def datadoc_model(self) -> all_optional_model.MetadataContainer:
+         """Return the underlying datadoc model."""
+         datadoc: ValidateDatadocMetadata = ValidateDatadocMetadata(
+             percentage_complete=self.percent_complete,
+             dataset=self.dataset,
+             variables=self.variables,
+         )
+         if self.container:
+             res = copy.deepcopy(self.container)
+             res.datadoc = datadoc
+             return res
+         return all_optional_model.MetadataContainer(datadoc=datadoc)
+
      def write_metadata_document(self) -> None:
          """Write all currently known metadata to file.

@@ -500,12 +417,15 @@
          if self.container:
              self.container.datadoc = datadoc
          else:
-             self.container = model.MetadataContainer(datadoc=datadoc)
+             self.container = all_optional_model.MetadataContainer(datadoc=datadoc)
          if self.metadata_document:
              content = self.container.model_dump_json(indent=4)
              self.metadata_document.write_text(content)
              logger.info("Saved metadata document %s", self.metadata_document)
-             logger.info("Metadata content:\n%s", content)
+             logger.info(
+                 "Metadata content",
+                 extra={"metadata_content": json.loads(content)},
+             )
          else:
              msg = "No metadata document to save"
              raise ValueError(msg)
@@ -525,3 +445,37 @@
              self.dataset,
          ) + num_obligatory_variables_fields_completed(self.variables)
          return calculate_percentage(num_set_fields, num_all_fields)
+
+     def add_pseudonymization(
+         self,
+         variable_short_name: str,
+         pseudonymization: all_optional_model.Pseudonymization | None = None,
+     ) -> None:
+         """Adds a new pseudo variable to the list of pseudonymized variables.
+
+         If `pseudonymization` is not supplied, an empty Pseudonymization structure
+         will be created and assigned to the variable.
+         If an encryption algorithm is recognized (one of the standard Dapla algorithms), default values are filled
+         for any missing fields.
+
+         Args:
+             variable_short_name: The short name for the variable that one wants to update the pseudo for.
+             pseudonymization: The updated pseudonymization.
+
+         """
+         variable = self.variables_lookup[variable_short_name]
+         if pseudonymization:
+             set_default_values_pseudonymization(variable, pseudonymization)
+         else:
+             variable.pseudonymization = all_optional_model.Pseudonymization()
+
+     def remove_pseudonymization(self, variable_short_name: str) -> None:
+         """Removes a pseudo variable by using the shortname.
+
+         Updates the pseudo variable lookup by creating a new one.
+
+         Args:
+             variable_short_name: The short name for the variable that one wants to remove the pseudo for.
+         """
+         if self.variables_lookup[variable_short_name].pseudonymization is not None:
+             self.variables_lookup[variable_short_name].pseudonymization = None
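
Continuing the earlier sketch, the two new methods above manage pseudonymization metadata per variable. "fnr" is an illustrative variable short name, not taken from the diff:

    # Attach an empty Pseudonymization structure to the variable, then fill in
    # fields via meta.variables_lookup["fnr"].pseudonymization as needed.
    meta.add_pseudonymization("fnr")

    # Setting the variable's pseudonymization back to None removes it again.
    meta.remove_pseudonymization("fnr")

    # Persist whatever is currently known to the metadata document.
    meta.write_metadata_document()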