dapla-toolbelt-metadata 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dapla-toolbelt-metadata might be problematic.

dataset/core.py ADDED
@@ -0,0 +1,543 @@
+ """Handle reading, updating and writing of metadata."""
+
+ from __future__ import annotations
+
+ import copy
+ import json
+ import logging
+ import warnings
+ from concurrent.futures import ThreadPoolExecutor
+ from pathlib import Path
+ from typing import TYPE_CHECKING
+
+ from datadoc_model import model
+ from datadoc_model.model import DataSetStatus
+
+ from dataset import config
+ from dataset import user_info
+ from dataset.dapla_dataset_path_info import DaplaDatasetPathInfo
+ from dataset.dataset_parser import DatasetParser
+ from dataset.model_backwards_compatibility import is_metadata_in_container_structure
+ from dataset.model_backwards_compatibility import upgrade_metadata
+ from dataset.model_validation import ValidateDatadocMetadata
+ from dataset.statistic_subject_mapping import StatisticSubjectMapping
+ from dataset.utility.constants import DATASET_FIELDS_FROM_EXISTING_METADATA
+ from dataset.utility.constants import DEFAULT_SPATIAL_COVERAGE_DESCRIPTION
+ from dataset.utility.constants import INCONSISTENCIES_MESSAGE
+ from dataset.utility.constants import METADATA_DOCUMENT_FILE_SUFFIX
+ from dataset.utility.constants import NUM_OBLIGATORY_DATASET_FIELDS
+ from dataset.utility.constants import NUM_OBLIGATORY_VARIABLES_FIELDS
+ from dataset.utility.utils import calculate_percentage
+ from dataset.utility.utils import derive_assessment_from_state
+ from dataset.utility.utils import get_timestamp_now
+ from dataset.utility.utils import normalize_path
+ from dataset.utility.utils import num_obligatory_dataset_fields_completed
+ from dataset.utility.utils import num_obligatory_variables_fields_completed
+ from dataset.utility.utils import set_default_values_dataset
+ from dataset.utility.utils import set_default_values_variables
+
+ if TYPE_CHECKING:
+     import pathlib
+     from datetime import datetime
+
+     from cloudpathlib import CloudPath
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class InconsistentDatasetsWarning(UserWarning):
+     """Existing and new datasets differ significantly from one another."""
+
+
+ class InconsistentDatasetsError(ValueError):
+     """Existing and new datasets differ significantly from one another."""
+
+
+ class Datadoc:
+     """Handle reading, updating and writing of metadata.
+
+     If a metadata document exists, it is this information that is loaded. Nothing
+     is inferred from the dataset. If only a dataset path is supplied, the metadata
+     document path is built based on the dataset path.
+
+     Example: /path/to/dataset.parquet -> /path/to/dataset__DOC.json
+
+     Attributes:
+         dataset_path: A file path to where the dataset is stored.
+         metadata_document_path: A path to a metadata document if it exists.
+         statistic_subject_mapping: An instance of StatisticSubjectMapping.
+     """
+
+     def __init__(
+         self,
+         dataset_path: str | None = None,
+         metadata_document_path: str | None = None,
+         statistic_subject_mapping: StatisticSubjectMapping | None = None,
+         *,
+         errors_as_warnings: bool = False,
+     ) -> None:
+         """Initialize the Datadoc instance.
+
+         If a dataset path is supplied, it attempts to locate and load the
+         corresponding metadata document. If no dataset path is provided, the class
+         is instantiated without loading any metadata.
+
+         Args:
+             dataset_path: The file path to the dataset. Defaults to None.
+             metadata_document_path: The file path to the metadata document.
+                 Defaults to None.
+             statistic_subject_mapping: An instance of StatisticSubjectMapping.
+                 Defaults to None.
+             errors_as_warnings: Disable raising exceptions if inconsistencies
+                 are found between existing and extracted metadata.
+         """
+         self._statistic_subject_mapping = statistic_subject_mapping
+         self.errors_as_warnings = errors_as_warnings
+         self.metadata_document: pathlib.Path | CloudPath | None = None
+         self.container: model.MetadataContainer | None = None
+         self.dataset_path: pathlib.Path | CloudPath | None = None
+         self.dataset = model.Dataset()
+         self.variables: list = []
+         self.variables_lookup: dict[str, model.Variable] = {}
+         self.explicitly_defined_metadata_document = False
+         if metadata_document_path:
+             self.metadata_document = normalize_path(metadata_document_path)
+             self.explicitly_defined_metadata_document = True
+             if not self.metadata_document.exists():
+                 msg = f"Metadata document does not exist! Provided path: {self.metadata_document}"
+                 raise ValueError(
+                     msg,
+                 )
+         if dataset_path:
+             self.dataset_path = normalize_path(dataset_path)
+             if not metadata_document_path:
+                 self.metadata_document = self.build_metadata_document_path(
+                     self.dataset_path,
+                 )
+         if metadata_document_path or dataset_path:
+             self._extract_metadata_from_files()
+
+     def _extract_metadata_from_files(self) -> None:
+         """Read metadata from an existing metadata document or create one.
+
+         If a metadata document exists, it reads and extracts metadata from it.
+         If no metadata document is found, it creates metadata from scratch by
+         extracting information from the dataset file.
+
+         This method ensures that:
+         - Metadata is extracted from an existing document if available.
+         - If metadata is not available, it is extracted from the dataset file.
+         - The dataset ID is set if not already present.
+         - Default values are set for variables, particularly the variable role on
+           creation.
+         - Default values for variable IDs and 'is_personal_data' are set if the
+           values are None.
+         - The 'contains_personal_data' attribute is set to False if not specified.
+         - A lookup dictionary for variables is created based on their short names.
+         """
+         extracted_metadata: model.DatadocMetadata | None = None
+         existing_metadata: model.DatadocMetadata | None = None
+         if self.metadata_document is not None and self.metadata_document.exists():
+             existing_metadata = self._extract_metadata_from_existing_document(
+                 self.metadata_document,
+             )
+         if (
+             self.dataset_path is not None
+             and self.dataset == model.Dataset()
+             and len(self.variables) == 0
+         ):
+             extracted_metadata = self._extract_metadata_from_dataset(self.dataset_path)
+
+         if (
+             self.dataset_path
+             and self.explicitly_defined_metadata_document
+             and self.metadata_document is not None
+             and self.metadata_document.exists()
+             and extracted_metadata is not None
+             and existing_metadata is not None
+         ):
+             if (
+                 extracted_metadata.dataset is not None
+                 and extracted_metadata.dataset.file_path is not None
+             ):
+                 existing_file_path = extracted_metadata.dataset.file_path
+             else:
+                 msg = "Could not access existing dataset file path"
+                 raise ValueError(msg)
+             self._check_ready_to_merge(
+                 self.dataset_path,
+                 Path(existing_file_path),
+                 extracted_metadata,
+                 existing_metadata,
+                 errors_as_warnings=self.errors_as_warnings,
+             )
+             merged_metadata = self._merge_metadata(
+                 extracted_metadata,
+                 existing_metadata,
+             )
+             # We need to override this so that the document gets saved to the correct
+             # location, otherwise we would overwrite the existing document!
+             self.metadata_document = self.build_metadata_document_path(
+                 self.dataset_path,
+             )
+             if merged_metadata.dataset and merged_metadata.variables:
+                 self.dataset = merged_metadata.dataset
+                 self.variables = merged_metadata.variables
+             else:
+                 msg = "Could not read metadata"
+                 raise ValueError(msg)
+         elif (
+             existing_metadata
+             and existing_metadata.dataset
+             and existing_metadata.variables
+         ):
+             self.dataset = existing_metadata.dataset
+             self.variables = existing_metadata.variables
+         elif (
+             extracted_metadata
+             and extracted_metadata.dataset
+             and extracted_metadata.variables
+         ):
+             self.dataset = extracted_metadata.dataset
+             self.variables = extracted_metadata.variables
+         else:
+             msg = "Could not read metadata"
+             raise ValueError(msg)
+         set_default_values_variables(self.variables)
+         set_default_values_dataset(self.dataset)
+         self.variables_lookup = {
+             v.short_name: v for v in self.variables if v.short_name
+         }
+
+     @staticmethod
+     def _check_ready_to_merge(
+         new_dataset_path: Path | CloudPath,
+         existing_dataset_path: Path,
+         extracted_metadata: model.DatadocMetadata,
+         existing_metadata: model.DatadocMetadata,
+         *,
+         errors_as_warnings: bool,
+     ) -> None:
+         """Check if the datasets are consistent enough to make a successful merge of metadata.
+
+         Args:
+             new_dataset_path: Path to the dataset to be documented.
+             existing_dataset_path: Path stored in the existing metadata.
+             extracted_metadata: Metadata extracted from a physical dataset.
+             existing_metadata: Metadata from a previously created metadata document.
+             errors_as_warnings: True if failing checks should be raised as warnings, not errors.
+
+         Raises:
+             InconsistentDatasetsError: If inconsistencies are found and `errors_as_warnings == False`
+         """
+         new_dataset_path_info = DaplaDatasetPathInfo(new_dataset_path)
+         existing_dataset_path_info = DaplaDatasetPathInfo(existing_dataset_path)
+         results = [
+             {
+                 "name": "Bucket name",
+                 "success": (
+                     new_dataset_path_info.bucket_name
+                     == existing_dataset_path_info.bucket_name
+                 ),
+             },
+             {
+                 "name": "Data product name",
+                 "success": (
+                     new_dataset_path_info.statistic_short_name
+                     == existing_dataset_path_info.statistic_short_name
+                 ),
+             },
+             {
+                 "name": "Dataset state",
+                 "success": (
+                     new_dataset_path_info.dataset_state
+                     == existing_dataset_path_info.dataset_state
+                 ),
+             },
+             {
+                 "name": "Dataset short name",
+                 "success": (
+                     new_dataset_path_info.dataset_short_name
+                     == existing_dataset_path_info.dataset_short_name
+                 ),
+             },
+             {
+                 "name": "Variable names",
+                 "success": (
+                     {v.short_name for v in extracted_metadata.variables or []}
+                     == {v.short_name for v in existing_metadata.variables or []}
+                 ),
+             },
+             {
+                 "name": "Variable datatypes",
+                 "success": (
+                     [v.data_type for v in extracted_metadata.variables or []]
+                     == [v.data_type for v in existing_metadata.variables or []]
+                 ),
+             },
+         ]
+         if failures := [result for result in results if not result["success"]]:
+             msg = f"{INCONSISTENCIES_MESSAGE} {', '.join(str(f['name']) for f in failures)}"
+             if errors_as_warnings:
+                 warnings.warn(
+                     message=msg,
+                     category=InconsistentDatasetsWarning,
+                     stacklevel=2,
+                 )
+             else:
+                 raise InconsistentDatasetsError(
+                     msg,
+                 )
+
+     @staticmethod
+     def _merge_metadata(
+         extracted_metadata: model.DatadocMetadata | None,
+         existing_metadata: model.DatadocMetadata | None,
+     ) -> model.DatadocMetadata:
+         """Merge extracted and existing metadata into a single model.
+
+         The extracted metadata is used as a base; selected dataset fields and
+         variables matched by short name are overridden with values from the
+         existing metadata document.
+         """
+         if not existing_metadata:
+             logger.warning(
+                 "No existing metadata found, no merge to perform. Continuing with extracted metadata.",
+             )
+             return extracted_metadata or model.DatadocMetadata()
+         if not extracted_metadata:
+             return existing_metadata
+         # Use the extracted metadata as a base
+         merged_metadata = model.DatadocMetadata(
+             dataset=copy.deepcopy(extracted_metadata.dataset),
+             variables=[],
+         )
+         if (
+             merged_metadata.dataset is not None
+             and existing_metadata.dataset is not None
+         ):
+             # Override the fields as defined
+             for field in DATASET_FIELDS_FROM_EXISTING_METADATA:
+                 setattr(
+                     merged_metadata.dataset,
+                     field,
+                     getattr(existing_metadata.dataset, field),
+                 )
+
+         # Merge variables.
+         # For each extracted variable, copy existing metadata into the merged metadata
+         if (
+             existing_metadata.variables is not None
+             and extracted_metadata is not None
+             and extracted_metadata.variables is not None
+             and merged_metadata.variables is not None
+         ):
+             for extracted in extracted_metadata.variables:
+                 existing = next(
+                     (
+                         existing
+                         for existing in existing_metadata.variables
+                         if existing.short_name == extracted.short_name
+                     ),
+                     None,
+                 )
+                 if existing:
+                     existing.id = None  # Set to None so that it will be assigned a fresh ID later
+                     existing.contains_data_from = (
+                         extracted.contains_data_from or existing.contains_data_from
+                     )
+                     existing.contains_data_until = (
+                         extracted.contains_data_until or existing.contains_data_until
+                     )
+                     merged_metadata.variables.append(existing)
+                 else:
+                     # If there is no existing metadata for this variable, we just use what we have extracted
+                     merged_metadata.variables.append(extracted)
+         return merged_metadata
+
+     def _extract_metadata_from_existing_document(
+         self,
+         document: pathlib.Path | CloudPath,
+     ) -> model.DatadocMetadata | None:
+         """Read metadata from an existing metadata document.
+
+         If an existing metadata document is available, this method reads and
+         loads the metadata from it. It validates and upgrades the metadata as
+         necessary. If we have read in a file with an empty "datadoc" structure
+         the process ends.
+         A typical example causing an empty datadoc is a file produced from a
+         pseudonymization process.
+
+         Args:
+             document: A path to the existing metadata document.
+
+         Returns:
+             The metadata read from the document, or None if the "datadoc"
+             structure is empty or the document cannot be parsed.
+         """
+         fresh_metadata = {}
+         try:
+             with document.open(mode="r", encoding="utf-8") as file:
+                 fresh_metadata = json.load(file)
+             logger.info("Opened existing metadata file %s", document)
+             fresh_metadata = upgrade_metadata(
+                 fresh_metadata,
+             )
+             if is_metadata_in_container_structure(fresh_metadata):
+                 self.container = model.MetadataContainer.model_validate_json(
+                     json.dumps(fresh_metadata),
+                 )
+                 datadoc_metadata = fresh_metadata["datadoc"]
+             else:
+                 datadoc_metadata = fresh_metadata
+             if datadoc_metadata is None:
+                 return None
+             return model.DatadocMetadata.model_validate_json(
+                 json.dumps(datadoc_metadata),
+             )
+         except json.JSONDecodeError:
+             logger.warning(
+                 "Could not open existing metadata file %s. \
+                 Falling back to collecting data from the dataset",
+                 document,
+                 exc_info=True,
+             )
+             return None
+
+     def _extract_subject_field_from_path(
+         self,
+         dapla_dataset_path_info: DaplaDatasetPathInfo,
+     ) -> str | None:
+         """Extract the statistic short name from the dataset file path.
+
+         Map the extracted statistic short name to its corresponding statistical
+         subject.
+
+         Args:
+             dapla_dataset_path_info: The object representing the decomposed file
+                 path.
+
+         Returns:
+             The code for the statistical subject or None if we couldn't map to one.
+         """
+         if self._statistic_subject_mapping is None:
+             with ThreadPoolExecutor(max_workers=12) as executor:
+                 return StatisticSubjectMapping(
+                     executor,
+                     config.get_statistical_subject_source_url(),
+                 ).get_secondary_subject(
+                     dapla_dataset_path_info.statistic_short_name,
+                 )
+         else:
+             return self._statistic_subject_mapping.get_secondary_subject(
+                 dapla_dataset_path_info.statistic_short_name,
+             )
+
+     def _extract_metadata_from_dataset(
+         self,
+         dataset: pathlib.Path | CloudPath,
+     ) -> model.DatadocMetadata:
+         """Obtain what metadata we can from the dataset itself.
+
+         This makes it easier for the user by 'pre-filling' certain fields.
+         Certain elements are dependent on the dataset being saved according
+         to SSB's standard.
+
+         Args:
+             dataset: The path to the dataset file, which can be a local or
+                 cloud path.
+
+         Returns:
+             A DatadocMetadata instance with dataset fields pre-filled from the
+             dataset path and variables extracted from the dataset schema.
+         """
+         dapla_dataset_path_info = DaplaDatasetPathInfo(dataset)
+         metadata = model.DatadocMetadata()
+
+         metadata.dataset = model.Dataset(
+             short_name=dapla_dataset_path_info.dataset_short_name,
+             dataset_state=dapla_dataset_path_info.dataset_state,
+             dataset_status=DataSetStatus.DRAFT,
+             assessment=(
+                 derive_assessment_from_state(
+                     dapla_dataset_path_info.dataset_state,
+                 )
+                 if dapla_dataset_path_info.dataset_state is not None
+                 else None
+             ),
+             version=dapla_dataset_path_info.dataset_version,
+             contains_data_from=dapla_dataset_path_info.contains_data_from,
+             contains_data_until=dapla_dataset_path_info.contains_data_until,
+             file_path=str(self.dataset_path),
+             metadata_created_by=user_info.get_user_info_for_current_platform().short_email,
+             subject_field=self._extract_subject_field_from_path(
+                 dapla_dataset_path_info,
+             ),
+             spatial_coverage_description=DEFAULT_SPATIAL_COVERAGE_DESCRIPTION,
+         )
+         metadata.variables = DatasetParser.for_file(dataset).get_fields()
+         return metadata
+
+     @staticmethod
+     def build_metadata_document_path(
+         dataset_path: pathlib.Path | CloudPath,
+     ) -> pathlib.Path | CloudPath:
+         """Build the path to the metadata document corresponding to the given dataset.
+
+         Args:
+             dataset_path: Path to the dataset we wish to create metadata for.
+         """
+         return dataset_path.parent / (dataset_path.stem + METADATA_DOCUMENT_FILE_SUFFIX)
+
+     def write_metadata_document(self) -> None:
+         """Write all currently known metadata to file.
+
+         Side Effects:
+             - Updates the dataset's metadata_last_updated_date and
+               metadata_last_updated_by attributes.
+             - Updates the dataset's file_path attribute.
+             - Validates the metadata model and stores it in a MetadataContainer.
+             - Writes the validated metadata to a file if the metadata_document
+               attribute is set.
+             - Logs the action and the content of the metadata document.
+
+         Raises:
+             ValueError: If no metadata document is specified for saving.
+         """
+         timestamp: datetime = get_timestamp_now()
+         self.dataset.metadata_last_updated_date = timestamp
+         self.dataset.metadata_last_updated_by = (
+             user_info.get_user_info_for_current_platform().short_email
+         )
+         self.dataset.file_path = str(self.dataset_path)
+         datadoc: ValidateDatadocMetadata = ValidateDatadocMetadata(
+             percentage_complete=self.percent_complete,
+             dataset=self.dataset,
+             variables=self.variables,
+         )
+         if self.container:
+             self.container.datadoc = datadoc
+         else:
+             self.container = model.MetadataContainer(datadoc=datadoc)
+         if self.metadata_document:
+             content = self.container.model_dump_json(indent=4)
+             self.metadata_document.write_text(content)
+             logger.info("Saved metadata document %s", self.metadata_document)
+             logger.info("Metadata content:\n%s", content)
+         else:
+             msg = "No metadata document to save"
+             raise ValueError(msg)
+
+     @property
+     def percent_complete(self) -> int:
+         """The percentage of obligatory metadata completed.
+
+         A metadata field is counted as complete when any non-None value is
+         assigned. Used for a live progress bar in the UI, as well as being
+         saved in the datadoc as a simple quality indicator.
+         """
+         num_all_fields = NUM_OBLIGATORY_DATASET_FIELDS + (
+             NUM_OBLIGATORY_VARIABLES_FIELDS * len(self.variables)
+         )
+         num_set_fields = num_obligatory_dataset_fields_completed(
+             self.dataset,
+         ) + num_obligatory_variables_fields_completed(self.variables)
+         return calculate_percentage(num_set_fields, num_all_fields)
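
For orientation, a minimal usage sketch of the Datadoc class added above. The dataset path is a placeholder, and the import path assumes the dataset package layout shown in this diff:

    from dataset.core import Datadoc

    # Load or create metadata: if /path/to/dataset__DOC.json exists it is read,
    # otherwise metadata is pre-filled from the dataset file and its path.
    meta = Datadoc(dataset_path="/path/to/dataset.parquet")

    # Share of obligatory dataset and variable fields currently filled in.
    print(meta.percent_complete)

    # Variables extracted from the dataset schema, keyed by short name.
    print(list(meta.variables_lookup))

    # Validate and write the document next to the dataset: /path/to/dataset__DOC.json
    meta.write_metadata_document()

Note that write_metadata_document() raises a ValueError if the instance was created without any path, since there is then no metadata document location to write to.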