dapla_toolbelt_metadata-0.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dapla-toolbelt-metadata might be problematic.
- dapla_toolbelt_metadata-0.1.1.dist-info/LICENSE +21 -0
- dapla_toolbelt_metadata-0.1.1.dist-info/METADATA +125 -0
- dapla_toolbelt_metadata-0.1.1.dist-info/RECORD +21 -0
- dapla_toolbelt_metadata-0.1.1.dist-info/WHEEL +4 -0
- dataset/__init__.py +11 -0
- dataset/code_list.py +244 -0
- dataset/config.py +151 -0
- dataset/core.py +543 -0
- dataset/dapla_dataset_path_info.py +685 -0
- dataset/dataset_parser.py +241 -0
- dataset/external_sources/__init__.py +1 -0
- dataset/external_sources/external_sources.py +87 -0
- dataset/model_backwards_compatibility.py +520 -0
- dataset/model_validation.py +188 -0
- dataset/py.typed +0 -0
- dataset/statistic_subject_mapping.py +182 -0
- dataset/user_info.py +88 -0
- dataset/utility/__init__.py +1 -0
- dataset/utility/constants.py +92 -0
- dataset/utility/enums.py +35 -0
- dataset/utility/utils.py +405 -0
dataset/utility/utils.py
ADDED
@@ -0,0 +1,405 @@
from __future__ import annotations

import datetime  # import is needed in xdoctest
import logging
import pathlib
import uuid

from cloudpathlib import CloudPath
from cloudpathlib import GSClient
from cloudpathlib import GSPath
from dapla import AuthClient
from datadoc_model import model
from datadoc_model.model import Assessment
from datadoc_model.model import DataSetState
from datadoc_model.model import VariableRole

from dataset.utility.constants import NUM_OBLIGATORY_VARIABLES_FIELDS
from dataset.utility.constants import OBLIGATORY_DATASET_METADATA_IDENTIFIERS
from dataset.utility.constants import (
    OBLIGATORY_DATASET_METADATA_IDENTIFIERS_MULTILANGUAGE,
)
from dataset.utility.constants import OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS
from dataset.utility.constants import (
    OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS_MULTILANGUAGE,
)

logger = logging.getLogger(__name__)


def get_timestamp_now() -> datetime.datetime:
    """Return a timestamp for the current moment."""
    return datetime.datetime.now(tz=datetime.timezone.utc)


def normalize_path(path: str) -> pathlib.Path | CloudPath:
    """Obtain a pathlib compatible Path.

    Obtains a pathlib compatible Path regardless of whether the file is on a filesystem or in GCS.

    Args:
        path: Path on a filesystem or in cloud storage.

    Returns:
        Pathlib compatible object.
    """
    if path.startswith(GSPath.cloud_prefix):
        client = GSClient(credentials=AuthClient.fetch_google_credentials())
        return GSPath(path, client=client)
    return pathlib.Path(path)


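# --- Editor's illustration (not part of the released file) ---
# A minimal sketch of how normalize_path dispatches on the path prefix.
# The GCS branch requires Dapla credentials, so only the local branch is
# exercised here; the repr assumes a POSIX system, and the bucket name in
# the commented call is hypothetical.
#
# >>> normalize_path("/tmp/data.parquet")
# PosixPath('/tmp/data.parquet')
# >>> # normalize_path("gs://hypothetical-bucket/data.parquet")  # would return a GSPath

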
def calculate_percentage(completed: int, total: int) -> int:
    """Calculate percentage as a rounded integer.

    Args:
        completed: The number of completed items.
        total: The total number of items.

    Returns:
        The rounded percentage of completed items out of the total.
    """
    return round((completed / total) * 100)


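# --- Editor's illustration (not part of the released file) ---
# Worked example: Python's round() uses banker's rounding, so 2 of 3
# completed fields give round(66.66...) == 67, while 1 of 8 gives
# round(12.5) == 12 (rounds to the nearest even integer).
#
# >>> calculate_percentage(2, 3)
# 67
# >>> calculate_percentage(1, 8)
# 12

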
def derive_assessment_from_state(state: DataSetState) -> Assessment:
    """Derive assessment from dataset state.

    Args:
        state: The state of the dataset.

    Returns:
        The derived assessment of the dataset.
    """
    match (state):
        case (
            DataSetState.INPUT_DATA
            | DataSetState.PROCESSED_DATA
            | DataSetState.STATISTICS
        ):
            return Assessment.PROTECTED
        case DataSetState.OUTPUT_DATA:
            return Assessment.OPEN
        case DataSetState.SOURCE_DATA:
            return Assessment.SENSITIVE


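# --- Editor's illustration (not part of the released file) ---
# The state-to-assessment mapping above, exercised directly. Note that a
# state not matched by any case (should the model define more) falls
# through, and the function then implicitly returns None despite the
# declared Assessment return type.
#
# >>> derive_assessment_from_state(DataSetState.SOURCE_DATA) == Assessment.SENSITIVE
# True
# >>> derive_assessment_from_state(DataSetState.OUTPUT_DATA) == Assessment.OPEN
# True

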
def set_default_values_variables(variables: list) -> None:
    """Set default values on variables.

    Args:
        variables: A list of variable objects to set default values on.

    Example:
        >>> variables = [model.Variable(short_name="pers",id=None, is_personal_data = None), model.Variable(short_name="fnr",id='9662875c-c245-41de-b667-12ad2091a1ee', is_personal_data='PSEUDONYMISED_ENCRYPTED_PERSONAL_DATA')]
        >>> set_default_values_variables(variables)
        >>> isinstance(variables[0].id, uuid.UUID)
        True

        >>> variables[1].is_personal_data == 'PSEUDONYMISED_ENCRYPTED_PERSONAL_DATA'
        True

        >>> variables[0].is_personal_data == 'NOT_PERSONAL_DATA'
        True
    """
    for v in variables:
        if v.id is None:
            v.id = uuid.uuid4()
        if v.is_personal_data is None:
            v.is_personal_data = model.IsPersonalData.NOT_PERSONAL_DATA
        if v.variable_role is None:
            v.variable_role = VariableRole.MEASURE


def set_default_values_dataset(dataset: model.Dataset) -> None:
    """Set default values on dataset.

    Args:
        dataset: The dataset object to set default values on.

    Example:
        >>> dataset = model.Dataset(id=None, contains_personal_data=None)
        >>> set_default_values_dataset(dataset)
        >>> dataset.id is not None
        True

        >>> dataset.contains_personal_data == False
        True
    """
    if not dataset.id:
        dataset.id = uuid.uuid4()
    if dataset.contains_personal_data is None:
        dataset.contains_personal_data = False


def set_variables_inherit_from_dataset(
    dataset: model.Dataset,
    variables: list,
) -> None:
    """Set specific dataset values on a list of variable objects.

    This function populates 'data source', 'temporality type', 'contains data from',
    and 'contains data until' fields in each variable if they are not set (None).
    The values are inherited from the corresponding fields in the dataset.

    Args:
        dataset: The dataset object from which to inherit values.
        variables: A list of variable objects to update with dataset values.

    Example:
        >>> dataset = model.Dataset(short_name='person_data_v1',data_source='01',temporality_type='STATUS',id='9662875c-c245-41de-b667-12ad2091a1ee',contains_data_from="2010-09-05",contains_data_until="2022-09-05")
        >>> variables = [model.Variable(short_name="pers",data_source =None,temporality_type = None, contains_data_from = None,contains_data_until = None)]
        >>> set_variables_inherit_from_dataset(dataset, variables)
        >>> variables[0].data_source == dataset.data_source
        True

        >>> variables[0].temporality_type is None
        False

        >>> variables[0].contains_data_from == dataset.contains_data_from
        True

        >>> variables[0].contains_data_until == dataset.contains_data_until
        True
    """
    for v in variables:
        v.contains_data_from = v.contains_data_from or dataset.contains_data_from
        v.contains_data_until = v.contains_data_until or dataset.contains_data_until
        v.temporality_type = v.temporality_type or dataset.temporality_type
        v.data_source = v.data_source or dataset.data_source


def incorrect_date_order(
    date_from: datetime.date | None,
    date_until: datetime.date | None,
) -> bool:
    """Evaluate the chronological order of two dates.

    This function checks if 'date until' is earlier than 'date from'. If so, it
    indicates an incorrect date order.

    Args:
        date_from: The start date of the time period.
        date_until: The end date of the time period.

    Returns:
        True if 'date_until' is earlier than 'date_from' or if only 'date_from' is None, False otherwise.

    Example:
        >>> incorrect_date_order(datetime.date(1980, 1, 1), datetime.date(1967, 1, 1))
        True

        >>> incorrect_date_order(datetime.date(1967, 1, 1), datetime.date(1980, 1, 1))
        False

        >>> incorrect_date_order(None, datetime.date(2024,7,1))
        True
    """
    if date_from is None and date_until is not None:
        return True
    return date_from is not None and date_until is not None and date_until < date_from


def _is_missing_multilanguage_value(
    field_name: str,
    field_value,  # noqa: ANN001 Skip type hint to enable dynamically handling value for LanguageStringType not indexable
    obligatory_list: list,
) -> bool:
    """Check obligatory fields with multilanguage value.

    This function checks whether a given field, which is supposed to have
    multilanguage values, is missing values in all specified languages.

    Args:
        field_name: The name of the field to check.
        field_value: The value of the field. Expected to be of type LanguageStringType.
        obligatory_list: A list of obligatory field names that should have multilanguage values.

    Returns:
        True if there is no value in any of the languages for the field, False otherwise.
    """
    return bool(
        field_name in obligatory_list
        and field_value
        and (
            len(field_value[0]) > 0
            and not field_value[0]["languageText"]
            and (len(field_value) <= 1 or not field_value[1]["languageText"])
            and (
                len(field_value) <= 2  # noqa: PLR2004 approve magic value
                or not field_value[2]["languageText"]
            )
        ),
    )


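# --- Editor's illustration (not part of the released file) ---
# Sketch of the dumped LanguageStringType shape this check expects: a list
# of {"languageCode": ..., "languageText": ...} dicts, one entry per
# language. The field name, obligatory list, and language codes below are
# hypothetical, chosen only to exercise the branches above.
#
# >>> value = [
# ...     {"languageCode": "nb", "languageText": ""},
# ...     {"languageCode": "nn", "languageText": ""},
# ...     {"languageCode": "en", "languageText": ""},
# ... ]
# >>> _is_missing_multilanguage_value("name", value, ["name"])
# True
# >>> value[0]["languageText"] = "Inntekt"
# >>> _is_missing_multilanguage_value("name", value, ["name"])
# False

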
def _is_missing_metadata(
    field_name: str,
    field_value,  # noqa: ANN001 Skip type hint because method '_is_missing_multilanguage_value'
    obligatory_list: list,
    obligatory_multi_language_list: list,
) -> bool:
    """Check if an obligatory field is missing its value.

    This function checks whether a given field, which may be a simple string or a
    multilanguage value, is missing its value. It considers two lists: one for
    obligatory fields and another for obligatory multilanguage fields.

    Args:
        field_name: The name of the field to check.
        field_value: The value of the field. Can be of type str, or LanguageStringType for
            multilanguage fields.
        obligatory_list: List of obligatory fields.
        obligatory_multi_language_list: List of obligatory fields with multilanguage
            values.

    Returns:
        True if the field doesn't have a value, False otherwise.
    """
    return bool(
        field_name in obligatory_list
        and field_value is None
        or _is_missing_multilanguage_value(
            field_name,
            field_value,
            obligatory_multi_language_list,
        ),
    )


def num_obligatory_dataset_fields_completed(dataset: model.Dataset) -> int:
    """Count the number of completed obligatory dataset fields.

    This function returns the total count of obligatory fields in the dataset that
    have values (are not None).

    Args:
        dataset: The dataset object for which to count the fields.

    Returns:
        The number of obligatory dataset fields that have been completed (not None).
    """
    return len(OBLIGATORY_DATASET_METADATA_IDENTIFIERS) - len(
        get_missing_obligatory_dataset_fields(dataset),
    )


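# --- Editor's illustration (not part of the released file) ---
# The completed count is derived by subtraction: with a hypothetical
# obligatory-identifier list of 16 entries and 3 fields reported missing
# by get_missing_obligatory_dataset_fields, this returns 16 - 3 == 13.

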
def num_obligatory_variables_fields_completed(variables: list) -> int:
    """Count the number of obligatory fields completed for all variables.

    This function calculates the total number of obligatory fields that have
    values (are not None) across all variables in the list.

    Args:
        variables: A list with variable objects.

    Returns:
        The total number of obligatory variable fields that have been completed
        (not None) for all variables.
    """
    num_completed = 0
    for v in variables:
        num_completed += num_obligatory_variable_fields_completed(v)
    return num_completed


def num_obligatory_variable_fields_completed(variable: model.Variable) -> int:
    """Count the number of obligatory fields completed for one variable.

    This function calculates the total number of obligatory fields that have
    values (are not None) for a single variable.

    Args:
        variable: The variable to count obligatory fields for.

    Returns:
        The total number of obligatory variable fields that have been completed
        (not None) for one variable.
    """
    missing_metadata = [
        key
        for key, value in variable.model_dump().items()
        if _is_missing_metadata(
            key,
            value,
            OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS,
            OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS_MULTILANGUAGE,
        )
    ]
    return NUM_OBLIGATORY_VARIABLES_FIELDS - len(missing_metadata)


def get_missing_obligatory_dataset_fields(dataset: model.Dataset) -> list:
    """Identify all obligatory dataset fields that are missing values.

    This function checks for obligatory fields that are either directly missing
    (i.e., set to `None`) or have multilanguage values with empty content.

    Args:
        dataset: The dataset object to examine. This object must support the
            `model_dump()` method which returns a dictionary of field names and
            values.

    Returns:
        A list of field names (as strings) that are missing values. This includes:
        - Fields that are directly `None` and are listed as obligatory metadata.
        - Multilanguage fields (listed as obligatory metadata) where the value
          exists but none of the languages contain any text.
    """
    return [
        key
        for key, value in dataset.model_dump().items()
        if _is_missing_metadata(
            key,
            value,
            OBLIGATORY_DATASET_METADATA_IDENTIFIERS,
            OBLIGATORY_DATASET_METADATA_IDENTIFIERS_MULTILANGUAGE,
        )
    ]


def get_missing_obligatory_variables_fields(variables: list) -> list[dict]:
    """Identify obligatory variable fields that are missing values for each variable.

    This function checks for obligatory fields that are either directly missing
    (i.e., set to `None`) or have multilanguage values with empty content.

    Args:
        variables: A list of variable objects to check for missing obligatory fields.

    Returns:
        A list of dictionaries with variable short names as keys and lists of missing
        obligatory variable fields as values. This includes:
        - Fields that are directly `None` and are listed as obligatory metadata.
        - Multilanguage fields (listed as obligatory metadata) where the value
          exists but none of the languages contain any text.
    """
    missing_variables_fields = [
        {
            variable.short_name: [
                key
                for key, value in variable.model_dump().items()
                if _is_missing_metadata(
                    key,
                    value,
                    OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS,
                    OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS_MULTILANGUAGE,
                )
            ],
        }
        for variable in variables
    ]
    # Filter out variables whose list of missing fields is empty
    return [item for item in missing_variables_fields if next(iter(item.values()))]


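# --- Editor's illustration (not part of the released file) ---
# Shape of the return value, assuming a variable short-named "pers" is
# missing hypothetical obligatory fields "name" and "data_type"; variables
# with nothing missing are filtered out entirely:
#
#     [{"pers": ["name", "data_type"]}]

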
def running_in_notebook() -> bool:
    """Return True if running in Jupyter Notebook."""
    try:
        return bool(get_ipython().__class__.__name__ == "ZMQInteractiveShell")  # type: ignore [name-defined]
    except NameError:
        # The get_ipython method is globally available in ipython interpreters
        # as used in Jupyter. However, it is not available in other python
        # interpreters and will throw a NameError. Therefore we're not running
        # in Jupyter.
        return False
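# --- Editor's illustration (not part of the released file) ---
# A minimal sketch combining the helpers above to report dataset metadata
# completeness; `my_dataset` is a hypothetical, already-populated
# model.Dataset instance.
#
# >>> completed = num_obligatory_dataset_fields_completed(my_dataset)
# >>> calculate_percentage(completed, len(OBLIGATORY_DATASET_METADATA_IDENTIFIERS))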