dapla-toolbelt-metadata 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dapla-toolbelt-metadata might be problematic. Click here for more details.

@@ -0,0 +1,405 @@
1
+ from __future__ import annotations
2
+
3
+ import datetime # import is needed in xdoctest
4
+ import logging
5
+ import pathlib
6
+ import uuid
7
+
8
+ from cloudpathlib import CloudPath
9
+ from cloudpathlib import GSClient
10
+ from cloudpathlib import GSPath
11
+ from dapla import AuthClient
12
+ from datadoc_model import model
13
+ from datadoc_model.model import Assessment
14
+ from datadoc_model.model import DataSetState
15
+ from datadoc_model.model import VariableRole
16
+
17
+ from dataset.utility.constants import NUM_OBLIGATORY_VARIABLES_FIELDS
18
+ from dataset.utility.constants import OBLIGATORY_DATASET_METADATA_IDENTIFIERS
19
+ from dataset.utility.constants import (
20
+ OBLIGATORY_DATASET_METADATA_IDENTIFIERS_MULTILANGUAGE,
21
+ )
22
+ from dataset.utility.constants import OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS
23
+ from dataset.utility.constants import (
24
+ OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS_MULTILANGUAGE,
25
+ )
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
def get_timestamp_now() -> datetime.datetime:
    """Return a timezone-aware timestamp (UTC) for the current moment."""
    utc = datetime.timezone.utc
    return datetime.datetime.now(tz=utc)
33
+
34
+
35
def normalize_path(path: str) -> pathlib.Path | CloudPath:
    """Obtain a pathlib compatible Path.

    Obtains a pathlib compatible Path regardless of whether the file is on a
    local filesystem or in GCS.

    Args:
        path: Path on a filesystem or in cloud storage.

    Returns:
        Pathlib compatible object.
    """
    if not path.startswith(GSPath.cloud_prefix):
        return pathlib.Path(path)
    # GCS paths require an authenticated client before they can be used.
    credentials = AuthClient.fetch_google_credentials()
    return GSPath(path, client=GSClient(credentials=credentials))
50
+
51
+
52
def calculate_percentage(completed: int, total: int) -> int:
    """Calculate percentage as a rounded integer.

    Args:
        completed: The number of completed items.
        total: The total number of items.

    Returns:
        The rounded percentage of completed items out of the total.
    """
    fraction = completed / total
    return round(fraction * 100)
63
+
64
+
65
def derive_assessment_from_state(state: DataSetState) -> Assessment:
    """Derive assessment from dataset state.

    Args:
        state: The state of the dataset.

    Returns:
        The derived assessment of the dataset.
    """
    protected_states = (
        DataSetState.INPUT_DATA,
        DataSetState.PROCESSED_DATA,
        DataSetState.STATISTICS,
    )
    if state in protected_states:
        return Assessment.PROTECTED
    if state == DataSetState.OUTPUT_DATA:
        return Assessment.OPEN
    if state == DataSetState.SOURCE_DATA:
        return Assessment.SENSITIVE
    # NOTE(review): mirrors the original match statement, which implicitly
    # returned None for any state not handled above.
    return None
85
+
86
+
87
def set_default_values_variables(variables: list) -> None:
    """Set default values on variables.

    Args:
        variables: A list of variable objects to set default values on.

    Example:
        >>> variables = [model.Variable(short_name="pers",id=None, is_personal_data = None), model.Variable(short_name="fnr",id='9662875c-c245-41de-b667-12ad2091a1ee', is_personal_data='PSEUDONYMISED_ENCRYPTED_PERSONAL_DATA')]
        >>> set_default_values_variables(variables)
        >>> isinstance(variables[0].id, uuid.UUID)
        True

        >>> variables[1].is_personal_data == 'PSEUDONYMISED_ENCRYPTED_PERSONAL_DATA'
        True

        >>> variables[0].is_personal_data == 'NOT_PERSONAL_DATA'
        True
    """
    # Factories keep the defaults lazy: a fresh UUID per variable, and the
    # enum members are only looked up when actually needed.
    defaults = (
        ("id", uuid.uuid4),
        ("is_personal_data", lambda: model.IsPersonalData.NOT_PERSONAL_DATA),
        ("variable_role", lambda: VariableRole.MEASURE),
    )
    for variable in variables:
        for attribute, make_default in defaults:
            if getattr(variable, attribute) is None:
                setattr(variable, attribute, make_default())
112
+
113
+
114
+ def set_default_values_dataset(dataset: model.Dataset) -> None:
115
+ """Set default values on dataset.
116
+
117
+ Args:
118
+ dataset: The dataset object to set default values on.
119
+
120
+ Example:
121
+ >>> dataset = model.Dataset(id=None, contains_personal_data=None)
122
+ >>> set_default_values_dataset(dataset)
123
+ >>> dataset.id is not None
124
+ True
125
+
126
+ >>> dataset.contains_personal_data == False
127
+ True
128
+ """
129
+ if not dataset.id:
130
+ dataset.id = uuid.uuid4()
131
+ if dataset.contains_personal_data is None:
132
+ dataset.contains_personal_data = False
133
+
134
+
135
+ def set_variables_inherit_from_dataset(
136
+ dataset: model.Dataset,
137
+ variables: list,
138
+ ) -> None:
139
+ """Set specific dataset values on a list of variable objects.
140
+
141
+ This function populates 'data source', 'temporality type', 'contains data from',
142
+ and 'contains data until' fields in each variable if they are not set (None).
143
+ The values are inherited from the corresponding fields in the dataset.
144
+
145
+ Args:
146
+ dataset: The dataset object from which to inherit values.
147
+ variables: A list of variable objects to update with dataset values.
148
+
149
+ Example:
150
+ >>> dataset = model.Dataset(short_name='person_data_v1',data_source='01',temporality_type='STATUS',id='9662875c-c245-41de-b667-12ad2091a1ee',contains_data_from="2010-09-05",contains_data_until="2022-09-05")
151
+ >>> variables = [model.Variable(short_name="pers",data_source =None,temporality_type = None, contains_data_from = None,contains_data_until = None)]
152
+ >>> set_variables_inherit_from_dataset(dataset, variables)
153
+ >>> variables[0].data_source == dataset.data_source
154
+ True
155
+
156
+ >>> variables[0].temporality_type is None
157
+ False
158
+
159
+ >>> variables[0].contains_data_from == dataset.contains_data_from
160
+ True
161
+
162
+ >>> variables[0].contains_data_until == dataset.contains_data_until
163
+ True
164
+ """
165
+ for v in variables:
166
+ v.contains_data_from = v.contains_data_from or dataset.contains_data_from
167
+ v.contains_data_until = v.contains_data_until or dataset.contains_data_until
168
+ v.temporality_type = v.temporality_type or dataset.temporality_type
169
+ v.data_source = v.data_source or dataset.data_source
170
+
171
+
172
+ def incorrect_date_order(
173
+ date_from: datetime.date | None,
174
+ date_until: datetime.date | None,
175
+ ) -> bool:
176
+ """Evaluate the chronological order of two dates.
177
+
178
+ This function checks if 'date until' is earlier than 'date from'. If so, it
179
+ indicates an incorrect date order.
180
+
181
+ Args:
182
+ date_from: The start date of the time period.
183
+ date_until: The end date of the time period.
184
+
185
+ Returns:
186
+ True if 'date_until' is earlier than 'date_from' or if only 'date_from' is None, False otherwise.
187
+
188
+ Example:
189
+ >>> incorrect_date_order(datetime.date(1980, 1, 1), datetime.date(1967, 1, 1))
190
+ True
191
+
192
+ >>> incorrect_date_order(datetime.date(1967, 1, 1), datetime.date(1980, 1, 1))
193
+ False
194
+
195
+ >>> incorrect_date_order(None, datetime.date(2024,7,1))
196
+ True
197
+ """
198
+ if date_from is None and date_until is not None:
199
+ return True
200
+ return date_from is not None and date_until is not None and date_until < date_from
201
+
202
+
203
+ def _is_missing_multilanguage_value(
204
+ field_name: str,
205
+ field_value, # noqa: ANN001 Skip type hint to enable dynamically handling value for LanguageStringType not indexable
206
+ obligatory_list: list,
207
+ ) -> bool:
208
+ """Check obligatory fields with multilanguage value.
209
+
210
+ This function checks whether a given field, which is supposed to have
211
+ multilanguage values, is missing values in all specified languages.
212
+
213
+ Args:
214
+ field_name: The name of the field to check.
215
+ field_value: The value of the field. Expected to be of type LanguageStringType.
216
+ obligatory_list: A list of obligatory field names that should have multilanguage values.
217
+
218
+ Returns:
219
+ True if no value in any of languages for one field, False otherwise.
220
+ """
221
+ return bool(
222
+ field_name in obligatory_list
223
+ and field_value
224
+ and (
225
+ len(field_value[0]) > 0
226
+ and not field_value[0]["languageText"]
227
+ and (len(field_value) <= 1 or not field_value[1]["languageText"])
228
+ and (
229
+ len(field_value) <= 2 # noqa: PLR2004 approve magic value
230
+ or not field_value[2]["languageText"]
231
+ )
232
+ ),
233
+ )
234
+
235
+
236
def _is_missing_metadata(
    field_name: str,
    field_value,  # noqa: ANN001 Skip type hint because method '_is_missing_multilanguage_value'
    obligatory_list: list,
    obligatory_multi_language_list: list,
) -> bool:
    """Check if an obligatory field is missing its value.

    This function checks whether a given field, which may be a simple string or a
    multilanguage value, is missing its value. It considers two lists: one for
    obligatory fields and another for obligatory multilanguage fields.

    Args:
        field_name: The name of the field to check.
        field_value: The value of the field. Can be of type str, or LanguageStringType for
            multilanguage fields.
        obligatory_list: List of obligatory fields.
        obligatory_multi_language_list: List of obligatory fields with multilanguage
            values.

    Returns:
        True if the field doesn't have a value, False otherwise.
    """
    # A plain obligatory field is missing when its value is None.
    if field_name in obligatory_list and field_value is None:
        return True
    # Otherwise fall back to the multilanguage check.
    return _is_missing_multilanguage_value(
        field_name,
        field_value,
        obligatory_multi_language_list,
    )
268
+
269
+
270
def num_obligatory_dataset_fields_completed(dataset: model.Dataset) -> int:
    """Count the number of completed obligatory dataset fields.

    This function returns the total count of obligatory fields in the dataset that
    have values (are not None).

    Args:
        dataset: The dataset object for which to count the fields.

    Returns:
        The number of obligatory dataset fields that have been completed (not None).
    """
    missing_fields = get_missing_obligatory_dataset_fields(dataset)
    return len(OBLIGATORY_DATASET_METADATA_IDENTIFIERS) - len(missing_fields)
285
+
286
+
287
def num_obligatory_variables_fields_completed(variables: list) -> int:
    """Count the number of obligatory fields completed for all variables.

    This function calculates the total number of obligatory fields that have
    values (are not None) for each variable in the list.

    Args:
        variables: A list with variable objects.

    Returns:
        The total number of obligatory variable fields that have been completed
        (not None) for all variables.
    """
    return sum(num_obligatory_variable_fields_completed(v) for v in variables)
304
+
305
+
306
def num_obligatory_variable_fields_completed(variable: model.Variable) -> int:
    """Count the number of obligatory fields completed for one variable.

    This function calculates the total number of obligatory fields that have
    values (are not None) for one variable.

    Args:
        variable: The variable to count obligatory fields for.

    Returns:
        The total number of obligatory variable fields that have been completed
        (not None) for one variable.
    """
    num_missing = sum(
        1
        for field_name, field_value in variable.model_dump().items()
        if _is_missing_metadata(
            field_name,
            field_value,
            OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS,
            OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS_MULTILANGUAGE,
        )
    )
    return NUM_OBLIGATORY_VARIABLES_FIELDS - num_missing
330
+
331
+
332
+ def get_missing_obligatory_dataset_fields(dataset: model.Dataset) -> list:
333
+ """Identify all obligatory dataset fields that are missing values.
334
+
335
+ This function checks for obligatory fields that are either directly missing
336
+ (i.e., set to `None`) or have multilanguage values with empty content.
337
+
338
+ Args:
339
+ dataset: The dataset object to examine. This object must support the
340
+ `model_dump()` method which returns a dictionary of field names and
341
+ values.
342
+
343
+ Returns:
344
+ A list of field names (as strings) that are missing values. This includes:
345
+ - Fields that are directly `None` and are listed as obligatory metadata.
346
+ - Multilanguage fields (listed as obligatory metadata`) where
347
+ the value exists but the primary language text is empty.
348
+ """
349
+ return [
350
+ key
351
+ for key, value in dataset.model_dump().items()
352
+ if _is_missing_metadata(
353
+ key,
354
+ value,
355
+ OBLIGATORY_DATASET_METADATA_IDENTIFIERS,
356
+ OBLIGATORY_DATASET_METADATA_IDENTIFIERS_MULTILANGUAGE,
357
+ )
358
+ ]
359
+
360
+
361
def get_missing_obligatory_variables_fields(variables: list) -> list[dict]:
    """Identify obligatory variable fields that are missing values for each variable.

    This function checks for obligatory fields that are either directly missing
    (i.e., set to `None`) or have multilanguage values with empty content.

    Args:
        variables: A list of variable objects to check for missing obligatory fields.

    Returns:
        A list of dictionaries with variable short names as keys and list of missing
        obligatory variable fields as values. This includes:
        - Fields that are directly `None` and are listed as obligatory metadata.
        - Multilanguage fields (listed as obligatory metadata) where the value
          exists but the language texts are empty.
    """
    result: list[dict] = []
    for variable in variables:
        missing = [
            field_name
            for field_name, field_value in variable.model_dump().items()
            if _is_missing_metadata(
                field_name,
                field_value,
                OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS,
                OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS_MULTILANGUAGE,
            )
        ]
        # Only report variables that actually have missing fields.
        if missing:
            result.append({variable.short_name: missing})
    return result
394
+
395
+
396
def running_in_notebook() -> bool:
    """Return True if running in Jupyter Notebook."""
    try:
        shell_name = get_ipython().__class__.__name__  # type: ignore [name-defined]
    except NameError:
        # The get_ipython method is globally available in ipython interpreters
        # as used in Jupyter. However it is not available in other python
        # interpreters and will throw a NameError. Therefore we're not running
        # in Jupyter.
        return False
    return shell_name == "ZMQInteractiveShell"