dapla-toolbelt-metadata 0.2.1__py3-none-any.whl → 0.9.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dapla-toolbelt-metadata might be problematic.
- dapla_metadata/__init__.py +11 -1
- dapla_metadata/_shared/__init__.py +1 -0
- dapla_metadata/_shared/config.py +109 -0
- dapla_metadata/_shared/enums.py +27 -0
- dapla_metadata/_shared/py.typed +0 -0
- dapla_metadata/dapla/__init__.py +4 -0
- dapla_metadata/dapla/user_info.py +138 -0
- dapla_metadata/datasets/__init__.py +1 -1
- dapla_metadata/datasets/_merge.py +333 -0
- dapla_metadata/datasets/code_list.py +5 -6
- dapla_metadata/datasets/compatibility/__init__.py +10 -0
- dapla_metadata/datasets/compatibility/_handlers.py +363 -0
- dapla_metadata/datasets/compatibility/_utils.py +259 -0
- dapla_metadata/datasets/compatibility/model_backwards_compatibility.py +135 -0
- dapla_metadata/datasets/core.py +136 -182
- dapla_metadata/datasets/dapla_dataset_path_info.py +145 -19
- dapla_metadata/datasets/dataset_parser.py +41 -28
- dapla_metadata/datasets/model_validation.py +29 -20
- dapla_metadata/datasets/statistic_subject_mapping.py +5 -1
- dapla_metadata/datasets/utility/constants.py +22 -15
- dapla_metadata/datasets/utility/enums.py +8 -20
- dapla_metadata/datasets/utility/urn.py +234 -0
- dapla_metadata/datasets/utility/utils.py +183 -111
- dapla_metadata/standards/__init__.py +4 -0
- dapla_metadata/standards/name_validator.py +250 -0
- dapla_metadata/standards/standard_validators.py +98 -0
- dapla_metadata/standards/utils/__init__.py +1 -0
- dapla_metadata/standards/utils/constants.py +49 -0
- dapla_metadata/variable_definitions/__init__.py +11 -0
- dapla_metadata/variable_definitions/_generated/.openapi-generator/FILES +20 -0
- dapla_metadata/variable_definitions/_generated/.openapi-generator/VERSION +1 -0
- dapla_metadata/variable_definitions/_generated/.openapi-generator-ignore +6 -0
- dapla_metadata/variable_definitions/_generated/README.md +148 -0
- dapla_metadata/variable_definitions/_generated/__init__.py +0 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/__init__.py +47 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api/__init__.py +8 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api/data_migration_api.py +766 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api/draft_variable_definitions_api.py +888 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api/patches_api.py +888 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api/validity_periods_api.py +583 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api/variable_definitions_api.py +613 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api_client.py +779 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/api_response.py +27 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/configuration.py +474 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/CompleteResponse.md +51 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/Contact.md +30 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/DataMigrationApi.md +90 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/Draft.md +42 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/DraftVariableDefinitionsApi.md +259 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/LanguageStringType.md +31 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/Owner.md +31 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/Patch.md +43 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/PatchesApi.md +249 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/PublicApi.md +218 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/SupportedLanguages.md +15 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/UpdateDraft.md +44 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/ValidityPeriod.md +42 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/ValidityPeriodsApi.md +236 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/VariableDefinitionsApi.md +304 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/docs/VariableStatus.md +17 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/exceptions.py +193 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/__init__.py +31 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/complete_response.py +260 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/contact.py +94 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/draft.py +228 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/get_vardok_vardef_mapping_by_id200_response.py +158 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/language_string_type.py +101 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/owner.py +87 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/patch.py +244 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/problem.py +118 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/update_draft.py +274 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/validity_period.py +225 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/vardok_id_response.py +81 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/vardok_vardef_id_pair_response.py +84 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/models/variable_status.py +33 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/py.typed +0 -0
- dapla_metadata/variable_definitions/_generated/vardef_client/rest.py +249 -0
- dapla_metadata/variable_definitions/_utils/__init__.py +1 -0
- dapla_metadata/variable_definitions/_utils/_client.py +32 -0
- dapla_metadata/variable_definitions/_utils/config.py +54 -0
- dapla_metadata/variable_definitions/_utils/constants.py +80 -0
- dapla_metadata/variable_definitions/_utils/files.py +309 -0
- dapla_metadata/variable_definitions/_utils/template_files.py +99 -0
- dapla_metadata/variable_definitions/_utils/variable_definition_files.py +143 -0
- dapla_metadata/variable_definitions/exceptions.py +255 -0
- dapla_metadata/variable_definitions/vardef.py +372 -0
- dapla_metadata/variable_definitions/vardok_id.py +48 -0
- dapla_metadata/variable_definitions/vardok_vardef_id_pair.py +47 -0
- dapla_metadata/variable_definitions/variable_definition.py +422 -0
- {dapla_toolbelt_metadata-0.2.1.dist-info → dapla_toolbelt_metadata-0.9.11.dist-info}/METADATA +34 -36
- dapla_toolbelt_metadata-0.9.11.dist-info/RECORD +97 -0
- {dapla_toolbelt_metadata-0.2.1.dist-info → dapla_toolbelt_metadata-0.9.11.dist-info}/WHEEL +1 -1
- dapla_metadata/datasets/config.py +0 -80
- dapla_metadata/datasets/model_backwards_compatibility.py +0 -520
- dapla_metadata/datasets/user_info.py +0 -88
- dapla_toolbelt_metadata-0.2.1.dist-info/RECORD +0 -22
- {dapla_toolbelt_metadata-0.2.1.dist-info → dapla_toolbelt_metadata-0.9.11.dist-info/licenses}/LICENSE +0 -0
--- a/dapla_metadata/datasets/dapla_dataset_path_info.py
+++ b/dapla_metadata/datasets/dapla_dataset_path_info.py
@@ -14,7 +14,7 @@ from typing import Literal
 
 import arrow
 from cloudpathlib import GSPath
-from datadoc_model.model import DataSetState
+from datadoc_model.all_optional.model import DataSetState
 
 if TYPE_CHECKING:
     import datetime
@@ -141,6 +141,9 @@ class SsbDateFormat(DateFormat):
 
         >>> SSB_BIMESTER.get_floor("2003B4")
         datetime.date(2003, 7, 1)
+
+        >>> SSB_BIMESTER.get_floor("2003-B4")
+        datetime.date(2003, 7, 1)
         """
         try:
             year = period_string[:4]
@@ -170,6 +173,9 @@ class SsbDateFormat(DateFormat):
 
         >>> SSB_HALF_YEAR.get_ceil("2024H1")
         datetime.date(2024, 6, 30)
+
+        >>> SSB_HALF_YEAR.get_ceil("2024-H1")
+        datetime.date(2024, 6, 30)
         """
         try:
             year = period_string[:4]
@@ -182,7 +188,7 @@ class SsbDateFormat(DateFormat):
 
 SSB_BIMESTER = SsbDateFormat(
     name="SSB_BIMESTER",
-    regex_pattern=r"^\d{4}[B]\d{1}$",
+    regex_pattern=r"^\d{4}-?[B]\d{1}$",
     arrow_pattern="YYYYMM",
     timeframe="month",
     ssb_dates={
@@ -215,7 +221,7 @@ SSB_BIMESTER = SsbDateFormat(
 
 SSB_QUARTERLY = SsbDateFormat(
     name="SSB_QUARTERLY",
-    regex_pattern=r"^\d{4}[Q]\d{1}$",
+    regex_pattern=r"^\d{4}-?[Q]\d{1}$",
     arrow_pattern="YYYYMM",
     timeframe="month",
     ssb_dates={
@@ -240,7 +246,7 @@ SSB_QUARTERLY = SsbDateFormat(
 
 SSB_TRIANNUAL = SsbDateFormat(
     name="SSB_TRIANNUAL",
-    regex_pattern=r"^\d{4}[T]\d{1}$",
+    regex_pattern=r"^\d{4}-?[T]\d{1}$",
    arrow_pattern="YYYYMM",
     timeframe="month",
     ssb_dates={
@@ -260,7 +266,7 @@ SSB_TRIANNUAL = SsbDateFormat(
 )
 SSB_HALF_YEAR = SsbDateFormat(
     name="SSB_HALF_YEAR",
-    regex_pattern=r"^\d{4}[H]\d{1}$",
+    regex_pattern=r"^\d{4}-?[H]\d{1}$",
     arrow_pattern="YYYYMM",
     timeframe="month",
     ssb_dates={
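The only change to the four period formats is the `-?` inserted between the year and the period letter, so both the compact and the hyphenated spellings now validate. A minimal sketch (pattern copied from the SSB_QUARTERLY hunk above; the sample periods are illustrative):

```python
import re

# Pattern copied from the SSB_QUARTERLY hunk above; "-?" is the new part.
QUARTERLY = re.compile(r"^\d{4}-?[Q]\d{1}$")

assert QUARTERLY.match("1990Q1")       # old compact form still accepted
assert QUARTERLY.match("1990-Q1")      # hyphenated form now accepted too
assert not QUARTERLY.match("1990-Q")   # the period digit is still required
```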
@@ -413,6 +419,9 @@ class DaplaDatasetPathInfo:
         >>> DaplaDatasetPathInfo._extract_period_strings(['p1990Q1', 'kommune', 'v1'])
         ['1990Q1']
 
+        >>> DaplaDatasetPathInfo._extract_period_strings(['p1990-Q1', 'kommune', 'v1'])
+        ['1990-Q1']
+
         >>> DaplaDatasetPathInfo._extract_period_strings(['varehandel','v1'])
         []
         """
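A hypothetical re-implementation of the doctest behaviour above, not the library's actual method: path parts prefixed with `p` and a four-digit year are treated as period strings, hyphenated or not, and the prefix is stripped:

```python
import re

# Hypothetical helper mirroring the doctests above; not the real method.
PERIOD_PART = re.compile(r"^p(\d{4}.*)$")

def extract_period_strings(path_parts: list[str]) -> list[str]:
    """Return the period portion of every 'p'-prefixed path part."""
    return [m.group(1) for part in path_parts if (m := PERIOD_PART.match(part))]

assert extract_period_strings(["p1990-Q1", "kommune", "v1"]) == ["1990-Q1"]
assert extract_period_strings(["varehandel", "v1"]) == []
```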
@@ -469,7 +478,7 @@ class DaplaDatasetPathInfo:
         """Extract the bucket name from the dataset path.
 
         Returns:
-            The bucket name or None if the dataset path is not a GCS path.
+            The bucket name or None if the dataset path is not a GCS path nor ssb bucketeer path.
 
         Examples:
             >>> DaplaDatasetPathInfo('gs://ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_v2.parquet').bucket_name
@@ -483,17 +492,35 @@ class DaplaDatasetPathInfo:
 
         >>> DaplaDatasetPathInfo('ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_v2.parquet').bucket_name
         None
+
+        >>> DaplaDatasetPathInfo('ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_v2.parquet').bucket_name
+        None
+
+        >>> DaplaDatasetPathInfo('buckets/ssb-staging-dapla-felles-data-delt/stat/utdata/person_data_p2021_v2.parquet').bucket_name
+        ssb-staging-dapla-felles-data-delt
+
+        >>> DaplaDatasetPathInfo('buckets/ssb-staging-dapla-felles-data-delt/person_data_p2021_v2.parquet').bucket_name
+        ssb-staging-dapla-felles-data-delt
+
+        >>> DaplaDatasetPathInfo('home/work/buckets/ssb-staging-dapla-felles-produkt/stat/utdata/person_data_p2021_v2.parquet').bucket_name
+        ssb-staging-dapla-felles-produkt
         """
         prefix: str | None = None
-
+        dataset_string = str(self.dataset_string)
+        if GSPath.cloud_prefix in self.dataset_string:
             prefix = GSPath.cloud_prefix
-
+            _, bucket_and_rest = dataset_string.split(prefix, 1)
+        elif GS_PREFIX_FROM_PATHLIB in self.dataset_string:
             prefix = GS_PREFIX_FROM_PATHLIB
+            _, bucket_and_rest = self.dataset_string.split(prefix, 1)
+        elif "buckets/" in self.dataset_string:
+            prefix = "buckets/"
+            _, bucket_and_rest = self.dataset_string.split(prefix, 1)
         else:
             return None
 
         return pathlib.Path(
-
+            bucket_and_rest,
         ).parts[0]
 
     @property
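The property now recognises three prefixes instead of one. An illustrative standalone version of the lookup order, assuming GS_PREFIX_FROM_PATHLIB is the `gs:/` form that pathlib collapses `gs://` into (paths shortened for illustration):

```python
from pathlib import Path

# Standalone sketch of the new lookup order; not the library's own function.
def bucket_name(dataset: str) -> str | None:
    for prefix in ("gs://", "gs:/", "buckets/"):
        if prefix in dataset:
            _, bucket_and_rest = dataset.split(prefix, 1)
            return Path(bucket_and_rest).parts[0]
    return None

assert bucket_name("gs://ssb-data/stat/utdata/x.parquet") == "ssb-data"
assert bucket_name("home/work/buckets/ssb-produkt/stat/x.parquet") == "ssb-produkt"
assert bucket_name("ssb-data/stat/utdata/x.parquet") is None
```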
@@ -519,6 +546,15 @@ class DaplaDatasetPathInfo:
 
         >>> DaplaDatasetPathInfo('my_data/simple_dataset_name.parquet').dataset_short_name
         simple_dataset_name
+
+        >>> DaplaDatasetPathInfo('gs:/ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_v2.parquet').dataset_short_name
+        person_data
+
+        >>> DaplaDatasetPathInfo('buckets/ssb-staging-dapla-felles-data-delt/stat/utdata/folk_data_p2021_v2.parquet').dataset_short_name
+        folk_data
+
+        >>> DaplaDatasetPathInfo('buckets/ssb-staging-dapla-felles-data-delt/stat/utdata/dapla/bus_p2021_v2.parquet').dataset_short_name
+        bus
         """
         if self.contains_data_from or self.contains_data_until:
             short_name_sections = self.dataset_name_sections[
@@ -586,9 +622,15 @@ class DaplaDatasetPathInfo:
         >>> DaplaDatasetPathInfo('klargjorte_data/person_data_v1.parquet').dataset_state
         <DataSetState.PROCESSED_DATA: 'PROCESSED_DATA'>
 
+        >>> DaplaDatasetPathInfo('klargjorte-data/person_data_v1.parquet').dataset_state
+        <DataSetState.PROCESSED_DATA: 'PROCESSED_DATA'>
+
         >>> DaplaDatasetPathInfo('utdata/min_statistikk/person_data_v1.parquet').dataset_state
         <DataSetState.OUTPUT_DATA: 'OUTPUT_DATA'>
 
+        >>> DaplaDatasetPathInfo('buckets/bucket_name/stat_name/inndata/min_statistikk/person_data_v1.parquet').dataset_state
+        <DataSetState.INPUT_DATA: 'INPUT_DATA'>
+
         >>> DaplaDatasetPathInfo('my_special_data/person_data_v1.parquet').dataset_state
         None
         """
@@ -620,6 +662,12 @@ class DaplaDatasetPathInfo:
 
         >>> DaplaDatasetPathInfo('person_data.parquet').dataset_version
         None
+
+        >>> DaplaDatasetPathInfo('buckets/bucket_name/stat_name/inndata/min_statistikk/person_data_v1.parquet').dataset_version
+        '1'
+
+        >>> DaplaDatasetPathInfo('buckets/bucket_name/stat_name/inndata/min_statistikk/person_data.parquet').dataset_version
+        None
         """
         minimum_elements_in_file_name: Final[int] = 2
         minimum_characters_in_version_string: Final[int] = 2
@@ -633,13 +681,37 @@ class DaplaDatasetPathInfo:
             return last_filename_element[1:]
         return None
 
+    def _get_left_parts(
+        self,
+        dataset_path_parts: list[str],
+        state_index: int,
+    ) -> list[str]:
+        """Retrieve the path parts before the dataset state, considering bucket prefixes."""
+        bucket_prefix = {"gs:", "buckets"}
+        left_parts = dataset_path_parts[:state_index]
+
+        # Stop checking beyond the bucket prefix
+        prefix_intersection = bucket_prefix & set(left_parts)
+        if prefix_intersection:
+            first_prefix = min(
+                left_parts.index(prefix) for prefix in prefix_intersection
+            )
+            left_parts = left_parts[first_prefix:]
+
+        return (
+            []
+            if left_parts == ["/"]
+            or (left_parts[0] in bucket_prefix and len(left_parts) <= 2)
+            else left_parts
+        )
+
     @property
     def statistic_short_name(
         self,
     ) -> str | None:
         """Extract the statistical short name from the filepath.
 
-        Extract the statistical short name from the filepath right before the
+        Extract the statistical short name from the filepath either after bucket name or right before the
         dataset state based on the Dapla filepath naming convention.
 
         Returns:
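A rough standalone version of the helper's trimming rule, for illustration: everything before the first `gs:` or `buckets` marker is discarded, and a bucket followed only by the dataset state leaves nothing usable to the left:

```python
# Rough standalone version of the _get_left_parts rule above (illustrative).
def get_left_parts(parts: list[str], state_index: int) -> list[str]:
    bucket_prefix = {"gs:", "buckets"}
    left = parts[:state_index]
    hits = bucket_prefix & set(left)
    if hits:  # drop anything before the first bucket marker
        left = left[min(left.index(p) for p in hits):]
    if left == ["/"] or (left and left[0] in bucket_prefix and len(left) <= 2):
        return []
    return left

# Bucket followed directly by the state: nothing left to name a statistic.
assert get_left_parts(["buckets", "stat_name", "utdata", "x.parquet"], 2) == []
# A short name sits between the bucket and the state and survives the trim.
assert get_left_parts(["buckets", "b", "stat", "inndata", "x.parquet"], 3) == ["buckets", "b", "stat"]
```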
@@ -650,21 +722,75 @@ class DaplaDatasetPathInfo:
         >>> DaplaDatasetPathInfo('prosjekt/befolkning/klargjorte_data/person_data_v1.parquet').statistic_short_name
         befolkning
 
+        >>> DaplaDatasetPathInfo('buckets/prosjekt/befolkning/person_data_v1.parquet').statistic_short_name
+        befolkning
+
         >>> DaplaDatasetPathInfo('befolkning/inndata/person_data_v1.parquet').statistic_short_name
         befolkning
 
+        >>> DaplaDatasetPathInfo('buckets/bucket_name/stat_name/inndata/min_statistikk/person_data.parquet').statistic_short_name
+        stat_name
+
+        >>> DaplaDatasetPathInfo('buckets/stat_name/utdata/person_data.parquet').statistic_short_name
+        None
+
         >>> DaplaDatasetPathInfo('befolkning/person_data.parquet').statistic_short_name
         None
+
+        >>> DaplaDatasetPathInfo('buckets/produkt/befolkning/utdata/person_data.parquet').statistic_short_name
+        befolkning
+
+        >>> DaplaDatasetPathInfo('resources/buckets/produkt/befolkning/utdata/person_data.parquet').statistic_short_name
+        befolkning
+
+        >>> DaplaDatasetPathInfo('gs://statistikk/produkt/klargjorte-data/persondata_p1990-Q1_p2023-Q4_v1/aar=2019/data.parquet').statistic_short_name
+        produkt
+
+        >>> DaplaDatasetPathInfo('gs://statistikk/produkt/persondata_p1990-Q1_p2023-Q4_v1/aar=2019/data.parquet').statistic_short_name
+        None
+
+        >>> DaplaDatasetPathInfo('buckets/ssb-staging-dapla-felles-data-delt/person_data_p2021_v2.parquet').statistic_short_name
+        None
         """
-
-
-
-
-
-
-
-
-
+        if not self.dataset_state:
+            if self.bucket_name:
+                parts = self.dataset_path.parent.parts
+
+                if self.bucket_name not in parts:
+                    return None
+
+                # Find the index of bucket_name in the path
+                bucket_name_index = self.dataset_path.parent.parts.index(
+                    self.bucket_name,
+                )
+
+                # If there are parts after bucket_name, return the part immediately after it
+                if len(self.dataset_path.parent.parts) > bucket_name_index + 1:
+                    return self.dataset_path.parent.parts[bucket_name_index + 1]
+
+            return None
+
+        dataset_state_names = self._extract_norwegian_dataset_state_path_part(
+            self.dataset_state,
+        )
+        dataset_path_parts = list(self.dataset_path.parts)
+
+        for state in dataset_state_names:
+            if state not in dataset_path_parts:
+                continue
+
+            index = dataset_path_parts.index(state)
+
+            if index == 0:
+                continue
+
+            left_parts = self._get_left_parts(dataset_path_parts, index)
+
+            if not left_parts:
+                return None
+
+            return dataset_path_parts[index - 1]
+
         return None
 
     def path_complies_with_naming_standard(self) -> bool:
--- a/dapla_metadata/datasets/dataset_parser.py
+++ b/dapla_metadata/datasets/dataset_parser.py
@@ -5,18 +5,17 @@ Handles reading in the data and transforming data types to generic metadata types.
 
 from __future__ import annotations
 
-import pathlib  # noqa: TCH003 import is needed for docs build
+import pathlib  # noqa: TC003 import is needed for docs build
 import re
-import typing as t
 from abc import ABC
 from abc import abstractmethod
 from typing import TYPE_CHECKING
 
 import pandas as pd
-from datadoc_model.model import DataType
-from datadoc_model.model import LanguageStringType
-from datadoc_model.model import LanguageStringTypeItem
-from datadoc_model.model import Variable
+from datadoc_model.all_optional.model import DataType
+from datadoc_model.all_optional.model import LanguageStringType
+from datadoc_model.all_optional.model import LanguageStringTypeItem
+from datadoc_model.all_optional.model import Variable
 from pyarrow import parquet as pq
 
 from dapla_metadata.datasets.utility.enums import SupportedLanguages
@@ -56,6 +55,8 @@ KNOWN_FLOAT_TYPES = (
 
 KNOWN_STRING_TYPES = (
     "string",
+    "string[pyarrow]",
+    "large_string",
     "str",
     "char",
     "varchar",
@@ -67,13 +68,18 @@ KNOWN_STRING_TYPES = (
 
 KNOWN_DATETIME_TYPES = (
     "timestamp",
+    "timestamp[s]",
+    "timestamp[ms]",
     "timestamp[us]",
     "timestamp[ns]",
+    "datetime",
     "datetime64",
-    " datetime64[ns]",
-    " datetime64[us]",
+    "datetime64[s]",
+    "datetime64[ms]",
+    "datetime64[us]",
+    "datetime64[ns]",
     "date",
-    " date32[day]",
+    "date32[day]",
     "time",
 )
 
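The expanded tuple adds the unit-suffixed spellings that pandas and pyarrow actually report, so dtype lookups no longer miss them. An illustrative check (tuple copied from the hunk above; default pandas builds report nanosecond precision, so the exact dtype string is environment-dependent):

```python
import pandas as pd

# Tuple as it reads after this hunk.
KNOWN_DATETIME_TYPES = (
    "timestamp", "timestamp[s]", "timestamp[ms]", "timestamp[us]",
    "timestamp[ns]", "datetime", "datetime64", "datetime64[s]",
    "datetime64[ms]", "datetime64[us]", "datetime64[ns]", "date",
    "date32[day]", "time",
)

dtype = str(pd.Series(pd.to_datetime(["2024-01-01"])).dtype)
assert dtype in KNOWN_DATETIME_TYPES  # "datetime64[ns]" on default builds
```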
@@ -89,9 +95,7 @@ TYPE_CORRESPONDENCE: list[tuple[tuple[str, ...], DataType]] = [
 ]
 TYPE_MAP: dict[str, DataType] = {}
 for concrete_type, abstract_type in TYPE_CORRESPONDENCE:
-    TYPE_MAP.update(
-
-TDatasetParser = t.TypeVar("TDatasetParser", bound="DatasetParser")
+    TYPE_MAP.update(dict.fromkeys(concrete_type, abstract_type))
 
 
 class DatasetParser(ABC):
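`dict.fromkeys` maps every key in the tuple to the same abstract type, which is what the removed comprehension-style update did. A quick equivalence check with illustrative values:

```python
concrete_type = ("string", "str", "char")  # illustrative values
abstract_type = "STRING"

assert dict.fromkeys(concrete_type, abstract_type) == {
    c: abstract_type for c in concrete_type
}
```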
@@ -112,31 +116,23 @@ class DatasetParser(ABC):
     @staticmethod
     def for_file(dataset: pathlib.Path | CloudPath) -> DatasetParser:
         """Return the correct subclass based on the given dataset file."""
-        supported_file_types: dict[
-            str,
-            type[DatasetParser],
-        ] = {
-            ".parquet": DatasetParserParquet,
-            ".sas7bdat": DatasetParserSas7Bdat,
-            ".parquet.gzip": DatasetParserParquet,
-        }
         file_type = "Unknown"
         try:
             file_type = dataset.suffix
             # Gzipped parquet files can be read with DatasetParserParquet
-            match = re.search(
-            file_type = ".parquet.gzip" if match else file_type
-            # Extract the appropriate reader class from the SUPPORTED_FILE_TYPES dict
-            reader = supported_file_types[file_type](dataset)
+            match = re.search(PARQUET_GZIP_FILE_SUFFIX, str(dataset).lower())
+            file_type = PARQUET_GZIP_FILE_SUFFIX if match else file_type
+            # Extract the appropriate reader class from the SUPPORTED_FILE_TYPES dict
+            reader = SUPPORTED_DATASET_FILE_SUFFIXES[file_type](dataset)
         except IndexError as e:
             # Thrown when just one element is returned from split, meaning there is no file extension supplied
-            msg = f"Could not recognise file type for provided {dataset = }. Supported file types are: {', '.join(supported_file_types.keys())}"
+            msg = f"Could not recognise file type for provided {dataset = }. Supported file types are: {', '.join(SUPPORTED_DATASET_FILE_SUFFIXES.keys())}"
             raise FileNotFoundError(
                 msg,
             ) from e
         except KeyError as e:
             # In this case the file type is not supported, so we throw a helpful exception
-            msg = f"{file_type = } is not supported. Please open one of the following supported files types: {', '.join(
+            msg = f"{file_type = } is not supported. Please open one of the following supported files types: {', '.join(SUPPORTED_DATASET_FILE_SUFFIXES.keys())} or contact the maintainers to request support."
             raise NotImplementedError(
                 msg,
             ) from e
@@ -157,6 +153,9 @@ class DatasetParser(ABC):
 
         Arguments:
             data_type: The concrete data type to map.
+
+        Returns:
+            The abstract data type or None
         """
         return TYPE_MAP.get(data_type.lower(), None)
 
@@ -179,11 +178,11 @@ class DatasetParserParquet(DatasetParser):
     def get_fields(self) -> list[Variable]:
         """Extract the fields from this dataset."""
         with self.dataset.open(mode="rb") as f:
-            schema: pa.Schema = pq.read_schema(f)  # type: ignore [arg-type]
+            schema: pa.Schema = pq.read_schema(f)  # type: ignore [arg-type, assignment]
         return [
             Variable(
                 short_name=data_field.name.strip(),
-                data_type=self.transform_data_type(str(data_field.type)),
+                data_type=self.transform_data_type(str(data_field.type)),  # type: ignore [attr-defined]
             )
             for data_field in schema
             if data_field.name
@@ -239,3 +238,17 @@ class DatasetParserSas7Bdat(DatasetParser):
             )
 
         return fields
+
+
+PARQUET_FILE_SUFFIX = ".parquet"
+PARQUET_GZIP_FILE_SUFFIX = ".parquet.gzip"
+SAS7BDAT_FILE_SUFFIX = ".sas7bdat"
+
+SUPPORTED_DATASET_FILE_SUFFIXES: dict[
+    str,
+    type[DatasetParser],
+] = {
+    PARQUET_FILE_SUFFIX: DatasetParserParquet,
+    PARQUET_GZIP_FILE_SUFFIX: DatasetParserParquet,
+    SAS7BDAT_FILE_SUFFIX: DatasetParserSas7Bdat,
+}
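Hoisting the mapping to module level lets `for_file` and both error messages share one source of truth. A self-contained sketch of the dispatch (the parser classes are stand-in stubs here, not the real implementations):

```python
# Stand-in stubs; the real classes live in dataset_parser.py.
class DatasetParserParquet: ...
class DatasetParserSas7Bdat: ...

PARQUET_FILE_SUFFIX = ".parquet"
PARQUET_GZIP_FILE_SUFFIX = ".parquet.gzip"
SAS7BDAT_FILE_SUFFIX = ".sas7bdat"

SUPPORTED_DATASET_FILE_SUFFIXES = {
    PARQUET_FILE_SUFFIX: DatasetParserParquet,
    PARQUET_GZIP_FILE_SUFFIX: DatasetParserParquet,
    SAS7BDAT_FILE_SUFFIX: DatasetParserSas7Bdat,
}

# Suffix lookup selects the parser class; a KeyError here is what
# for_file turns into the NotImplementedError above.
assert SUPPORTED_DATASET_FILE_SUFFIXES[".parquet.gzip"] is DatasetParserParquet
```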
--- a/dapla_metadata/datasets/model_validation.py
+++ b/dapla_metadata/datasets/model_validation.py
@@ -13,20 +13,19 @@ from typing_extensions import Self
 
 from dapla_metadata.datasets.utility.constants import DATE_VALIDATION_MESSAGE
 from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_DATASET_FIELDS
-from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_VARIABLES_FIELDS
 from dapla_metadata.datasets.utility.constants import OBLIGATORY_METADATA_WARNING
 from dapla_metadata.datasets.utility.utils import get_missing_obligatory_dataset_fields
 from dapla_metadata.datasets.utility.utils import (
     get_missing_obligatory_variables_fields,
 )
+from dapla_metadata.datasets.utility.utils import (
+    get_missing_obligatory_variables_pseudo_fields,
+)
 from dapla_metadata.datasets.utility.utils import get_timestamp_now
 from dapla_metadata.datasets.utility.utils import incorrect_date_order
 from dapla_metadata.datasets.utility.utils import (
     num_obligatory_dataset_fields_completed,
 )
-from dapla_metadata.datasets.utility.utils import (
-    num_obligatory_variables_fields_completed,
-)
 from dapla_metadata.datasets.utility.utils import set_variables_inherit_from_dataset
 
 if TYPE_CHECKING:
@@ -146,21 +145,31 @@ class ValidateDatadocMetadata(model.DatadocMetadata):
             ObligatoryVariableWarning: If not all obligatory variable metadata fields
                 are filled in.
         """
-        if self.variables is not None
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if self.variables is not None:
+            missing_fields_dict = {}
+            for d in get_missing_obligatory_variables_fields(self.variables):
+                for var, fields in d.items():
+                    missing_fields_dict[var] = fields.copy()
+
+            for d in get_missing_obligatory_variables_pseudo_fields(self.variables):
+                for var, fields in d.items():
+                    if var in missing_fields_dict:
+                        missing_fields_dict[var].extend(fields)
+                    else:
+                        missing_fields_dict[var] = fields.copy()
+
+            missing_fields = [
+                {var: fields} for var, fields in missing_fields_dict.items()
+            ]
+            if missing_fields:
+                message = f"{OBLIGATORY_METADATA_WARNING} {missing_fields}"
+                warnings.warn(message, ObligatoryVariableWarning, stacklevel=2)
+                logger.warning(
+                    "Type warning: %s.%s %s",
+                    ObligatoryVariableWarning,
+                    OBLIGATORY_METADATA_WARNING,
+                    missing_fields,
+                )
         return self
 
 
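The validator now merges two per-variable reports (core fields and pseudonymization fields) into one warning payload. A standalone sketch of that merge with illustrative input; `dict.setdefault` gives the same result as the copy/extend branches above:

```python
# Illustrative inputs shaped like the two helpers' return values.
core = [{"var_a": ["name"]}, {"var_b": ["data_type"]}]
pseudo = [{"var_a": ["encryption_algorithm"]}]

merged: dict[str, list[str]] = {}
for report in core + pseudo:
    for var, fields in report.items():
        merged.setdefault(var, []).extend(fields)

missing_fields = [{var: fields} for var, fields in merged.items()]
assert missing_fields == [
    {"var_a": ["name", "encryption_algorithm"]},
    {"var_b": ["data_type"]},
]
```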
@@ -176,7 +185,7 @@ class ObligatoryVariableWarning(UserWarning):
     """Custom warning for checking obligatory metadata for variables."""
 
 
-def custom_warning_handler(
+def custom_warning_handler(
     message: Warning | str,
     category: type[Warning],
     filename: str,
--- a/dapla_metadata/datasets/statistic_subject_mapping.py
+++ b/dapla_metadata/datasets/statistic_subject_mapping.py
@@ -140,7 +140,11 @@ class StatisticSubjectMapping(GetExternalSource):
                 SecondarySubject(
                     self._extract_titles(s.titler),
                     s["emnekode"],
-                    [
+                    [
+                        statistikk["kortnavn"]
+                        for statistikk in s.find_all("Statistikk")
+                        if statistikk["isPrimaerPlassering"] == "true"
+                    ],
                 )
                 for s in p.find_all("delemne")
             ]
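The new filter keeps only statistics whose placement under the sub-subject is marked primary. A minimal sketch against hand-written XML; the element and attribute shape is assumed from the code above, not taken from the real subject register:

```python
from bs4 import BeautifulSoup

# Hand-written sample; element/attribute names assumed from the hunk above.
xml = """
<delemne emnekode="02.01">
  <Statistikk kortnavn="befolkning" isPrimaerPlassering="true"/>
  <Statistikk kortnavn="dode" isPrimaerPlassering="false"/>
</delemne>
"""
delemne = BeautifulSoup(xml, "xml")
primary = [
    statistikk["kortnavn"]
    for statistikk in delemne.find_all("Statistikk")
    if statistikk["isPrimaerPlassering"] == "true"
]
assert primary == ["befolkning"]
```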
--- a/dapla_metadata/datasets/utility/constants.py
+++ b/dapla_metadata/datasets/utility/constants.py
@@ -1,7 +1,7 @@
 """Repository for constant values in Datadoc backend."""
 
-from datadoc_model.model import LanguageStringType
-from datadoc_model.model import LanguageStringTypeItem
+from datadoc_model.all_optional.model import LanguageStringType
+from datadoc_model.all_optional.model import LanguageStringTypeItem
 
 VALIDATION_ERROR = "Validation error: "
 
@@ -9,7 +9,7 @@ DATE_VALIDATION_MESSAGE = f"{VALIDATION_ERROR}contains_data_from must be the sam
 
 OBLIGATORY_METADATA_WARNING = "Obligatory metadata is missing: "
 
-INCONSISTENCIES_MESSAGE = "Inconsistencies found between extracted and existing metadata
+INCONSISTENCIES_MESSAGE = "Inconsistencies found between extracted and existing metadata! This usually means that the new dataset has a different structure and that the version number should be incremented.\nDetails:"
 
 OBLIGATORY_DATASET_METADATA_IDENTIFIERS: list = [
     "assessment",
@@ -17,12 +17,9 @@ OBLIGATORY_DATASET_METADATA_IDENTIFIERS: list = [
     "dataset_status",
     "name",
     "description",
-    "data_source",
     "population_description",
     "version",
     "version_description",
-    "unit_type",
-    "temporality_type",
     "subject_field",
     "spatial_coverage_description",
     "owner",
@@ -44,8 +41,18 @@ OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS = [
     "data_type",
     "variable_role",
     "is_personal_data",
+    "unit_type",
+    "population_description",
+    "data_source",
+    "temporality_type",
+]
+
+OBLIGATORY_VARIABLES_PSEUDONYMIZATION_IDENTIFIERS = [
+    "encryption_algorithm",
+    "encryption_key_reference",
 ]
 
+
 OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS_MULTILANGUAGE = [
     "name",
 ]
@@ -75,22 +82,22 @@ DATASET_FIELDS_FROM_EXISTING_METADATA = [
     "dataset_status",
     "name",
     "description",
-    "data_source",
     "population_description",
-    "unit_type",
-    "temporality_type",
     "subject_field",
     "keyword",
     "spatial_coverage_description",
-    "
-    "use_restriction",
-    "use_restriction_date",
+    "use_restrictions",
     "custom_type",
     "owner",
+    "version_description",
 ]
 
 METADATA_DOCUMENT_FILE_SUFFIX = "__DOC.json"
 
-
-
-
+PAPIS_STABLE_IDENTIFIER_TYPE = "FREG_SNR"
+PAPIS_ENCRYPTION_KEY_REFERENCE = "papis-common-key-1"
+DAEAD_ENCRYPTION_KEY_REFERENCE = "ssb-common-key-1"
+ENCRYPTION_PARAMETER_SNAPSHOT_DATE = "snapshotDate"
+ENCRYPTION_PARAMETER_KEY_ID = "keyId"
+ENCRYPTION_PARAMETER_STRATEGY = "strategy"
+ENCRYPTION_PARAMETER_STRATEGY_SKIP = "skip"
--- a/dapla_metadata/datasets/utility/enums.py
+++ b/dapla_metadata/datasets/utility/enums.py
@@ -5,31 +5,19 @@ from __future__ import annotations
 from enum import Enum
 
 
-class DaplaRegion(str, Enum):
-    """Dapla platforms/regions."""
-
-    DAPLA_LAB = "DAPLA_LAB"
-    BIP = "BIP"
-    ON_PREM = "ON_PREM"
-    CLOUD_RUN = "CLOUD_RUN"
-
-
-class DaplaService(str, Enum):
-    """Dapla services."""
-
-    DATADOC = "DATADOC"
-    JUPYTERLAB = "JUPYTERLAB"
-    VS_CODE = "VS_CODE"
-    R_STUDIO = "R_STUDIO"
-    KILDOMATEN = "KILDOMATEN"
-
-
 class SupportedLanguages(str, Enum):
     """The list of languages metadata may be recorded in.
 
     Reference: https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
     """
 
-    NORSK_BOKMÅL = "nb"
+    NORSK_BOKMÅL = "nb"  # noqa: PLC2401 the listed problems do not apply in this case
     NORSK_NYNORSK = "nn"
     ENGLISH = "en"
+
+
+class EncryptionAlgorithm(str, Enum):
+    """Encryption algorithm values for pseudonymization algoprithms offered on Dapla."""
+
+    PAPIS_ENCRYPTION_ALGORITHM = "TINK-FPE"
+    DAEAD_ENCRYPTION_ALGORITHM = "TINK-DAEAD"
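Together with the constants added to utility/constants.py, the enum pins down the two Tink-based algorithms used in pseudonymization metadata. A sketch of how the pieces might combine; the payload shape is illustrative, not the library's actual API:

```python
from enum import Enum

class EncryptionAlgorithm(str, Enum):
    """Mirror of the enum added above."""
    PAPIS_ENCRYPTION_ALGORITHM = "TINK-FPE"
    DAEAD_ENCRYPTION_ALGORITHM = "TINK-DAEAD"

# Constants from the utility/constants.py hunk above.
DAEAD_ENCRYPTION_KEY_REFERENCE = "ssb-common-key-1"
ENCRYPTION_PARAMETER_KEY_ID = "keyId"

# Illustrative payload only; not the library's actual API.
pseudo_metadata = {
    "encryption_algorithm": EncryptionAlgorithm.DAEAD_ENCRYPTION_ALGORITHM.value,
    "encryption_key_reference": DAEAD_ENCRYPTION_KEY_REFERENCE,
    "encryption_algorithm_parameters": [
        {ENCRYPTION_PARAMETER_KEY_ID: DAEAD_ENCRYPTION_KEY_REFERENCE},
    ],
}
assert pseudo_metadata["encryption_algorithm"] == "TINK-DAEAD"
```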