dapla-toolbelt-metadata 0.2.1__py3-none-any.whl → 0.9.11__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they were published to their public registry, and is provided for informational purposes only.

This version of dapla-toolbelt-metadata has been flagged as potentially problematic.

Files changed (97)
  1. dapla_metadata/__init__.py +11 -1
  2. dapla_metadata/_shared/__init__.py +1 -0
  3. dapla_metadata/_shared/config.py +109 -0
  4. dapla_metadata/_shared/enums.py +27 -0
  5. dapla_metadata/_shared/py.typed +0 -0
  6. dapla_metadata/dapla/__init__.py +4 -0
  7. dapla_metadata/dapla/user_info.py +138 -0
  8. dapla_metadata/datasets/__init__.py +1 -1
  9. dapla_metadata/datasets/_merge.py +333 -0
  10. dapla_metadata/datasets/code_list.py +5 -6
  11. dapla_metadata/datasets/compatibility/__init__.py +10 -0
  12. dapla_metadata/datasets/compatibility/_handlers.py +363 -0
  13. dapla_metadata/datasets/compatibility/_utils.py +259 -0
  14. dapla_metadata/datasets/compatibility/model_backwards_compatibility.py +135 -0
  15. dapla_metadata/datasets/core.py +136 -182
  16. dapla_metadata/datasets/dapla_dataset_path_info.py +145 -19
  17. dapla_metadata/datasets/dataset_parser.py +41 -28
  18. dapla_metadata/datasets/model_validation.py +29 -20
  19. dapla_metadata/datasets/statistic_subject_mapping.py +5 -1
  20. dapla_metadata/datasets/utility/constants.py +22 -15
  21. dapla_metadata/datasets/utility/enums.py +8 -20
  22. dapla_metadata/datasets/utility/urn.py +234 -0
  23. dapla_metadata/datasets/utility/utils.py +183 -111
  24. dapla_metadata/standards/__init__.py +4 -0
  25. dapla_metadata/standards/name_validator.py +250 -0
  26. dapla_metadata/standards/standard_validators.py +98 -0
  27. dapla_metadata/standards/utils/__init__.py +1 -0
  28. dapla_metadata/standards/utils/constants.py +49 -0
  29. dapla_metadata/variable_definitions/__init__.py +11 -0
  30. dapla_metadata/variable_definitions/_generated/.openapi-generator/FILES +20 -0
  31. dapla_metadata/variable_definitions/_generated/.openapi-generator/VERSION +1 -0
  32. dapla_metadata/variable_definitions/_generated/.openapi-generator-ignore +6 -0
  33. dapla_metadata/variable_definitions/_generated/README.md +148 -0
  34. dapla_metadata/variable_definitions/_generated/__init__.py +0 -0
  35. dapla_metadata/variable_definitions/_generated/vardef_client/__init__.py +47 -0
  36. dapla_metadata/variable_definitions/_generated/vardef_client/api/__init__.py +8 -0
  37. dapla_metadata/variable_definitions/_generated/vardef_client/api/data_migration_api.py +766 -0
  38. dapla_metadata/variable_definitions/_generated/vardef_client/api/draft_variable_definitions_api.py +888 -0
  39. dapla_metadata/variable_definitions/_generated/vardef_client/api/patches_api.py +888 -0
  40. dapla_metadata/variable_definitions/_generated/vardef_client/api/validity_periods_api.py +583 -0
  41. dapla_metadata/variable_definitions/_generated/vardef_client/api/variable_definitions_api.py +613 -0
  42. dapla_metadata/variable_definitions/_generated/vardef_client/api_client.py +779 -0
  43. dapla_metadata/variable_definitions/_generated/vardef_client/api_response.py +27 -0
  44. dapla_metadata/variable_definitions/_generated/vardef_client/configuration.py +474 -0
  45. dapla_metadata/variable_definitions/_generated/vardef_client/docs/CompleteResponse.md +51 -0
  46. dapla_metadata/variable_definitions/_generated/vardef_client/docs/Contact.md +30 -0
  47. dapla_metadata/variable_definitions/_generated/vardef_client/docs/DataMigrationApi.md +90 -0
  48. dapla_metadata/variable_definitions/_generated/vardef_client/docs/Draft.md +42 -0
  49. dapla_metadata/variable_definitions/_generated/vardef_client/docs/DraftVariableDefinitionsApi.md +259 -0
  50. dapla_metadata/variable_definitions/_generated/vardef_client/docs/LanguageStringType.md +31 -0
  51. dapla_metadata/variable_definitions/_generated/vardef_client/docs/Owner.md +31 -0
  52. dapla_metadata/variable_definitions/_generated/vardef_client/docs/Patch.md +43 -0
  53. dapla_metadata/variable_definitions/_generated/vardef_client/docs/PatchesApi.md +249 -0
  54. dapla_metadata/variable_definitions/_generated/vardef_client/docs/PublicApi.md +218 -0
  55. dapla_metadata/variable_definitions/_generated/vardef_client/docs/SupportedLanguages.md +15 -0
  56. dapla_metadata/variable_definitions/_generated/vardef_client/docs/UpdateDraft.md +44 -0
  57. dapla_metadata/variable_definitions/_generated/vardef_client/docs/ValidityPeriod.md +42 -0
  58. dapla_metadata/variable_definitions/_generated/vardef_client/docs/ValidityPeriodsApi.md +236 -0
  59. dapla_metadata/variable_definitions/_generated/vardef_client/docs/VariableDefinitionsApi.md +304 -0
  60. dapla_metadata/variable_definitions/_generated/vardef_client/docs/VariableStatus.md +17 -0
  61. dapla_metadata/variable_definitions/_generated/vardef_client/exceptions.py +193 -0
  62. dapla_metadata/variable_definitions/_generated/vardef_client/models/__init__.py +31 -0
  63. dapla_metadata/variable_definitions/_generated/vardef_client/models/complete_response.py +260 -0
  64. dapla_metadata/variable_definitions/_generated/vardef_client/models/contact.py +94 -0
  65. dapla_metadata/variable_definitions/_generated/vardef_client/models/draft.py +228 -0
  66. dapla_metadata/variable_definitions/_generated/vardef_client/models/get_vardok_vardef_mapping_by_id200_response.py +158 -0
  67. dapla_metadata/variable_definitions/_generated/vardef_client/models/language_string_type.py +101 -0
  68. dapla_metadata/variable_definitions/_generated/vardef_client/models/owner.py +87 -0
  69. dapla_metadata/variable_definitions/_generated/vardef_client/models/patch.py +244 -0
  70. dapla_metadata/variable_definitions/_generated/vardef_client/models/problem.py +118 -0
  71. dapla_metadata/variable_definitions/_generated/vardef_client/models/update_draft.py +274 -0
  72. dapla_metadata/variable_definitions/_generated/vardef_client/models/validity_period.py +225 -0
  73. dapla_metadata/variable_definitions/_generated/vardef_client/models/vardok_id_response.py +81 -0
  74. dapla_metadata/variable_definitions/_generated/vardef_client/models/vardok_vardef_id_pair_response.py +84 -0
  75. dapla_metadata/variable_definitions/_generated/vardef_client/models/variable_status.py +33 -0
  76. dapla_metadata/variable_definitions/_generated/vardef_client/py.typed +0 -0
  77. dapla_metadata/variable_definitions/_generated/vardef_client/rest.py +249 -0
  78. dapla_metadata/variable_definitions/_utils/__init__.py +1 -0
  79. dapla_metadata/variable_definitions/_utils/_client.py +32 -0
  80. dapla_metadata/variable_definitions/_utils/config.py +54 -0
  81. dapla_metadata/variable_definitions/_utils/constants.py +80 -0
  82. dapla_metadata/variable_definitions/_utils/files.py +309 -0
  83. dapla_metadata/variable_definitions/_utils/template_files.py +99 -0
  84. dapla_metadata/variable_definitions/_utils/variable_definition_files.py +143 -0
  85. dapla_metadata/variable_definitions/exceptions.py +255 -0
  86. dapla_metadata/variable_definitions/vardef.py +372 -0
  87. dapla_metadata/variable_definitions/vardok_id.py +48 -0
  88. dapla_metadata/variable_definitions/vardok_vardef_id_pair.py +47 -0
  89. dapla_metadata/variable_definitions/variable_definition.py +422 -0
  90. {dapla_toolbelt_metadata-0.2.1.dist-info → dapla_toolbelt_metadata-0.9.11.dist-info}/METADATA +34 -36
  91. dapla_toolbelt_metadata-0.9.11.dist-info/RECORD +97 -0
  92. {dapla_toolbelt_metadata-0.2.1.dist-info → dapla_toolbelt_metadata-0.9.11.dist-info}/WHEEL +1 -1
  93. dapla_metadata/datasets/config.py +0 -80
  94. dapla_metadata/datasets/model_backwards_compatibility.py +0 -520
  95. dapla_metadata/datasets/user_info.py +0 -88
  96. dapla_toolbelt_metadata-0.2.1.dist-info/RECORD +0 -22
  97. {dapla_toolbelt_metadata-0.2.1.dist-info → dapla_toolbelt_metadata-0.9.11.dist-info/licenses}/LICENSE +0 -0

dapla_metadata/datasets/dapla_dataset_path_info.py

@@ -14,7 +14,7 @@ from typing import Literal
 
 import arrow
 from cloudpathlib import GSPath
-from datadoc_model.model import DataSetState
+from datadoc_model.all_optional.model import DataSetState
 
 if TYPE_CHECKING:
     import datetime
@@ -141,6 +141,9 @@ class SsbDateFormat(DateFormat):
 
         >>> SSB_BIMESTER.get_floor("2003B4")
         datetime.date(2003, 7, 1)
+
+        >>> SSB_BIMESTER.get_floor("2003-B4")
+        datetime.date(2003, 7, 1)
         """
         try:
             year = period_string[:4]
@@ -170,6 +173,9 @@ class SsbDateFormat(DateFormat):
 
         >>> SSB_HALF_YEAR.get_ceil("2024H1")
         datetime.date(2024, 6, 30)
+
+        >>> SSB_HALF_YEAR.get_ceil("2024-H1")
+        datetime.date(2024, 6, 30)
         """
         try:
             year = period_string[:4]
@@ -182,7 +188,7 @@ class SsbDateFormat(DateFormat):
 
 SSB_BIMESTER = SsbDateFormat(
     name="SSB_BIMESTER",
-    regex_pattern=r"^\d{4}[B]\d{1}$",
+    regex_pattern=r"^\d{4}-?[B]\d{1}$",
     arrow_pattern="YYYYMM",
     timeframe="month",
     ssb_dates={
@@ -215,7 +221,7 @@ SSB_BIMESTER = SsbDateFormat(
 
 SSB_QUARTERLY = SsbDateFormat(
     name="SSB_QUARTERLY",
-    regex_pattern=r"^\d{4}[Q]\d{1}$",
+    regex_pattern=r"^\d{4}-?[Q]\d{1}$",
    arrow_pattern="YYYYMM",
     timeframe="month",
     ssb_dates={
@@ -240,7 +246,7 @@ SSB_QUARTERLY = SsbDateFormat(
 
 SSB_TRIANNUAL = SsbDateFormat(
     name="SSB_TRIANNUAL",
-    regex_pattern=r"^\d{4}[T]\d{1}$",
+    regex_pattern=r"^\d{4}-?[T]\d{1}$",
     arrow_pattern="YYYYMM",
     timeframe="month",
     ssb_dates={
@@ -260,7 +266,7 @@ SSB_TRIANNUAL = SsbDateFormat(
 )
 SSB_HALF_YEAR = SsbDateFormat(
     name="SSB_HALF_YEAR",
-    regex_pattern=r"^\d{4}[H]\d{1}$",
+    regex_pattern=r"^\d{4}-?[H]\d{1}$",
     arrow_pattern="YYYYMM",
     timeframe="month",
     ssb_dates={
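
Note on the four pattern changes above: the only difference is the optional hyphen ("-?"), so both the compact and the hyphenated SSB period notation are accepted. A minimal standalone sketch (plain re, independent of the package) of the effect:

```python
import re

# Mirrors the updated SSB_HALF_YEAR pattern from the hunk above.
ssb_half_year = re.compile(r"^\d{4}-?[H]\d{1}$")

for candidate in ("2024H1", "2024-H1", "2024 H1"):
    print(candidate, bool(ssb_half_year.match(candidate)))
# 2024H1 True
# 2024-H1 True
# 2024 H1 False
```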
@@ -413,6 +419,9 @@ class DaplaDatasetPathInfo:
         >>> DaplaDatasetPathInfo._extract_period_strings(['p1990Q1', 'kommune', 'v1'])
         ['1990Q1']
 
+        >>> DaplaDatasetPathInfo._extract_period_strings(['p1990-Q1', 'kommune', 'v1'])
+        ['1990-Q1']
+
         >>> DaplaDatasetPathInfo._extract_period_strings(['varehandel','v1'])
         []
         """
@@ -469,7 +478,7 @@ class DaplaDatasetPathInfo:
         """Extract the bucket name from the dataset path.
 
         Returns:
-            The bucket name or None if the dataset path is not a GCS path.
+            The bucket name or None if the dataset path is not a GCS path nor ssb bucketeer path.
 
         Examples:
         >>> DaplaDatasetPathInfo('gs://ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_v2.parquet').bucket_name
@@ -483,17 +492,35 @@ class DaplaDatasetPathInfo:
 
         >>> DaplaDatasetPathInfo('ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_v2.parquet').bucket_name
         None
+
+        >>> DaplaDatasetPathInfo('ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_v2.parquet').bucket_name
+        None
+
+        >>> DaplaDatasetPathInfo('buckets/ssb-staging-dapla-felles-data-delt/stat/utdata/person_data_p2021_v2.parquet').bucket_name
+        ssb-staging-dapla-felles-data-delt
+
+        >>> DaplaDatasetPathInfo('buckets/ssb-staging-dapla-felles-data-delt/person_data_p2021_v2.parquet').bucket_name
+        ssb-staging-dapla-felles-data-delt
+
+        >>> DaplaDatasetPathInfo('home/work/buckets/ssb-staging-dapla-felles-produkt/stat/utdata/person_data_p2021_v2.parquet').bucket_name
+        ssb-staging-dapla-felles-produkt
         """
         prefix: str | None = None
-        if self.dataset_string.startswith(GSPath.cloud_prefix):
+        dataset_string = str(self.dataset_string)
+        if GSPath.cloud_prefix in self.dataset_string:
             prefix = GSPath.cloud_prefix
-        elif self.dataset_string.startswith(GS_PREFIX_FROM_PATHLIB):
+            _, bucket_and_rest = dataset_string.split(prefix, 1)
+        elif GS_PREFIX_FROM_PATHLIB in self.dataset_string:
             prefix = GS_PREFIX_FROM_PATHLIB
+            _, bucket_and_rest = self.dataset_string.split(prefix, 1)
+        elif "buckets/" in self.dataset_string:
+            prefix = "buckets/"
+            _, bucket_and_rest = self.dataset_string.split(prefix, 1)
         else:
             return None
 
         return pathlib.Path(
-            self.dataset_string.removeprefix(prefix),
+            bucket_and_rest,
         ).parts[0]
 
     @property
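
A standalone sketch of the new prefix handling in bucket_name above (plain str.split and pathlib, not the package itself): the prefix no longer has to start the string, it only has to occur somewhere in it, which is what makes the 'home/work/buckets/...' docstring example resolve.

```python
import pathlib

# Hypothetical local mount of a Dapla bucket, as in the docstring example above.
dataset_string = "home/work/buckets/ssb-staging-dapla-felles-produkt/stat/utdata/person_data_p2021_v2.parquet"

prefix = "buckets/"
if prefix in dataset_string:
    # Split once on the prefix and take the first path component after it.
    _, bucket_and_rest = dataset_string.split(prefix, 1)
    print(pathlib.Path(bucket_and_rest).parts[0])
# ssb-staging-dapla-felles-produkt
```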
@@ -519,6 +546,15 @@ class DaplaDatasetPathInfo:
 
         >>> DaplaDatasetPathInfo('my_data/simple_dataset_name.parquet').dataset_short_name
         simple_dataset_name
+
+        >>> DaplaDatasetPathInfo('gs:/ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_v2.parquet').dataset_short_name
+        person_data
+
+        >>> DaplaDatasetPathInfo('buckets/ssb-staging-dapla-felles-data-delt/stat/utdata/folk_data_p2021_v2.parquet').dataset_short_name
+        folk_data
+
+        >>> DaplaDatasetPathInfo('buckets/ssb-staging-dapla-felles-data-delt/stat/utdata/dapla/bus_p2021_v2.parquet').dataset_short_name
+        bus
         """
         if self.contains_data_from or self.contains_data_until:
             short_name_sections = self.dataset_name_sections[
@@ -586,9 +622,15 @@ class DaplaDatasetPathInfo:
         >>> DaplaDatasetPathInfo('klargjorte_data/person_data_v1.parquet').dataset_state
         <DataSetState.PROCESSED_DATA: 'PROCESSED_DATA'>
 
+        >>> DaplaDatasetPathInfo('klargjorte-data/person_data_v1.parquet').dataset_state
+        <DataSetState.PROCESSED_DATA: 'PROCESSED_DATA'>
+
         >>> DaplaDatasetPathInfo('utdata/min_statistikk/person_data_v1.parquet').dataset_state
         <DataSetState.OUTPUT_DATA: 'OUTPUT_DATA'>
 
+        >>> DaplaDatasetPathInfo('buckets/bucket_name/stat_name/inndata/min_statistikk/person_data_v1.parquet').dataset_state
+        <DataSetState.INPUT_DATA: 'INPUT_DATA'>
+
         >>> DaplaDatasetPathInfo('my_special_data/person_data_v1.parquet').dataset_state
         None
         """
@@ -620,6 +662,12 @@ class DaplaDatasetPathInfo:
 
         >>> DaplaDatasetPathInfo('person_data.parquet').dataset_version
         None
+
+        >>> DaplaDatasetPathInfo('buckets/bucket_name/stat_name/inndata/min_statistikk/person_data_v1.parquet').dataset_version
+        '1'
+
+        >>> DaplaDatasetPathInfo('buckets/bucket_name/stat_name/inndata/min_statistikk/person_data.parquet').dataset_version
+        None
         """
         minimum_elements_in_file_name: Final[int] = 2
         minimum_characters_in_version_string: Final[int] = 2
@@ -633,13 +681,37 @@ class DaplaDatasetPathInfo:
             return last_filename_element[1:]
         return None
 
+    def _get_left_parts(
+        self,
+        dataset_path_parts: list[str],
+        state_index: int,
+    ) -> list[str]:
+        """Retrieve the path parts before the dataset state, considering bucket prefixes."""
+        bucket_prefix = {"gs:", "buckets"}
+        left_parts = dataset_path_parts[:state_index]
+
+        # Stop checking beyond the bucket prefix
+        prefix_intersection = bucket_prefix & set(left_parts)
+        if prefix_intersection:
+            first_prefix = min(
+                left_parts.index(prefix) for prefix in prefix_intersection
+            )
+            left_parts = left_parts[first_prefix:]
+
+        return (
+            []
+            if left_parts == ["/"]
+            or (left_parts[0] in bucket_prefix and len(left_parts) <= 2)
+            else left_parts
+        )
+
     @property
     def statistic_short_name(
         self,
     ) -> str | None:
         """Extract the statistical short name from the filepath.
 
-        Extract the statistical short name from the filepath right before the
+        Extract the statistical short name from the filepath either after bucket name or right before the
         dataset state based on the Dapla filepath naming convention.
 
         Returns:
@@ -650,21 +722,75 @@ class DaplaDatasetPathInfo:
         >>> DaplaDatasetPathInfo('prosjekt/befolkning/klargjorte_data/person_data_v1.parquet').statistic_short_name
         befolkning
 
+        >>> DaplaDatasetPathInfo('buckets/prosjekt/befolkning/person_data_v1.parquet').statistic_short_name
+        befolkning
+
         >>> DaplaDatasetPathInfo('befolkning/inndata/person_data_v1.parquet').statistic_short_name
         befolkning
 
+        >>> DaplaDatasetPathInfo('buckets/bucket_name/stat_name/inndata/min_statistikk/person_data.parquet').statistic_short_name
+        stat_name
+
+        >>> DaplaDatasetPathInfo('buckets/stat_name/utdata/person_data.parquet').statistic_short_name
+        None
+
         >>> DaplaDatasetPathInfo('befolkning/person_data.parquet').statistic_short_name
         None
+
+        >>> DaplaDatasetPathInfo('buckets/produkt/befolkning/utdata/person_data.parquet').statistic_short_name
+        befolkning
+
+        >>> DaplaDatasetPathInfo('resources/buckets/produkt/befolkning/utdata/person_data.parquet').statistic_short_name
+        befolkning
+
+        >>> DaplaDatasetPathInfo('gs://statistikk/produkt/klargjorte-data/persondata_p1990-Q1_p2023-Q4_v1/aar=2019/data.parquet').statistic_short_name
+        produkt
+
+        >>> DaplaDatasetPathInfo('gs://statistikk/produkt/persondata_p1990-Q1_p2023-Q4_v1/aar=2019/data.parquet').statistic_short_name
+        None
+
+        >>> DaplaDatasetPathInfo('buckets/ssb-staging-dapla-felles-data-delt/person_data_p2021_v2.parquet').statistic_short_name
+        None
         """
-        dataset_state = self.dataset_state
-        if dataset_state is not None:
-            dataset_state_names = self._extract_norwegian_dataset_state_path_part(
-                dataset_state,
-            )
-            dataset_path_parts = list(self.dataset_path.parts)
-            for i in dataset_state_names:
-                if i in dataset_path_parts and dataset_path_parts.index(i) != 0:
-                    return dataset_path_parts[dataset_path_parts.index(i) - 1]
+        if not self.dataset_state:
+            if self.bucket_name:
+                parts = self.dataset_path.parent.parts
+
+                if self.bucket_name not in parts:
+                    return None
+
+                # Find the index of bucket_name in the path
+                bucket_name_index = self.dataset_path.parent.parts.index(
+                    self.bucket_name,
+                )
+
+                # If there are parts after bucket_name, return the part immediately after it
+                if len(self.dataset_path.parent.parts) > bucket_name_index + 1:
+                    return self.dataset_path.parent.parts[bucket_name_index + 1]
+
+            return None
+
+        dataset_state_names = self._extract_norwegian_dataset_state_path_part(
+            self.dataset_state,
+        )
+        dataset_path_parts = list(self.dataset_path.parts)
+
+        for state in dataset_state_names:
+            if state not in dataset_path_parts:
+                continue
+
+            index = dataset_path_parts.index(state)
+
+            if index == 0:
+                continue
+
+            left_parts = self._get_left_parts(dataset_path_parts, index)
+
+            if not left_parts:
+                return None
+
+            return dataset_path_parts[index - 1]
+
         return None
 
     def path_complies_with_naming_standard(self) -> bool:
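
Taken together, the dapla_dataset_path_info.py hunks teach DaplaDatasetPathInfo the Dapla Lab layout where buckets are mounted under a buckets/ prefix. A hedged usage sketch; the expected values are inferred from the docstring examples above and the path itself is illustrative:

```python
from dapla_metadata.datasets.dapla_dataset_path_info import DaplaDatasetPathInfo

# Illustrative Dapla Lab style path using the "buckets/" prefix handled in this release.
info = DaplaDatasetPathInfo(
    "buckets/ssb-staging-dapla-felles-data-delt/befolkning/utdata/person_data_p2021_v2.parquet",
)

print(info.bucket_name)           # ssb-staging-dapla-felles-data-delt
print(info.statistic_short_name)  # befolkning
print(info.dataset_short_name)    # person_data
print(info.dataset_version)       # 2 (returned as a string)
```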

dapla_metadata/datasets/dataset_parser.py

@@ -5,18 +5,17 @@ Handles reading in the data and transforming data types to generic metadata type
 
 from __future__ import annotations
 
-import pathlib  # noqa: TCH003 import is needed for docs build
+import pathlib  # noqa: TC003 import is needed for docs build
 import re
-import typing as t
 from abc import ABC
 from abc import abstractmethod
 from typing import TYPE_CHECKING
 
 import pandas as pd
-from datadoc_model.model import DataType
-from datadoc_model.model import LanguageStringType
-from datadoc_model.model import LanguageStringTypeItem
-from datadoc_model.model import Variable
+from datadoc_model.all_optional.model import DataType
+from datadoc_model.all_optional.model import LanguageStringType
+from datadoc_model.all_optional.model import LanguageStringTypeItem
+from datadoc_model.all_optional.model import Variable
 from pyarrow import parquet as pq
 
 from dapla_metadata.datasets.utility.enums import SupportedLanguages
@@ -56,6 +55,8 @@ KNOWN_FLOAT_TYPES = (
 
 KNOWN_STRING_TYPES = (
     "string",
+    "string[pyarrow]",
+    "large_string",
     "str",
     "char",
     "varchar",
@@ -67,13 +68,18 @@ KNOWN_STRING_TYPES = (
 
 KNOWN_DATETIME_TYPES = (
     "timestamp",
+    "timestamp[s]",
+    "timestamp[ms]",
     "timestamp[us]",
     "timestamp[ns]",
+    "datetime",
     "datetime64",
-    " datetime64[ns]",
-    " datetime64[us]",
+    "datetime64[s]",
+    "datetime64[ms]",
+    "datetime64[us]",
+    "datetime64[ns]",
     "date",
-    "datetime",
+    "date32[day]",
     "time",
 )
 
@@ -89,9 +95,7 @@ TYPE_CORRESPONDENCE: list[tuple[tuple[str, ...], DataType]] = [
 ]
 TYPE_MAP: dict[str, DataType] = {}
 for concrete_type, abstract_type in TYPE_CORRESPONDENCE:
-    TYPE_MAP.update({c: abstract_type for c in concrete_type})
-
-TDatasetParser = t.TypeVar("TDatasetParser", bound="DatasetParser")
+    TYPE_MAP.update(dict.fromkeys(concrete_type, abstract_type))
 
 
 class DatasetParser(ABC):
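
For context on the TYPE_MAP change above: dict.fromkeys maps every concrete type name in a tuple to the same abstract DataType, replacing the earlier dict comprehension. An illustrative sketch with plain strings standing in for the DataType enum:

```python
# Illustrative only: plain strings stand in for the DataType enum values.
TYPE_CORRESPONDENCE = [
    (("int", "int64", "int32"), "INTEGER"),
    (("string", "str", "varchar"), "STRING"),
]

TYPE_MAP: dict[str, str] = {}
for concrete_type, abstract_type in TYPE_CORRESPONDENCE:
    # Every concrete name maps to the same abstract type.
    TYPE_MAP.update(dict.fromkeys(concrete_type, abstract_type))

print(TYPE_MAP["int64"])  # INTEGER
```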
@@ -112,31 +116,23 @@ class DatasetParser(ABC):
     @staticmethod
     def for_file(dataset: pathlib.Path | CloudPath) -> DatasetParser:
         """Return the correct subclass based on the given dataset file."""
-        supported_file_types: dict[
-            str,
-            type[DatasetParser],
-        ] = {
-            ".parquet": DatasetParserParquet,
-            ".sas7bdat": DatasetParserSas7Bdat,
-            ".parquet.gzip": DatasetParserParquet,
-        }
         file_type = "Unknown"
         try:
             file_type = dataset.suffix
             # Gzipped parquet files can be read with DatasetParserParquet
-            match = re.search(r"(.parquet.gzip)", str(dataset).lower())
-            file_type = ".parquet.gzip" if match else file_type
-            # Extract the appropriate reader class from the SUPPORTED_FILE_TYPES dict and return an instance of it
-            reader = supported_file_types[file_type](dataset)
+            match = re.search(PARQUET_GZIP_FILE_SUFFIX, str(dataset).lower())
+            file_type = PARQUET_GZIP_FILE_SUFFIX if match else file_type
+            # Extract the appropriate reader class from the SUPPORTED_FILE_TYPES dict
+            reader = SUPPORTED_DATASET_FILE_SUFFIXES[file_type](dataset)
         except IndexError as e:
             # Thrown when just one element is returned from split, meaning there is no file extension supplied
-            msg = f"Could not recognise file type for provided {dataset = }. Supported file types are: {', '.join(supported_file_types.keys())}"
+            msg = f"Could not recognise file type for provided {dataset = }. Supported file types are: {', '.join(SUPPORTED_DATASET_FILE_SUFFIXES.keys())}"
             raise FileNotFoundError(
                 msg,
             ) from e
         except KeyError as e:
             # In this case the file type is not supported, so we throw a helpful exception
-            msg = f"{file_type = } is not supported. Please open one of the following supported files types: {', '.join(supported_file_types.keys())} or contact the maintainers to request support."
+            msg = f"{file_type = } is not supported. Please open one of the following supported files types: {', '.join(SUPPORTED_DATASET_FILE_SUFFIXES.keys())} or contact the maintainers to request support."
             raise NotImplementedError(
                 msg,
             ) from e
@@ -157,6 +153,9 @@ class DatasetParser(ABC):
 
         Arguments:
             data_type: The concrete data type to map.
+
+        Returns:
+            The abstract data type or None
         """
         return TYPE_MAP.get(data_type.lower(), None)
 
@@ -179,11 +178,11 @@ class DatasetParserParquet(DatasetParser):
     def get_fields(self) -> list[Variable]:
         """Extract the fields from this dataset."""
         with self.dataset.open(mode="rb") as f:
-            schema: pa.Schema = pq.read_schema(f)  # type: ignore [arg-type]
+            schema: pa.Schema = pq.read_schema(f)  # type: ignore [arg-type, assignment]
         return [
             Variable(
                 short_name=data_field.name.strip(),
-                data_type=self.transform_data_type(str(data_field.type)),
+                data_type=self.transform_data_type(str(data_field.type)),  # type: ignore [attr-defined]
             )
             for data_field in schema
             if data_field.name
@@ -239,3 +238,17 @@ class DatasetParserSas7Bdat(DatasetParser):
         )
 
         return fields
+
+
+PARQUET_FILE_SUFFIX = ".parquet"
+PARQUET_GZIP_FILE_SUFFIX = ".parquet.gzip"
+SAS7BDAT_FILE_SUFFIX = ".sas7bdat"
+
+SUPPORTED_DATASET_FILE_SUFFIXES: dict[
+    str,
+    type[DatasetParser],
+] = {
+    PARQUET_FILE_SUFFIX: DatasetParserParquet,
+    PARQUET_GZIP_FILE_SUFFIX: DatasetParserParquet,
+    SAS7BDAT_FILE_SUFFIX: DatasetParserSas7Bdat,
+}
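
A hedged usage sketch of the parser factory after these changes; the class and method names come from the hunks above, while the file name is hypothetical and must exist on disk for get_fields() to succeed:

```python
import pathlib

from dapla_metadata.datasets.dataset_parser import DatasetParser

# for_file picks the reader from SUPPORTED_DATASET_FILE_SUFFIXES based on the suffix:
# .parquet and .parquet.gzip map to DatasetParserParquet, .sas7bdat to DatasetParserSas7Bdat.
parser = DatasetParser.for_file(pathlib.Path("person_data_p2021_v2.parquet"))

for variable in parser.get_fields():
    print(variable.short_name, variable.data_type)
```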

dapla_metadata/datasets/model_validation.py

@@ -13,20 +13,19 @@ from typing_extensions import Self
 
 from dapla_metadata.datasets.utility.constants import DATE_VALIDATION_MESSAGE
 from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_DATASET_FIELDS
-from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_VARIABLES_FIELDS
 from dapla_metadata.datasets.utility.constants import OBLIGATORY_METADATA_WARNING
 from dapla_metadata.datasets.utility.utils import get_missing_obligatory_dataset_fields
 from dapla_metadata.datasets.utility.utils import (
     get_missing_obligatory_variables_fields,
 )
+from dapla_metadata.datasets.utility.utils import (
+    get_missing_obligatory_variables_pseudo_fields,
+)
 from dapla_metadata.datasets.utility.utils import get_timestamp_now
 from dapla_metadata.datasets.utility.utils import incorrect_date_order
 from dapla_metadata.datasets.utility.utils import (
     num_obligatory_dataset_fields_completed,
 )
-from dapla_metadata.datasets.utility.utils import (
-    num_obligatory_variables_fields_completed,
-)
 from dapla_metadata.datasets.utility.utils import set_variables_inherit_from_dataset
 
 if TYPE_CHECKING:
@@ -146,21 +145,31 @@ class ValidateDatadocMetadata(model.DatadocMetadata):
             ObligatoryVariableWarning: If not all obligatory variable metadata fields
                 are filled in.
         """
-        if self.variables is not None and num_obligatory_variables_fields_completed(
-            self.variables,
-        ) != (NUM_OBLIGATORY_VARIABLES_FIELDS * len(self.variables)):
-            warnings.warn(
-                f"{OBLIGATORY_METADATA_WARNING} {get_missing_obligatory_variables_fields(self.variables)}",
-                ObligatoryVariableWarning,
-                stacklevel=2,
-            )
-            logger.warning(
-                "Type warning: %s.%s %s",
-                ObligatoryVariableWarning,
-                OBLIGATORY_METADATA_WARNING,
-                get_missing_obligatory_variables_fields(self.variables),
-            )
-
+        if self.variables is not None:
+            missing_fields_dict = {}
+            for d in get_missing_obligatory_variables_fields(self.variables):
+                for var, fields in d.items():
+                    missing_fields_dict[var] = fields.copy()
+
+            for d in get_missing_obligatory_variables_pseudo_fields(self.variables):
+                for var, fields in d.items():
+                    if var in missing_fields_dict:
+                        missing_fields_dict[var].extend(fields)
+                    else:
+                        missing_fields_dict[var] = fields.copy()
+
+            missing_fields = [
+                {var: fields} for var, fields in missing_fields_dict.items()
+            ]
+            if missing_fields:
+                message = f"{OBLIGATORY_METADATA_WARNING} {missing_fields}"
+                warnings.warn(message, ObligatoryVariableWarning, stacklevel=2)
+                logger.warning(
+                    "Type warning: %s.%s %s",
+                    ObligatoryVariableWarning,
+                    OBLIGATORY_METADATA_WARNING,
+                    missing_fields,
+                )
         return self
 
 
@@ -176,7 +185,7 @@ class ObligatoryVariableWarning(UserWarning):
     """Custom warning for checking obligatory metadata for variables."""
 
 
-def custom_warning_handler(  # noqa: PLR0913 remove fields causes incompatible types
+def custom_warning_handler(
     message: Warning | str,
     category: type[Warning],
     filename: str,
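
The validator hunk above now merges two per-variable reports, the ordinary obligatory fields and the new pseudonymization fields, into one list before emitting a single ObligatoryVariableWarning. A standalone sketch of that merge; the variable names are hypothetical, the field names come from the constants further down, and setdefault is used here for brevity where the package code uses an explicit if/else:

```python
# Each helper returns a list of {variable_short_name: [missing_field, ...]} dicts.
regular = [{"pers_id": ["data_source"]}, {"saldo": ["unit_type"]}]
pseudo = [{"pers_id": ["encryption_algorithm", "encryption_key_reference"]}]

merged: dict[str, list[str]] = {}
for report in regular + pseudo:
    for variable, fields in report.items():
        merged.setdefault(variable, []).extend(fields)

missing_fields = [{variable: fields} for variable, fields in merged.items()]
print(missing_fields)
# [{'pers_id': ['data_source', 'encryption_algorithm', 'encryption_key_reference']},
#  {'saldo': ['unit_type']}]
```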

dapla_metadata/datasets/statistic_subject_mapping.py

@@ -140,7 +140,11 @@ class StatisticSubjectMapping(GetExternalSource):
             SecondarySubject(
                 self._extract_titles(s.titler),
                 s["emnekode"],
-                [statistikk["kortnavn"] for statistikk in s.find_all("Statistikk")],
+                [
+                    statistikk["kortnavn"]
+                    for statistikk in s.find_all("Statistikk")
+                    if statistikk["isPrimaerPlassering"] == "true"
+                ],
             )
             for s in p.find_all("delemne")
         ]

dapla_metadata/datasets/utility/constants.py

@@ -1,7 +1,7 @@
 """Repository for constant values in Datadoc backend."""
 
-from datadoc_model.model import LanguageStringType
-from datadoc_model.model import LanguageStringTypeItem
+from datadoc_model.all_optional.model import LanguageStringType
+from datadoc_model.all_optional.model import LanguageStringTypeItem
 
 VALIDATION_ERROR = "Validation error: "
 
@@ -9,7 +9,7 @@ DATE_VALIDATION_MESSAGE = f"{VALIDATION_ERROR}contains_data_from must be the sam
 
 OBLIGATORY_METADATA_WARNING = "Obligatory metadata is missing: "
 
-INCONSISTENCIES_MESSAGE = "Inconsistencies found between extracted and existing metadata. Inconsistencies are:"
+INCONSISTENCIES_MESSAGE = "Inconsistencies found between extracted and existing metadata! This usually means that the new dataset has a different structure and that the version number should be incremented.\nDetails:"
 
 OBLIGATORY_DATASET_METADATA_IDENTIFIERS: list = [
     "assessment",
@@ -17,12 +17,9 @@ OBLIGATORY_DATASET_METADATA_IDENTIFIERS: list = [
     "dataset_status",
     "name",
     "description",
-    "data_source",
     "population_description",
     "version",
     "version_description",
-    "unit_type",
-    "temporality_type",
     "subject_field",
     "spatial_coverage_description",
     "owner",
@@ -44,8 +41,18 @@ OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS = [
     "data_type",
     "variable_role",
     "is_personal_data",
+    "unit_type",
+    "population_description",
+    "data_source",
+    "temporality_type",
+]
+
+OBLIGATORY_VARIABLES_PSEUDONYMIZATION_IDENTIFIERS = [
+    "encryption_algorithm",
+    "encryption_key_reference",
 ]
 
+
 OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS_MULTILANGUAGE = [
     "name",
 ]
@@ -75,22 +82,22 @@ DATASET_FIELDS_FROM_EXISTING_METADATA = [
     "dataset_status",
     "name",
     "description",
-    "data_source",
     "population_description",
-    "unit_type",
-    "temporality_type",
     "subject_field",
     "keyword",
     "spatial_coverage_description",
-    "contains_personal_data",
-    "use_restriction",
-    "use_restriction_date",
+    "use_restrictions",
     "custom_type",
     "owner",
+    "version_description",
 ]
 
 METADATA_DOCUMENT_FILE_SUFFIX = "__DOC.json"
 
-DATADOC_STATISTICAL_SUBJECT_SOURCE_URL = (
-    "https://www.ssb.no/xp/_/service/mimir/subjectStructurStatistics"
-)
+PAPIS_STABLE_IDENTIFIER_TYPE = "FREG_SNR"
+PAPIS_ENCRYPTION_KEY_REFERENCE = "papis-common-key-1"
+DAEAD_ENCRYPTION_KEY_REFERENCE = "ssb-common-key-1"
+ENCRYPTION_PARAMETER_SNAPSHOT_DATE = "snapshotDate"
+ENCRYPTION_PARAMETER_KEY_ID = "keyId"
+ENCRYPTION_PARAMETER_STRATEGY = "strategy"
+ENCRYPTION_PARAMETER_STRATEGY_SKIP = "skip"

dapla_metadata/datasets/utility/enums.py

@@ -5,31 +5,19 @@ from __future__ import annotations
 from enum import Enum
 
 
-class DaplaRegion(str, Enum):
-    """Dapla platforms/regions."""
-
-    DAPLA_LAB = "DAPLA_LAB"
-    BIP = "BIP"
-    ON_PREM = "ON_PREM"
-    CLOUD_RUN = "CLOUD_RUN"
-
-
-class DaplaService(str, Enum):
-    """Dapla services."""
-
-    DATADOC = "DATADOC"
-    JUPYTERLAB = "JUPYTERLAB"
-    VS_CODE = "VS_CODE"
-    R_STUDIO = "R_STUDIO"
-    KILDOMATEN = "KILDOMATEN"
-
-
 class SupportedLanguages(str, Enum):
     """The list of languages metadata may be recorded in.
 
     Reference: https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
     """
 
-    NORSK_BOKMÅL = "nb"
+    NORSK_BOKMÅL = "nb"  # noqa: PLC2401 the listed problems do not apply in this case
     NORSK_NYNORSK = "nn"
     ENGLISH = "en"
+
+
+class EncryptionAlgorithm(str, Enum):
+    """Encryption algorithm values for pseudonymization algoprithms offered on Dapla."""
+
+    PAPIS_ENCRYPTION_ALGORITHM = "TINK-FPE"
+    DAEAD_ENCRYPTION_ALGORITHM = "TINK-DAEAD"
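
A brief hedged sketch of the new EncryptionAlgorithm enum; the import path follows the file list above, and pairing the algorithms with the key-reference constants in utility/constants.py is suggested by the naming only:

```python
from dapla_metadata.datasets.utility.enums import EncryptionAlgorithm

# str-backed enum members compare equal to their string values.
assert EncryptionAlgorithm.PAPIS_ENCRYPTION_ALGORITHM == "TINK-FPE"
assert EncryptionAlgorithm("TINK-DAEAD") is EncryptionAlgorithm.DAEAD_ENCRYPTION_ALGORITHM

print([member.value for member in EncryptionAlgorithm])
# ['TINK-FPE', 'TINK-DAEAD']
```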