dapla-toolbelt-metadata 0.4.2__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dapla_metadata/__init__.py +9 -0
- dapla_metadata/dapla/__init__.py +4 -0
- dapla_metadata/{_shared → dapla}/user_info.py +66 -20
- dapla_metadata/datasets/code_list.py +1 -1
- dapla_metadata/datasets/core.py +1 -1
- dapla_metadata/datasets/dapla_dataset_path_info.py +128 -14
- dapla_metadata/datasets/dataset_parser.py +21 -15
- dapla_metadata/datasets/model_backwards_compatibility.py +6 -6
- dapla_metadata/datasets/model_validation.py +2 -2
- dapla_metadata/datasets/utility/constants.py +1 -0
- dapla_metadata/datasets/utility/enums.py +1 -1
- dapla_metadata/datasets/utility/utils.py +8 -12
- dapla_metadata/standards/__init__.py +4 -0
- dapla_metadata/standards/name_validator.py +250 -0
- dapla_metadata/standards/standard_validators.py +98 -0
- dapla_metadata/standards/utils/__init__.py +1 -0
- dapla_metadata/standards/utils/constants.py +49 -0
- dapla_metadata/variable_definitions/__init__.py +5 -3
- dapla_metadata/variable_definitions/{generated → _generated}/.openapi-generator/FILES +0 -5
- dapla_metadata/variable_definitions/_generated/.openapi-generator/VERSION +1 -0
- dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/__init__.py +0 -5
- dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/api/__init__.py +0 -1
- dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/api/data_migration_api.py +2 -2
- dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/api/draft_variable_definitions_api.py +14 -14
- dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/api/patches_api.py +15 -15
- dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/api/validity_periods_api.py +8 -281
- dapla_metadata/variable_definitions/{generated/vardef_client/api/public_api.py → _generated/vardef_client/api/variable_definitions_api.py} +73 -358
- dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/models/__init__.py +2 -6
- dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/models/complete_response.py +8 -32
- dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/models/contact.py +2 -2
- dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/models/draft.py +8 -23
- dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/models/language_string_type.py +7 -6
- dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/models/owner.py +2 -2
- dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/models/patch.py +16 -61
- dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/models/problem.py +2 -2
- dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/models/update_draft.py +22 -55
- dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/models/validity_period.py +14 -48
- dapla_metadata/variable_definitions/_generated/vardef_client/models/variable_status.py +33 -0
- dapla_metadata/variable_definitions/_utils/__init__.py +1 -0
- dapla_metadata/variable_definitions/{_client.py → _utils/_client.py} +5 -3
- dapla_metadata/variable_definitions/{config.py → _utils/config.py} +25 -1
- dapla_metadata/variable_definitions/_utils/constants.py +41 -0
- dapla_metadata/variable_definitions/_utils/descriptions.py +89 -0
- dapla_metadata/variable_definitions/_utils/files.py +273 -0
- dapla_metadata/variable_definitions/_utils/template_files.py +112 -0
- dapla_metadata/variable_definitions/_utils/variable_definition_files.py +93 -0
- dapla_metadata/variable_definitions/exceptions.py +141 -11
- dapla_metadata/variable_definitions/resources/vardef_model_descriptions_nb.yaml +63 -0
- dapla_metadata/variable_definitions/vardef.py +131 -10
- dapla_metadata/variable_definitions/variable_definition.py +251 -43
- {dapla_toolbelt_metadata-0.4.2.dist-info → dapla_toolbelt_metadata-0.6.0.dist-info}/METADATA +8 -10
- dapla_toolbelt_metadata-0.6.0.dist-info/RECORD +89 -0
- {dapla_toolbelt_metadata-0.4.2.dist-info → dapla_toolbelt_metadata-0.6.0.dist-info}/WHEEL +1 -1
- dapla_metadata/variable_definitions/generated/.openapi-generator/VERSION +0 -1
- dapla_metadata/variable_definitions/generated/vardef_client/api/variable_definitions_api.py +0 -1205
- dapla_metadata/variable_definitions/generated/vardef_client/models/klass_reference.py +0 -99
- dapla_metadata/variable_definitions/generated/vardef_client/models/rendered_contact.py +0 -92
- dapla_metadata/variable_definitions/generated/vardef_client/models/rendered_variable_definition.py +0 -235
- dapla_metadata/variable_definitions/generated/vardef_client/models/supported_languages.py +0 -33
- dapla_metadata/variable_definitions/generated/vardef_client/models/variable_status.py +0 -33
- dapla_toolbelt_metadata-0.4.2.dist-info/RECORD +0 -81
- /dapla_metadata/variable_definitions/{generated → _generated}/.openapi-generator-ignore +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/README.md +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/__init__.py +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/api_client.py +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/api_response.py +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/configuration.py +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/docs/CompleteResponse.md +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/docs/Contact.md +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/docs/DataMigrationApi.md +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/docs/Draft.md +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/docs/DraftVariableDefinitionsApi.md +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/docs/LanguageStringType.md +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/docs/Owner.md +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/docs/Patch.md +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/docs/PatchesApi.md +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/docs/PublicApi.md +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/docs/SupportedLanguages.md +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/docs/UpdateDraft.md +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/docs/ValidityPeriod.md +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/docs/ValidityPeriodsApi.md +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/docs/VariableDefinitionsApi.md +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/docs/VariableStatus.md +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/exceptions.py +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/py.typed +0 -0
- /dapla_metadata/variable_definitions/{generated → _generated}/vardef_client/rest.py +0 -0
- {dapla_toolbelt_metadata-0.4.2.dist-info → dapla_toolbelt_metadata-0.6.0.dist-info}/LICENSE +0 -0
dapla_metadata/__init__.py
CHANGED
@@ -1,6 +1,15 @@
 """Tools and clients for working with the Dapla Metadata system."""
 
+import warnings
+
+warnings.filterwarnings(
+    "ignore",
+    message="As the c extension couldn't be imported, `google-crc32c` is using a pure python implementation that is significantly slower.",
+)
+
 import datadoc_model.model as datadoc_model
 
+from . import dapla
 from . import datasets
+from . import standards
 from . import variable_definitions
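The new package-level filter suppresses a single third-party warning before any submodule import can trigger it. Note that `warnings.filterwarnings` treats `message` as a regular expression matched against the start of the warning text, which is why passing the full literal message works. A minimal standalone sketch of the same pattern (the second warning is illustrative):

import warnings

# Ignore only the google-crc32c fallback notice; all other warnings still surface.
warnings.filterwarnings(
    "ignore",
    message="As the c extension couldn't be imported",
)

warnings.warn("an unrelated warning", stacklevel=2)  # still shown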
dapla_metadata/{_shared → dapla}/user_info.py
RENAMED
@@ -13,9 +13,6 @@ from dapla_metadata._shared.enums import DaplaService
 logger = logging.getLogger(__name__)
 
 
-PLACEHOLDER_EMAIL_ADDRESS = "default_user@ssb.no"
-
-
 class UserInfo(Protocol):
     """Information about the current user.
 
@@ -27,6 +24,16 @@ class UserInfo(Protocol):
         """Get the short email address."""
         ...
 
+    @property
+    def current_group(self) -> str:
+        """Get the group which the user is currently representing."""
+        ...
+
+    @property
+    def current_team(self) -> str:
+        """Get the team which the user is currently representing."""
+        ...
+
 
 class UnknownUserInfo:
     """Fallback when no implementation is found."""
@@ -36,14 +43,38 @@ class UnknownUserInfo:
         """Unknown email address."""
         return None
 
+    @property
+    def current_group(self) -> str:
+        """Get the group which the user is currently representing."""
+        return ""
+
+    @property
+    def current_team(self) -> str:
+        """Get the team which the user is currently representing."""
+        return ""
+
 
 class TestUserInfo:
     """Information about the current user for local development and testing."""
 
+    PLACEHOLDER_EMAIL_ADDRESS = "default_user@ssb.no"
+    PLACEHOLDER_GROUP = "default-team-developers"
+    PLACEHOLDER_TEAM = "default-team"
+
     @property
     def short_email(self) -> str | None:
         """Get the short email address."""
-        return PLACEHOLDER_EMAIL_ADDRESS
+        return TestUserInfo.PLACEHOLDER_EMAIL_ADDRESS
+
+    @property
+    def current_group(self) -> str | None:
+        """Get the group which the user is currently representing."""
+        return TestUserInfo.PLACEHOLDER_GROUP
+
+    @property
+    def current_team(self) -> str | None:
+        """Get the team which the user is currently representing."""
+        return TestUserInfo.PLACEHOLDER_TEAM
 
 
 class DaplaLabUserInfo:
@@ -65,6 +96,19 @@ class DaplaLabUserInfo:
         )
         return None
 
+    @property
+    def current_group(self) -> str:
+        """Get the group which the user is currently representing."""
+        if group := config.get_dapla_group_context():
+            return group
+        msg = "DAPLA_GROUP_CONTEXT environment variable not found"
+        raise OSError(msg)
+
+    @property
+    def current_team(self) -> str:
+        """Get the team which the user is currently representing."""
+        return parse_team_name(self.current_group)
+
 
 class JupyterHubUserInfo:
     """Information about the current user when running on JupyterHub."""
@@ -74,6 +118,16 @@ class JupyterHubUserInfo:
         """Get the short email address."""
         return config.get_jupyterhub_user()
 
+    @property
+    def current_group(self) -> str:
+        """Get the group which the user is currently representing."""
+        raise NotImplementedError
+
+    @property
+    def current_team(self) -> str:
+        """Get the team which the user is currently representing."""
+        raise NotImplementedError
+
 
 def get_user_info_for_current_platform() -> UserInfo:
     """Return the correct implementation of UserInfo for the current platform."""
@@ -88,28 +142,20 @@ def get_user_info_for_current_platform() -> UserInfo:
     return UnknownUserInfo()
 
 
-def get_owner() -> str:
-    """Returns the owner read from the GROUP_CONTEXT environment variable."""
-    if group := config.get_dapla_group_context():
-        return parse_team_name(group)
-    msg = "DAPLA_GROUP_CONTEXT environment variable not found"
-    raise OSError(msg)
-
-
 def parse_team_name(group: str) -> str:
     """Parses the group to get the current team.
 
-    >>> parse_team_name(dapla-metadata-developers)
-    dapla-metadata
+    >>> parse_team_name("dapla-metadata-developers")
+    'dapla-metadata'
 
-    >>> parse_team_name(dapla-metadata-data-admins)
-    dapla-metadata
+    >>> parse_team_name("dapla-metadata-data-admins")
+    'dapla-metadata'
 
-    >>> parse_team_name(dapla-metadata)
-    dapla
+    >>> parse_team_name("dapla-metadata")
+    'dapla'
 
-    >>> parse_team_name(dapla-metadata-not-real-name)
-    dapla-metadata-not-real
+    >>> parse_team_name("dapla-metadata-not-real-name")
+    'dapla-metadata-not-real'
     """
     parts = group.split("-")
     return "-".join(parts[:-2] if group.endswith("data-admins") else parts[:-1])
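The old doctests in `parse_team_name` passed unquoted arguments (a NameError when doctest evaluates them) and expected unquoted string output, which can never match a string's repr; the rewrite quotes both. A standalone copy of the function as it stands after this change, runnable as-is:

def parse_team_name(group: str) -> str:
    """Strip the access-role suffix from a Dapla group name.

    "data-admins" is a two-part suffix; every other role is one part.
    """
    parts = group.split("-")
    return "-".join(parts[:-2] if group.endswith("data-admins") else parts[:-1])


assert parse_team_name("dapla-metadata-developers") == "dapla-metadata"
assert parse_team_name("dapla-metadata-data-admins") == "dapla-metadata"

Note also that the module-level `get_owner()` is gone; its callers move to `get_user_info_for_current_platform().current_team`, which `DaplaLabUserInfo` backs with the `DAPLA_GROUP_CONTEXT` environment variable.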
dapla_metadata/datasets/core.py
CHANGED
@@ -14,7 +14,7 @@ from datadoc_model import model
 from datadoc_model.model import DataSetStatus
 
 from dapla_metadata._shared import config
-from dapla_metadata._shared import user_info
+from dapla_metadata.dapla import user_info
 from dapla_metadata.datasets.dapla_dataset_path_info import DaplaDatasetPathInfo
 from dapla_metadata.datasets.dataset_parser import DatasetParser
 from dapla_metadata.datasets.model_backwards_compatibility import (
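`user_info` moves from the private `_shared` package to the new public `dapla` subpackage, and downstream imports change with it. A before/after sketch (the 0.4.2 path is inferred from the rename entry in the file list above):

# 0.4.2 (old, inferred): from dapla_metadata._shared import user_info
from dapla_metadata.dapla import user_info  # 0.6.0

team = user_info.get_user_info_for_current_platform().current_team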
dapla_metadata/datasets/dapla_dataset_path_info.py
CHANGED
@@ -478,7 +478,7 @@ class DaplaDatasetPathInfo:
         """Extract the bucket name from the dataset path.
 
         Returns:
-            The bucket name or None if the dataset path is not a GCS path.
+            The bucket name or None if the dataset path is not a GCS path nor ssb bucketeer path.
 
         Examples:
         >>> DaplaDatasetPathInfo('gs://ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_v2.parquet').bucket_name
@@ -492,17 +492,35 @@ class DaplaDatasetPathInfo:
 
         >>> DaplaDatasetPathInfo('ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_v2.parquet').bucket_name
         None
+
+        >>> DaplaDatasetPathInfo('ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_v2.parquet').bucket_name
+        None
+
+        >>> DaplaDatasetPathInfo('buckets/ssb-staging-dapla-felles-data-delt/stat/utdata/person_data_p2021_v2.parquet').bucket_name
+        ssb-staging-dapla-felles-data-delt
+
+        >>> DaplaDatasetPathInfo('buckets/ssb-staging-dapla-felles-data-delt/person_data_p2021_v2.parquet').bucket_name
+        ssb-staging-dapla-felles-data-delt
+
+        >>> DaplaDatasetPathInfo('home/work/buckets/ssb-staging-dapla-felles-produkt/stat/utdata/person_data_p2021_v2.parquet').bucket_name
+        ssb-staging-dapla-felles-produkt
         """
         prefix: str | None = None
-
+        dataset_string = str(self.dataset_string)
         if GSPath.cloud_prefix in self.dataset_string:
             prefix = GSPath.cloud_prefix
-
+            _, bucket_and_rest = dataset_string.split(prefix, 1)
         elif GS_PREFIX_FROM_PATHLIB in self.dataset_string:
             prefix = GS_PREFIX_FROM_PATHLIB
+            _, bucket_and_rest = self.dataset_string.split(prefix, 1)
+        elif "buckets/" in self.dataset_string:
+            prefix = "buckets/"
+            _, bucket_and_rest = self.dataset_string.split(prefix, 1)
         else:
             return None
 
         return pathlib.Path(
-            [removed line not preserved in this view]
+            bucket_and_rest,
         ).parts[0]
@@ -528,6 +546,15 @@ class DaplaDatasetPathInfo:
 
         >>> DaplaDatasetPathInfo('my_data/simple_dataset_name.parquet').dataset_short_name
         simple_dataset_name
+
+        >>> DaplaDatasetPathInfo('gs:/ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_v2.parquet').dataset_short_name
+        person_data
+
+        >>> DaplaDatasetPathInfo('buckets/ssb-staging-dapla-felles-data-delt/stat/utdata/folk_data_p2021_v2.parquet').dataset_short_name
+        folk_data
+
+        >>> DaplaDatasetPathInfo('buckets/ssb-staging-dapla-felles-data-delt/stat/utdata/dapla/bus_p2021_v2.parquet').dataset_short_name
+        bus
         """
         if self.contains_data_from or self.contains_data_until:
             short_name_sections = self.dataset_name_sections[
@@ -601,6 +628,9 @@ class DaplaDatasetPathInfo:
         >>> DaplaDatasetPathInfo('utdata/min_statistikk/person_data_v1.parquet').dataset_state
         <DataSetState.OUTPUT_DATA: 'OUTPUT_DATA'>
 
+        >>> DaplaDatasetPathInfo('buckets/bucket_name/stat_name/inndata/min_statistikk/person_data_v1.parquet').dataset_state
+        <DataSetState.INPUT_DATA: 'INPUT_DATA'>
+
         >>> DaplaDatasetPathInfo('my_special_data/person_data_v1.parquet').dataset_state
         None
         """
@@ -632,6 +662,12 @@ class DaplaDatasetPathInfo:
 
         >>> DaplaDatasetPathInfo('person_data.parquet').dataset_version
         None
+
+        >>> DaplaDatasetPathInfo('buckets/bucket_name/stat_name/inndata/min_statistikk/person_data_v1.parquet').dataset_version
+        '1'
+
+        >>> DaplaDatasetPathInfo('buckets/bucket_name/stat_name/inndata/min_statistikk/person_data.parquet').dataset_version
+        None
         """
         minimum_elements_in_file_name: Final[int] = 2
         minimum_characters_in_version_string: Final[int] = 2
@@ -645,13 +681,37 @@ class DaplaDatasetPathInfo:
             return last_filename_element[1:]
         return None
 
+    def _get_left_parts(
+        self,
+        dataset_path_parts: list[str],
+        state_index: int,
+    ) -> list[str]:
+        """Retrieve the path parts before the dataset state, considering bucket prefixes."""
+        bucket_prefix = {"gs:", "buckets"}
+        left_parts = dataset_path_parts[:state_index]
+
+        # Stop checking beyond the bucket prefix
+        prefix_intersection = bucket_prefix & set(left_parts)
+        if prefix_intersection:
+            first_prefix = min(
+                left_parts.index(prefix) for prefix in prefix_intersection
+            )
+            left_parts = left_parts[first_prefix:]
+
+        return (
+            []
+            if left_parts == ["/"]
+            or (left_parts[0] in bucket_prefix and len(left_parts) <= 2)
+            else left_parts
+        )
+
     @property
     def statistic_short_name(
         self,
     ) -> str | None:
         """Extract the statistical short name from the filepath.
 
-        Extract the statistical short name from the filepath right before the
+        Extract the statistical short name from the filepath either after bucket name or right before the
         dataset state based on the Dapla filepath naming convention.
 
         Returns:
@@ -662,21 +722,75 @@ class DaplaDatasetPathInfo:
         >>> DaplaDatasetPathInfo('prosjekt/befolkning/klargjorte_data/person_data_v1.parquet').statistic_short_name
         befolkning
 
+        >>> DaplaDatasetPathInfo('buckets/prosjekt/befolkning/person_data_v1.parquet').statistic_short_name
+        befolkning
+
         >>> DaplaDatasetPathInfo('befolkning/inndata/person_data_v1.parquet').statistic_short_name
         befolkning
 
+        >>> DaplaDatasetPathInfo('buckets/bucket_name/stat_name/inndata/min_statistikk/person_data.parquet').statistic_short_name
+        stat_name
+
+        >>> DaplaDatasetPathInfo('buckets/stat_name/utdata/person_data.parquet').statistic_short_name
+        None
+
         >>> DaplaDatasetPathInfo('befolkning/person_data.parquet').statistic_short_name
         None
+
+        >>> DaplaDatasetPathInfo('buckets/produkt/befolkning/utdata/person_data.parquet').statistic_short_name
+        befolkning
+
+        >>> DaplaDatasetPathInfo('resources/buckets/produkt/befolkning/utdata/person_data.parquet').statistic_short_name
+        befolkning
+
+        >>> DaplaDatasetPathInfo('gs://statistikk/produkt/klargjorte-data/persondata_p1990-Q1_p2023-Q4_v1/aar=2019/data.parquet').statistic_short_name
+        produkt
+
+        >>> DaplaDatasetPathInfo('gs://statistikk/produkt/persondata_p1990-Q1_p2023-Q4_v1/aar=2019/data.parquet').statistic_short_name
+        None
+
+        >>> DaplaDatasetPathInfo('buckets/ssb-staging-dapla-felles-data-delt/person_data_p2021_v2.parquet').statistic_short_name
+        None
         """
-        [previous implementation (9 lines) not preserved in this view]
+        if not self.dataset_state:
+            if self.bucket_name:
+                parts = self.dataset_path.parent.parts
+
+                if self.bucket_name not in parts:
+                    return None
+
+                # Find the index of bucket_name in the path
+                bucket_name_index = self.dataset_path.parent.parts.index(
+                    self.bucket_name,
+                )
+
+                # If there are parts after bucket_name, return the part immediately after it
+                if len(self.dataset_path.parent.parts) > bucket_name_index + 1:
+                    return self.dataset_path.parent.parts[bucket_name_index + 1]
+
+            return None
+
+        dataset_state_names = self._extract_norwegian_dataset_state_path_part(
+            self.dataset_state,
+        )
+        dataset_path_parts = list(self.dataset_path.parts)
+
+        for state in dataset_state_names:
+            if state not in dataset_path_parts:
+                continue
+
+            index = dataset_path_parts.index(state)
+
+            if index == 0:
+                continue
+
+            left_parts = self._get_left_parts(dataset_path_parts, index)
+
+            if not left_parts:
+                return None
+
+            return dataset_path_parts[index - 1]
+
         return None
 
     def path_complies_with_naming_standard(self) -> bool:
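Taken together, the new doctests show that `bucket_name`, `dataset_state`, `dataset_version` and `statistic_short_name` now also understand the `buckets/` mount style used on Dapla Lab, alongside `gs://` URIs and the collapsed `gs:/` form produced by pathlib (cf. `GS_PREFIX_FROM_PATHLIB`). A simplified standalone sketch of the prefix handling (the function name and prefix list mirror the diff, but this is not the actual method):

import pathlib


def bucket_name(dataset_path: str) -> str | None:
    """Return the bucket segment of a path, or None when no known prefix is present."""
    # Check "gs://" before "gs:/" since the former contains the latter.
    for prefix in ("gs://", "gs:/", "buckets/"):
        if prefix in dataset_path:
            # Everything after the prefix begins with the bucket name.
            _, bucket_and_rest = dataset_path.split(prefix, 1)
            return pathlib.Path(bucket_and_rest).parts[0]
    return None


assert bucket_name(
    "buckets/ssb-staging-dapla-felles-data-delt/stat/utdata/person_data_p2021_v2.parquet"
) == "ssb-staging-dapla-felles-data-delt"
assert bucket_name("my_data/person_data_p2021_v2.parquet") is None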
dapla_metadata/datasets/dataset_parser.py
CHANGED
@@ -5,7 +5,7 @@ Handles reading in the data and transforming data types to generic metadata type
 
 from __future__ import annotations
 
-import pathlib  # noqa:
+import pathlib  # noqa: TC003 import is needed for docs build
 import re
 import typing as t
 from abc import ABC
@@ -89,7 +89,7 @@ TYPE_CORRESPONDENCE: list[tuple[tuple[str, ...], DataType]] = [
 ]
 TYPE_MAP: dict[str, DataType] = {}
 for concrete_type, abstract_type in TYPE_CORRESPONDENCE:
-    TYPE_MAP.update(
+    TYPE_MAP.update(dict.fromkeys(concrete_type, abstract_type))
 
 TDatasetParser = t.TypeVar("TDatasetParser", bound="DatasetParser")
 
@@ -112,31 +112,23 @@ class DatasetParser(ABC):
     @staticmethod
     def for_file(dataset: pathlib.Path | CloudPath) -> DatasetParser:
         """Return the correct subclass based on the given dataset file."""
-        supported_file_types: dict[
-            str,
-            type[DatasetParser],
-        ] = {
-            ".parquet": DatasetParserParquet,
-            ".sas7bdat": DatasetParserSas7Bdat,
-            ".parquet.gzip": DatasetParserParquet,
-        }
         file_type = "Unknown"
         try:
            file_type = dataset.suffix
             # Gzipped parquet files can be read with DatasetParserParquet
-            match = re.search(
-            file_type =
+            match = re.search(PARQUET_GZIP_FILE_SUFFIX, str(dataset).lower())
+            file_type = PARQUET_GZIP_FILE_SUFFIX if match else file_type
             # Extract the appropriate reader class from the SUPPORTED_FILE_TYPES dict and return an instance of it
-            reader =
+            reader = SUPPORTED_DATASET_FILE_SUFFIXES[file_type](dataset)
         except IndexError as e:
             # Thrown when just one element is returned from split, meaning there is no file extension supplied
-            msg = f"Could not recognise file type for provided {dataset = }. Supported file types are: {', '.join(
+            msg = f"Could not recognise file type for provided {dataset = }. Supported file types are: {', '.join(SUPPORTED_DATASET_FILE_SUFFIXES.keys())}"
             raise FileNotFoundError(
                 msg,
             ) from e
         except KeyError as e:
             # In this case the file type is not supported, so we throw a helpful exception
-            msg = f"{file_type = } is not supported. Please open one of the following supported files types: {', '.join(
+            msg = f"{file_type = } is not supported. Please open one of the following supported files types: {', '.join(SUPPORTED_DATASET_FILE_SUFFIXES.keys())} or contact the maintainers to request support."
             raise NotImplementedError(
                 msg,
             ) from e
@@ -239,3 +231,17 @@ class DatasetParserSas7Bdat(DatasetParser):
         )
 
         return fields
+
+
+PARQUET_FILE_SUFFIX = ".parquet"
+PARQUET_GZIP_FILE_SUFFIX = ".parquet.gzip"
+SAS7BDAT_FILE_SUFFIX = ".sas7bdat"
+
+SUPPORTED_DATASET_FILE_SUFFIXES: dict[
+    str,
+    type[DatasetParser],
+] = {
+    PARQUET_FILE_SUFFIX: DatasetParserParquet,
+    PARQUET_GZIP_FILE_SUFFIX: DatasetParserParquet,
+    SAS7BDAT_FILE_SUFFIX: DatasetParserSas7Bdat,
+}
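The suffix-to-parser mapping that used to live inside `for_file` is now the module-level `SUPPORTED_DATASET_FILE_SUFFIXES` registry, so dispatch and both error messages draw on one source of truth. A minimal sketch of the same registry pattern with stand-in parser classes (names other than the suffixes are illustrative):

import pathlib


class ParquetParser:
    def __init__(self, dataset: pathlib.Path) -> None:
        self.dataset = dataset


class SasParser:
    def __init__(self, dataset: pathlib.Path) -> None:
        self.dataset = dataset


SUFFIX_REGISTRY = {
    ".parquet": ParquetParser,
    ".parquet.gzip": ParquetParser,
    ".sas7bdat": SasParser,
}


def parser_for(dataset: pathlib.Path):
    """Instantiate the parser registered for the file's suffix."""
    suffix = dataset.suffix
    # .suffix only sees the last extension, so detect the compound one explicitly.
    if str(dataset).lower().endswith(".parquet.gzip"):
        suffix = ".parquet.gzip"
    try:
        return SUFFIX_REGISTRY[suffix](dataset)
    except KeyError as e:
        msg = f"{suffix!r} is not supported. Supported: {', '.join(SUFFIX_REGISTRY)}"
        raise NotImplementedError(msg) from e


print(type(parser_for(pathlib.Path("data/person_data_v1.parquet.gzip"))).__name__)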
dapla_metadata/datasets/model_backwards_compatibility.py
CHANGED
@@ -15,8 +15,8 @@ from __future__ import annotations
 
 from collections import OrderedDict
 from dataclasses import dataclass
+from datetime import UTC
 from datetime import datetime
-from datetime import timezone
 from typing import TYPE_CHECKING
 from typing import Any
@@ -305,10 +305,10 @@ def handle_version_2_2_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
         )
         supplied_metadata["datadoc"]["variables"][i]["special_value"] = None
         supplied_metadata["datadoc"]["variables"][i]["custom_type"] = None
-        supplied_metadata["datadoc"]["variables"][
-        [three further removed lines not preserved in this view]
+        supplied_metadata["datadoc"]["variables"][i] = (
+            _find_and_update_language_strings(
+                supplied_metadata["datadoc"]["variables"][i],
+            )
         )
         supplied_metadata["datadoc"]["dataset"]["custom_type"] = None
         supplied_metadata["datadoc"]["dataset"] = _find_and_update_language_strings(
@@ -384,7 +384,7 @@ def handle_version_1_0_0(supplied_metadata: dict[str, Any]) -> dict[str, Any]:
         if supplied_metadata["dataset"][field]:
             supplied_metadata["dataset"][field] = datetime.isoformat(
                 datetime.fromisoformat(supplied_metadata["dataset"][field]).astimezone(
-                    tz=timezone.utc,
+                    tz=UTC,
                 ),
                 timespec="seconds",
             )
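`datetime.UTC` is an alias for `datetime.timezone.utc` that was added in Python 3.11, so this swap (and the matching one in `utility/utils.py` below) implies a Python 3.11+ floor. A quick equivalence check:

from datetime import UTC, datetime, timezone

assert UTC is timezone.utc  # same singleton; UTC is just the 3.11+ alias
print(datetime.now(tz=UTC).isoformat(timespec="seconds"))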
dapla_metadata/datasets/model_validation.py
CHANGED
@@ -5,11 +5,11 @@ from __future__ import annotations
 import logging
 import warnings
 from typing import TYPE_CHECKING
+from typing import Self
 from typing import TextIO
 
 from datadoc_model import model
 from pydantic import model_validator
-from typing_extensions import Self
 
 from dapla_metadata.datasets.utility.constants import DATE_VALIDATION_MESSAGE
 from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_DATASET_FIELDS
@@ -176,7 +176,7 @@ class ObligatoryVariableWarning(UserWarning):
     """Custom warning for checking obligatory metadata for variables."""
 
 
-def custom_warning_handler(
+def custom_warning_handler(
     message: Warning | str,
     category: type[Warning],
     filename: str,
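In the same vein, `typing.Self` (PEP 673) is available from Python 3.11, which lets the `typing_extensions` fallback be dropped. A minimal sketch of the pattern this import serves here, a pydantic "after" model validator returning the instance (the model and field names are illustrative):

from typing import Self

from pydantic import BaseModel, model_validator


class Period(BaseModel):
    start: int
    end: int

    @model_validator(mode="after")
    def check_order(self) -> Self:
        # mode="after" validators receive the built model and must return it.
        if self.end < self.start:
            msg = "end must not precede start"
            raise ValueError(msg)
        return self


Period(start=1, end=2)  # ok; Period(start=2, end=1) raises a ValidationError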
dapla_metadata/datasets/utility/enums.py
CHANGED
@@ -11,6 +11,6 @@ class SupportedLanguages(str, Enum):
     Reference: https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
     """
 
-    NORSK_BOKMÅL = "nb"
+    NORSK_BOKMÅL = "nb"  # noqa: PLC2401 the listed problems do not apply in this case
     NORSK_NYNORSK = "nn"
     ENGLISH = "en"
dapla_metadata/datasets/utility/utils.py
CHANGED
@@ -5,16 +5,16 @@ import logging
 import pathlib
 import uuid
 
+import google.auth
 from cloudpathlib import CloudPath
 from cloudpathlib import GSClient
 from cloudpathlib import GSPath
-from dapla import AuthClient
 from datadoc_model import model
 from datadoc_model.model import Assessment
 from datadoc_model.model import DataSetState
 from datadoc_model.model import VariableRole
 
-from dapla_metadata._shared import user_info
+from dapla_metadata.dapla import user_info
 from dapla_metadata.datasets.utility.constants import (
     DATASET_FIELDS_FROM_EXISTING_METADATA,
 )
@@ -37,7 +37,7 @@ logger = logging.getLogger(__name__)
 
 def get_timestamp_now() -> datetime.datetime:
     """Return a timestamp for the current moment."""
-    return datetime.datetime.now(tz=datetime.timezone.utc)
+    return datetime.datetime.now(tz=datetime.UTC)
 
 
 def normalize_path(path: str) -> pathlib.Path | CloudPath:
@@ -52,7 +52,7 @@ def normalize_path(path: str) -> pathlib.Path | CloudPath:
         Pathlib compatible object.
     """
     if path.startswith(GSPath.cloud_prefix):
-        client = GSClient(credentials=
+        client = GSClient(credentials=google.auth.default()[0])
         return GSPath(path, client=client)
     return pathlib.Path(path)
 
@@ -79,7 +79,7 @@ def derive_assessment_from_state(state: DataSetState) -> Assessment:
     Returns:
         The derived assessment of the dataset.
     """
-    match
+    match state:
         case (
             DataSetState.INPUT_DATA
             | DataSetState.PROCESSED_DATA
@@ -147,7 +147,7 @@ def set_dataset_owner(dataset: model.Dataset) -> None:
         dataset: The dataset object to set default values on.
     """
     try:
-        dataset.owner = user_info.get_owner()
+        dataset.owner = user_info.get_user_info_for_current_platform().current_team
     except OSError:
         logger.exception("Failed to find environment variable DAPLA_GROUP_CONTEXT")
 
@@ -245,10 +245,7 @@ def _is_missing_multilanguage_value(
             len(field_value[0]) > 0
             and not field_value[0]["languageText"]
             and (len(field_value) <= 1 or not field_value[1]["languageText"])
-            and (
-                len(field_value) <= 2  # noqa: PLR2004 approve magic value
-                or not field_value[2]["languageText"]
-            )
+            and (len(field_value) <= 2 or not field_value[2]["languageText"])
         ),
     )
 
@@ -277,8 +274,7 @@ def _is_missing_metadata(
         True if the field doesn't have a value, False otherwise.
     """
     return bool(
-        field_name in obligatory_list
-        and field_value is None
+        (field_name in obligatory_list and field_value is None)
         or _is_missing_multilanguage_value(
            field_name,
            field_value,