dapla-toolbelt-metadata 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dapla_metadata/__init__.py +2 -0
- dapla_metadata/dapla/__init__.py +3 -0
- dapla_metadata/dapla/user_info.py +15 -16
- dapla_metadata/datasets/dapla_dataset_path_info.py +128 -14
- dapla_metadata/datasets/dataset_parser.py +34 -21
- dapla_metadata/datasets/utility/utils.py +2 -2
- dapla_metadata/standards/__init__.py +4 -0
- dapla_metadata/standards/name_validator.py +250 -0
- dapla_metadata/standards/standard_validators.py +98 -0
- dapla_metadata/standards/utils/__init__.py +1 -0
- dapla_metadata/standards/utils/constants.py +49 -0
- dapla_metadata/variable_definitions/_utils/descriptions.py +7 -4
- dapla_metadata/variable_definitions/variable_definition.py +21 -11
- {dapla_toolbelt_metadata-0.5.0.dist-info → dapla_toolbelt_metadata-0.6.1.dist-info}/METADATA +5 -5
- {dapla_toolbelt_metadata-0.5.0.dist-info → dapla_toolbelt_metadata-0.6.1.dist-info}/RECORD +17 -12
- {dapla_toolbelt_metadata-0.5.0.dist-info → dapla_toolbelt_metadata-0.6.1.dist-info}/WHEEL +1 -1
- {dapla_toolbelt_metadata-0.5.0.dist-info → dapla_toolbelt_metadata-0.6.1.dist-info}/LICENSE +0 -0
dapla_metadata/__init__.py
CHANGED
dapla_metadata/dapla/__init__.py
CHANGED
dapla_metadata/dapla/user_info.py
CHANGED

@@ -13,11 +13,6 @@ from dapla_metadata._shared.enums import DaplaService
 logger = logging.getLogger(__name__)
 
 
-PLACEHOLDER_EMAIL_ADDRESS = "default_user@ssb.no"
-PLACEHOLDER_GROUP = "default-team-developers"
-PLACEHOLDER_TEAM = "default-team"
-
-
 class UserInfo(Protocol):
     """Information about the current user.
 
@@ -62,20 +57,24 @@ class UnknownUserInfo:
 class TestUserInfo:
     """Information about the current user for local development and testing."""
 
+    PLACEHOLDER_EMAIL_ADDRESS = "default_user@ssb.no"
+    PLACEHOLDER_GROUP = "default-team-developers"
+    PLACEHOLDER_TEAM = "default-team"
+
     @property
     def short_email(self) -> str | None:
        """Get the short email address."""
-        return PLACEHOLDER_EMAIL_ADDRESS
+        return TestUserInfo.PLACEHOLDER_EMAIL_ADDRESS
 
     @property
     def current_group(self) -> str | None:
        """Get the group which the user is currently representing."""
-        return PLACEHOLDER_GROUP
+        return TestUserInfo.PLACEHOLDER_GROUP
 
     @property
     def current_team(self) -> str | None:
        """Get the team which the user is currently representing."""
-        return PLACEHOLDER_TEAM
+        return TestUserInfo.PLACEHOLDER_TEAM
 
 
 class DaplaLabUserInfo:
@@ -146,17 +145,17 @@ def get_user_info_for_current_platform() -> UserInfo:
 def parse_team_name(group: str) -> str:
     """Parses the group to get the current team.
 
-    >>> parse_team_name(dapla-metadata-developers)
-    dapla-metadata
+    >>> parse_team_name("dapla-metadata-developers")
+    'dapla-metadata'
 
-    >>> parse_team_name(dapla-metadata-data-admins)
-    dapla-metadata
+    >>> parse_team_name("dapla-metadata-data-admins")
+    'dapla-metadata'
 
-    >>> parse_team_name(dapla-metadata)
-    dapla
+    >>> parse_team_name("dapla-metadata")
+    'dapla'
 
-    >>> parse_team_name(dapla-metadata-not-real-name)
-    dapla-metadata-not-real
+    >>> parse_team_name("dapla-metadata-not-real-name")
+    'dapla-metadata-not-real'
     """
     parts = group.split("-")
     return "-".join(parts[:-2] if group.endswith("data-admins") else parts[:-1])
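The corrected doctests also pin down the parsing rule itself. A minimal runnable sketch of the same logic, with the function body copied from the diff above:

    def parse_team_name(group: str) -> str:
        # The trailing role segment is stripped to recover the team name;
        # "data-admins" counts as two segments, every other suffix as one.
        parts = group.split("-")
        return "-".join(parts[:-2] if group.endswith("data-admins") else parts[:-1])

    assert parse_team_name("dapla-metadata-developers") == "dapla-metadata"
    assert parse_team_name("dapla-metadata-data-admins") == "dapla-metadata"
    # A group without a recognised suffix still loses its final segment:
    assert parse_team_name("dapla-metadata") == "dapla"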
dapla_metadata/datasets/dapla_dataset_path_info.py
CHANGED

@@ -478,7 +478,7 @@ class DaplaDatasetPathInfo:
         """Extract the bucket name from the dataset path.
 
         Returns:
-            The bucket name or None if the dataset path is not a GCS path.
+            The bucket name or None if the dataset path is neither a GCS path nor an SSB bucketeer path.
 
         Examples:
             >>> DaplaDatasetPathInfo('gs://ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_v2.parquet').bucket_name
@@ -492,17 +492,35 @@ class DaplaDatasetPathInfo:
 
             >>> DaplaDatasetPathInfo('ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_v2.parquet').bucket_name
             None
+
+            >>> DaplaDatasetPathInfo('ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_v2.parquet').bucket_name
+            None
+
+            >>> DaplaDatasetPathInfo('buckets/ssb-staging-dapla-felles-data-delt/stat/utdata/person_data_p2021_v2.parquet').bucket_name
+            ssb-staging-dapla-felles-data-delt
+
+            >>> DaplaDatasetPathInfo('buckets/ssb-staging-dapla-felles-data-delt/person_data_p2021_v2.parquet').bucket_name
+            ssb-staging-dapla-felles-data-delt
+
+            >>> DaplaDatasetPathInfo('home/work/buckets/ssb-staging-dapla-felles-produkt/stat/utdata/person_data_p2021_v2.parquet').bucket_name
+            ssb-staging-dapla-felles-produkt
         """
         prefix: str | None = None
-
+        dataset_string = str(self.dataset_string)
+        if GSPath.cloud_prefix in self.dataset_string:
             prefix = GSPath.cloud_prefix
-
+            _, bucket_and_rest = dataset_string.split(prefix, 1)
+        elif GS_PREFIX_FROM_PATHLIB in self.dataset_string:
             prefix = GS_PREFIX_FROM_PATHLIB
+            _, bucket_and_rest = self.dataset_string.split(prefix, 1)
+        elif "buckets/" in self.dataset_string:
+            prefix = "buckets/"
+            _, bucket_and_rest = self.dataset_string.split(prefix, 1)
         else:
             return None
 
         return pathlib.Path(
-
+            bucket_and_rest,
         ).parts[0]
 
     @property
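A standalone sketch of the new three-way prefix handling, covering the same path forms the added doctests exercise. The literal prefix values are assumptions: GSPath.cloud_prefix is taken to be 'gs://' and GS_PREFIX_FROM_PATHLIB to be 'gs:/' (the form pathlib collapses a gs:// URL into), so the checks must run in this order.

    import pathlib

    GS_CLOUD_PREFIX = "gs://"        # assumed value of GSPath.cloud_prefix
    GS_PREFIX_FROM_PATHLIB = "gs:/"  # assumed value; substring of "gs://"

    def bucket_name(dataset_string: str) -> str | None:
        """Return the bucket name, or None when no known prefix is present."""
        for prefix in (GS_CLOUD_PREFIX, GS_PREFIX_FROM_PATHLIB, "buckets/"):
            if prefix in dataset_string:
                # Everything after the first prefix occurrence; the bucket is
                # the first path component of the remainder.
                _, bucket_and_rest = dataset_string.split(prefix, 1)
                return pathlib.Path(bucket_and_rest).parts[0]
        return None

    assert bucket_name("gs://ssb-staging-dapla-felles-data-delt/datadoc/utdata/x.parquet") == "ssb-staging-dapla-felles-data-delt"
    assert bucket_name("home/work/buckets/ssb-staging-dapla-felles-produkt/stat/utdata/x.parquet") == "ssb-staging-dapla-felles-produkt"
    assert bucket_name("ssb-staging-dapla-felles-data-delt/datadoc/x.parquet") is None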
@@ -528,6 +546,15 @@ class DaplaDatasetPathInfo:
 
             >>> DaplaDatasetPathInfo('my_data/simple_dataset_name.parquet').dataset_short_name
             simple_dataset_name
+
+            >>> DaplaDatasetPathInfo('gs:/ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_p2021_v2.parquet').dataset_short_name
+            person_data
+
+            >>> DaplaDatasetPathInfo('buckets/ssb-staging-dapla-felles-data-delt/stat/utdata/folk_data_p2021_v2.parquet').dataset_short_name
+            folk_data
+
+            >>> DaplaDatasetPathInfo('buckets/ssb-staging-dapla-felles-data-delt/stat/utdata/dapla/bus_p2021_v2.parquet').dataset_short_name
+            bus
         """
         if self.contains_data_from or self.contains_data_until:
             short_name_sections = self.dataset_name_sections[
@@ -601,6 +628,9 @@ class DaplaDatasetPathInfo:
             >>> DaplaDatasetPathInfo('utdata/min_statistikk/person_data_v1.parquet').dataset_state
             <DataSetState.OUTPUT_DATA: 'OUTPUT_DATA'>
 
+            >>> DaplaDatasetPathInfo('buckets/bucket_name/stat_name/inndata/min_statistikk/person_data_v1.parquet').dataset_state
+            <DataSetState.INPUT_DATA: 'INPUT_DATA'>
+
             >>> DaplaDatasetPathInfo('my_special_data/person_data_v1.parquet').dataset_state
             None
         """
@@ -632,6 +662,12 @@ class DaplaDatasetPathInfo:
 
             >>> DaplaDatasetPathInfo('person_data.parquet').dataset_version
             None
+
+            >>> DaplaDatasetPathInfo('buckets/bucket_name/stat_name/inndata/min_statistikk/person_data_v1.parquet').dataset_version
+            '1'
+
+            >>> DaplaDatasetPathInfo('buckets/bucket_name/stat_name/inndata/min_statistikk/person_data.parquet').dataset_version
+            None
         """
         minimum_elements_in_file_name: Final[int] = 2
         minimum_characters_in_version_string: Final[int] = 2
@@ -645,13 +681,37 @@ class DaplaDatasetPathInfo:
             return last_filename_element[1:]
         return None
 
+    def _get_left_parts(
+        self,
+        dataset_path_parts: list[str],
+        state_index: int,
+    ) -> list[str]:
+        """Retrieve the path parts before the dataset state, considering bucket prefixes."""
+        bucket_prefix = {"gs:", "buckets"}
+        left_parts = dataset_path_parts[:state_index]
+
+        # Stop checking beyond the bucket prefix
+        prefix_intersection = bucket_prefix & set(left_parts)
+        if prefix_intersection:
+            first_prefix = min(
+                left_parts.index(prefix) for prefix in prefix_intersection
+            )
+            left_parts = left_parts[first_prefix:]
+
+        return (
+            []
+            if left_parts == ["/"]
+            or (left_parts[0] in bucket_prefix and len(left_parts) <= 2)
+            else left_parts
+        )
+
     @property
     def statistic_short_name(
         self,
     ) -> str | None:
         """Extract the statistical short name from the filepath.
 
-        Extract the statistical short name from the filepath right before the
+        Extract the statistical short name from the filepath, either after the bucket name or right before the
         dataset state based on the Dapla filepath naming convention.
 
         Returns:
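The trimming rule in _get_left_parts is the crux of the new bucket support; a minimal self-contained sketch with the method lifted out as a plain function (behaviour copied from the hunk above, asserts derived from the doctests):

    def get_left_parts(dataset_path_parts: list[str], state_index: int) -> list[str]:
        # Keep everything before the dataset-state folder, but once a bucket
        # prefix ("gs:" or "buckets") is seen, drop parts to its left.
        # An empty result means "no room for a statistic short name".
        bucket_prefix = {"gs:", "buckets"}
        left_parts = dataset_path_parts[:state_index]

        prefix_intersection = bucket_prefix & set(left_parts)
        if prefix_intersection:
            first_prefix = min(left_parts.index(p) for p in prefix_intersection)
            left_parts = left_parts[first_prefix:]

        return (
            []
            if left_parts == ["/"]
            or (left_parts[0] in bucket_prefix and len(left_parts) <= 2)
            else left_parts
        )

    # 'buckets/stat_name/utdata/...': only the bucket prefix and one folder
    # remain, so there is no statistic short name before the state folder.
    assert get_left_parts(["buckets", "stat_name", "utdata", "f.parquet"], 2) == []
    # 'buckets/produkt/befolkning/utdata/...': 'befolkning' survives the trim.
    assert get_left_parts(["buckets", "produkt", "befolkning", "utdata", "f.parquet"], 3) == ["buckets", "produkt", "befolkning"]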
@@ -662,21 +722,75 @@ class DaplaDatasetPathInfo:
             >>> DaplaDatasetPathInfo('prosjekt/befolkning/klargjorte_data/person_data_v1.parquet').statistic_short_name
             befolkning
 
+            >>> DaplaDatasetPathInfo('buckets/prosjekt/befolkning/person_data_v1.parquet').statistic_short_name
+            befolkning
+
             >>> DaplaDatasetPathInfo('befolkning/inndata/person_data_v1.parquet').statistic_short_name
             befolkning
 
+            >>> DaplaDatasetPathInfo('buckets/bucket_name/stat_name/inndata/min_statistikk/person_data.parquet').statistic_short_name
+            stat_name
+
+            >>> DaplaDatasetPathInfo('buckets/stat_name/utdata/person_data.parquet').statistic_short_name
+            None
+
             >>> DaplaDatasetPathInfo('befolkning/person_data.parquet').statistic_short_name
             None
+
+            >>> DaplaDatasetPathInfo('buckets/produkt/befolkning/utdata/person_data.parquet').statistic_short_name
+            befolkning
+
+            >>> DaplaDatasetPathInfo('resources/buckets/produkt/befolkning/utdata/person_data.parquet').statistic_short_name
+            befolkning
+
+            >>> DaplaDatasetPathInfo('gs://statistikk/produkt/klargjorte-data/persondata_p1990-Q1_p2023-Q4_v1/aar=2019/data.parquet').statistic_short_name
+            produkt
+
+            >>> DaplaDatasetPathInfo('gs://statistikk/produkt/persondata_p1990-Q1_p2023-Q4_v1/aar=2019/data.parquet').statistic_short_name
+            None
+
+            >>> DaplaDatasetPathInfo('buckets/ssb-staging-dapla-felles-data-delt/person_data_p2021_v2.parquet').statistic_short_name
+            None
         """
-        (9 deleted lines not preserved in this diff view)
+        if not self.dataset_state:
+            if self.bucket_name:
+                parts = self.dataset_path.parent.parts
+
+                if self.bucket_name not in parts:
+                    return None
+
+                # Find the index of bucket_name in the path
+                bucket_name_index = self.dataset_path.parent.parts.index(
+                    self.bucket_name,
+                )
+
+                # If there are parts after bucket_name, return the part immediately after it
+                if len(self.dataset_path.parent.parts) > bucket_name_index + 1:
+                    return self.dataset_path.parent.parts[bucket_name_index + 1]
+
+            return None
+
+        dataset_state_names = self._extract_norwegian_dataset_state_path_part(
+            self.dataset_state,
+        )
+        dataset_path_parts = list(self.dataset_path.parts)
+
+        for state in dataset_state_names:
+            if state not in dataset_path_parts:
+                continue
+
+            index = dataset_path_parts.index(state)
+
+            if index == 0:
+                continue
+
+            left_parts = self._get_left_parts(dataset_path_parts, index)
+
+            if not left_parts:
+                return None
+
+            return dataset_path_parts[index - 1]
+
         return None
 
     def path_complies_with_naming_standard(self) -> bool:
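Taken together, the changes let callers interrogate a mounted-bucket path the same way as a gs:// URL. A hypothetical usage sketch; the expected values follow the doctests above:

    from dapla_metadata.datasets.dapla_dataset_path_info import DaplaDatasetPathInfo

    info = DaplaDatasetPathInfo(
        "buckets/ssb-staging-dapla-felles-data-delt/stat/utdata/folk_data_p2021_v2.parquet",
    )
    print(info.bucket_name)           # ssb-staging-dapla-felles-data-delt
    print(info.statistic_short_name)  # stat
    print(info.dataset_state)         # <DataSetState.OUTPUT_DATA: 'OUTPUT_DATA'>
    print(info.dataset_short_name)    # folk_data
    print(info.dataset_version)       # '2'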
dapla_metadata/datasets/dataset_parser.py
CHANGED

@@ -7,7 +7,6 @@ from __future__ import annotations
 
 import pathlib  # noqa: TC003 import is needed for docs build
 import re
-import typing as t
 from abc import ABC
 from abc import abstractmethod
 from typing import TYPE_CHECKING
@@ -56,6 +55,8 @@ KNOWN_FLOAT_TYPES = (
 
 KNOWN_STRING_TYPES = (
     "string",
+    "string[pyarrow]",
+    "large_string",
     "str",
     "char",
     "varchar",
@@ -67,13 +68,18 @@ KNOWN_STRING_TYPES = (
 
 KNOWN_DATETIME_TYPES = (
     "timestamp",
+    "timestamp[s]",
+    "timestamp[ms]",
     "timestamp[us]",
     "timestamp[ns]",
+    "datetime",
     "datetime64",
-    "
-    "
+    "datetime64[s]",
+    "datetime64[ms]",
+    "datetime64[us]",
+    "datetime64[ns]",
     "date",
-    "
+    "date32[day]",
     "time",
 )
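The extra entries matter because the dtype names reported by pandas and pyarrow vary with backend and resolution. A sketch of the lookup these tuples feed, using the same dict.fromkeys construction as the refactored loop in the next hunk (plain strings stand in for the real datadoc-model DataType values):

    # Abbreviated stand-ins for the tuples above.
    KNOWN_STRING_TYPES = ("string", "string[pyarrow]", "large_string", "str")
    KNOWN_DATETIME_TYPES = ("timestamp[s]", "timestamp[ms]", "datetime64[ns]", "date32[day]")

    TYPE_MAP: dict[str, str] = {}
    for concrete_types, abstract_type in [
        (KNOWN_STRING_TYPES, "STRING"),
        (KNOWN_DATETIME_TYPES, "DATETIME"),
    ]:
        # Every concrete name maps to the same abstract type.
        TYPE_MAP.update(dict.fromkeys(concrete_types, abstract_type))

    assert TYPE_MAP["string[pyarrow]"] == "STRING"   # pandas with the pyarrow backend
    assert TYPE_MAP["timestamp[ms]"] == "DATETIME"   # millisecond parquet timestamps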
@@ -89,9 +95,7 @@ TYPE_CORRESPONDENCE: list[tuple[tuple[str, ...], DataType]] = [
 ]
 TYPE_MAP: dict[str, DataType] = {}
 for concrete_type, abstract_type in TYPE_CORRESPONDENCE:
-    TYPE_MAP.update(
-
-TDatasetParser = t.TypeVar("TDatasetParser", bound="DatasetParser")
+    TYPE_MAP.update(dict.fromkeys(concrete_type, abstract_type))
 
 
 class DatasetParser(ABC):
@@ -112,31 +116,23 @@ class DatasetParser(ABC):
     @staticmethod
     def for_file(dataset: pathlib.Path | CloudPath) -> DatasetParser:
         """Return the correct subclass based on the given dataset file."""
-        supported_file_types: dict[
-            str,
-            type[DatasetParser],
-        ] = {
-            ".parquet": DatasetParserParquet,
-            ".sas7bdat": DatasetParserSas7Bdat,
-            ".parquet.gzip": DatasetParserParquet,
-        }
         file_type = "Unknown"
         try:
             file_type = dataset.suffix
             # Gzipped parquet files can be read with DatasetParserParquet
-            match = re.search(
-            file_type =
-            # Extract the appropriate reader class from the SUPPORTED_FILE_TYPES dict
-            reader =
+            match = re.search(PARQUET_GZIP_FILE_SUFFIX, str(dataset).lower())
+            file_type = PARQUET_GZIP_FILE_SUFFIX if match else file_type
+            # Extract the appropriate reader class from the SUPPORTED_FILE_TYPES dict
+            reader = SUPPORTED_DATASET_FILE_SUFFIXES[file_type](dataset)
         except IndexError as e:
             # Thrown when just one element is returned from split, meaning there is no file extension supplied
-            msg = f"Could not recognise file type for provided {dataset = }. Supported file types are: {', '.join(
+            msg = f"Could not recognise file type for provided {dataset = }. Supported file types are: {', '.join(SUPPORTED_DATASET_FILE_SUFFIXES.keys())}"
             raise FileNotFoundError(
                 msg,
             ) from e
         except KeyError as e:
             # In this case the file type is not supported, so we throw a helpful exception
-            msg = f"{file_type = } is not supported. Please open one of the following supported files types: {', '.join(
+            msg = f"{file_type = } is not supported. Please open one of the following supported files types: {', '.join(SUPPORTED_DATASET_FILE_SUFFIXES.keys())} or contact the maintainers to request support."
             raise NotImplementedError(
                 msg,
             ) from e
@@ -157,6 +153,9 @@ class DatasetParser(ABC):
 
         Arguments:
             data_type: The concrete data type to map.
+
+        Returns:
+            The abstract data type or None
         """
         return TYPE_MAP.get(data_type.lower(), None)
 
@@ -239,3 +238,17 @@ class DatasetParserSas7Bdat(DatasetParser):
         )
 
         return fields
+
+
+PARQUET_FILE_SUFFIX = ".parquet"
+PARQUET_GZIP_FILE_SUFFIX = ".parquet.gzip"
+SAS7BDAT_FILE_SUFFIX = ".sas7bdat"
+
+SUPPORTED_DATASET_FILE_SUFFIXES: dict[
+    str,
+    type[DatasetParser],
+] = {
+    PARQUET_FILE_SUFFIX: DatasetParserParquet,
+    PARQUET_GZIP_FILE_SUFFIX: DatasetParserParquet,
+    SAS7BDAT_FILE_SUFFIX: DatasetParserSas7Bdat,
+}
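With the suffix table hoisted to module level, parser dispatch and the error messages share one source of truth, and the new standards package can import it. A hypothetical usage sketch; the get_fields() call is an assumption based on the truncated parser methods shown above:

    import pathlib

    from dapla_metadata.datasets.dataset_parser import (
        SUPPORTED_DATASET_FILE_SUFFIXES,
        DatasetParser,
    )

    # Suffix-driven dispatch: '.parquet' and '.parquet.gzip' share a parser.
    print(sorted(SUPPORTED_DATASET_FILE_SUFFIXES))

    parser = DatasetParser.for_file(pathlib.Path("person_data_v1.parquet"))
    fields = parser.get_fields()  # assumed method name; reads the dataset schema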
dapla_metadata/datasets/utility/utils.py
CHANGED

@@ -5,6 +5,7 @@ import logging
 import pathlib
 import uuid
 
+import google.auth
 from cloudpathlib import CloudPath
 from cloudpathlib import GSClient
 from cloudpathlib import GSPath
@@ -13,7 +14,6 @@ from datadoc_model.model import Assessment
 from datadoc_model.model import DataSetState
 from datadoc_model.model import VariableRole
 
-from dapla import AuthClient
 from dapla_metadata.dapla import user_info
 from dapla_metadata.datasets.utility.constants import (
     DATASET_FIELDS_FROM_EXISTING_METADATA,
@@ -52,7 +52,7 @@ def normalize_path(path: str) -> pathlib.Path | CloudPath:
         Pathlib compatible object.
     """
     if path.startswith(GSPath.cloud_prefix):
-        client = GSClient(credentials=
+        client = GSClient(credentials=google.auth.default()[0])
         return GSPath(path, client=client)
     return pathlib.Path(path)
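The dependency swap above replaces dapla-toolbelt's AuthClient with Google Application Default Credentials. A minimal sketch of the new behaviour, assuming only that ADC are available in the environment:

    import pathlib

    import google.auth
    from cloudpathlib import GSClient, GSPath

    def normalize_path(path: str) -> pathlib.Path | GSPath:
        """Return a GSPath for gs:// URLs, a plain pathlib.Path otherwise."""
        if path.startswith(GSPath.cloud_prefix):
            # google.auth.default() returns a (credentials, project_id) tuple;
            # only the credentials object is needed here.
            credentials, _project = google.auth.default()
            return GSPath(path, client=GSClient(credentials=credentials))
        return pathlib.Path(path)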
dapla_metadata/standards/name_validator.py
ADDED

@@ -0,0 +1,250 @@
+import asyncio
+import logging
+import os
+import re
+from collections.abc import AsyncGenerator
+from pathlib import Path
+
+from dapla_metadata.datasets.dapla_dataset_path_info import DaplaDatasetPathInfo
+from dapla_metadata.datasets.dataset_parser import SUPPORTED_DATASET_FILE_SUFFIXES
+from dapla_metadata.standards.utils.constants import FILE_DOES_NOT_EXIST
+from dapla_metadata.standards.utils.constants import FILE_IGNORED
+from dapla_metadata.standards.utils.constants import IGNORED_FOLDERS
+from dapla_metadata.standards.utils.constants import INVALID_SYMBOLS
+from dapla_metadata.standards.utils.constants import MISSING_DATA_STATE
+from dapla_metadata.standards.utils.constants import MISSING_DATASET_SHORT_NAME
+from dapla_metadata.standards.utils.constants import MISSING_PERIOD
+from dapla_metadata.standards.utils.constants import MISSING_SHORT_NAME
+from dapla_metadata.standards.utils.constants import MISSING_VERSION
+from dapla_metadata.standards.utils.constants import NAME_STANDARD_SUCCESS
+from dapla_metadata.standards.utils.constants import NAME_STANDARD_VIOLATION
+from dapla_metadata.standards.utils.constants import PATH_IGNORED
+from dapla_metadata.standards.utils.constants import SSB_NAMING_STANDARD_REPORT
+from dapla_metadata.standards.utils.constants import SSB_NAMING_STANDARD_REPORT_FILES
+from dapla_metadata.standards.utils.constants import (
+    SSB_NAMING_STANDARD_REPORT_RESULT_AVERAGE,
+)
+from dapla_metadata.standards.utils.constants import (
+    SSB_NAMING_STANDARD_REPORT_RESULT_BEST,
+)
+from dapla_metadata.standards.utils.constants import (
+    SSB_NAMING_STANDARD_REPORT_RESULT_GOOD,
+)
+from dapla_metadata.standards.utils.constants import (
+    SSB_NAMING_STANDARD_REPORT_RESULT_LOW,
+)
+from dapla_metadata.standards.utils.constants import (
+    SSB_NAMING_STANDARD_REPORT_RESULT_NO_SCORE,
+)
+from dapla_metadata.standards.utils.constants import SSB_NAMING_STANDARD_REPORT_SUCCESS
+from dapla_metadata.standards.utils.constants import (
+    SSB_NAMING_STANDARD_REPORT_SUCCESS_RATE,
+)
+from dapla_metadata.standards.utils.constants import (
+    SSB_NAMING_STANDARD_REPORT_VIOLATIONS,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class ValidationResult:
+    """Result object for name standard validation."""
+
+    def __init__(
+        self,
+        success: bool,
+        file_path: str,
+    ) -> None:
+        """Initialize the validation result."""
+        self.success = success
+        self.file_path = file_path
+        self.messages: list[str] = []
+        self.violations: list[str] = []
+
+    def add_message(self, message: str) -> None:
+        """Add message to list."""
+        if message not in self.messages:
+            self.messages.append(message)
+
+    def add_violation(self, violation: str) -> None:
+        """Add violation to list."""
+        if violation not in self.violations:
+            self.violations.append(violation)
+        if self.success:
+            self.success = False
+
+    def __repr__(self) -> str:
+        """Representation for debugging."""
+        return f"ValidationResult(success={self.success}, file_path={self.file_path}, messages={self.messages}, violations={self.violations})"
+
+    def to_dict(self) -> dict:
+        """Return result as a dictionary."""
+        return {
+            "success": self.success,
+            "file_path": self.file_path,
+            "messages": self.messages,
+            "violations": self.violations,
+        }
+
+
+class NamingStandardReport:
+    """Report object for name standard validation."""
+
+    def __init__(self, validation_results: list[ValidationResult]) -> None:
+        """Initialize the naming standard report."""
+        self.validation_results = validation_results
+        self.num_files_validated = len(validation_results)
+        self.num_success = len(
+            [result for result in validation_results if result.success is True],
+        )
+        self.num_failures = len(
+            [result for result in validation_results if result.success is False],
+        )
+
+    def generate_report(self) -> str:
+        """Format the report as a string."""
+        return (
+            f"{SSB_NAMING_STANDARD_REPORT}\n"
+            f"=============================\n"
+            f"{self.evaluate_result()}"
+            f"{SSB_NAMING_STANDARD_REPORT_SUCCESS_RATE}: {self.success_rate():.2f}%\n"
+            f"{SSB_NAMING_STANDARD_REPORT_FILES}: {self.num_files_validated}\n"
+            f"{SSB_NAMING_STANDARD_REPORT_SUCCESS}: {self.num_success}\n"
+            f"{SSB_NAMING_STANDARD_REPORT_VIOLATIONS}s: {self.num_failures}\n"
+        )
+
+    def success_rate(self) -> int | float | None:
+        """Calculate the success rate as a percentage.
+
+        Returns:
+            int | float | None: The success rate as a percentage, or None if
+                no files were validated.
+        """
+        if self.num_files_validated == 0:
+            return None
+        return self.num_success / self.num_files_validated * 100
+
+    def evaluate_result(self) -> str:
+        """Returns an appropriate message based on the success rate."""
+        rate = self.success_rate()
+        if rate is not None:
+            if rate == 100:
+                return SSB_NAMING_STANDARD_REPORT_RESULT_BEST
+            if 70 < rate < 100:
+                return SSB_NAMING_STANDARD_REPORT_RESULT_GOOD
+            if 40 <= rate <= 70:
+                return SSB_NAMING_STANDARD_REPORT_RESULT_AVERAGE
+            if rate < 40:
+                return SSB_NAMING_STANDARD_REPORT_RESULT_LOW
+        return SSB_NAMING_STANDARD_REPORT_RESULT_NO_SCORE
+
+
+def _has_invalid_symbols(path: os.PathLike[str]) -> bool:
+    """Return True if string contains illegal symbols.
+
+    Examples:
+        >>> _has_invalid_symbols("åregang-øre")
+        True
+
+        >>> _has_invalid_symbols("Azor89")
+        False
+
+        >>> _has_invalid_symbols("ssbÆ-dapla-example-data-produkt-prod/ledstill/oppdrag/skjema_p2018_p2020_v1")
+        True
+
+        >>> _has_invalid_symbols("ssb-dapla-example-data-produkt-prod/ledstill/oppdrag/skjema_p2018_p2020_v1")
+        False
+
+        >>> _has_invalid_symbols("ssb-dapla-example-data-produkt-prod/ledstill/inndata/skjema_p2018_p202_v1/aar=2018/data.parquet")
+        False
+    """
+    # TODO @mmwinther: The = symbol is allowed to avoid failures on subdirectories of partitioned parquet datasets.
+    # DPMETA-824
+    return bool(re.search(r"[^a-zA-Z0-9\./:_\-=]", str(path).strip()))
+
+
+def _check_violations(
+    file: Path,
+) -> list[str]:
+    """Check for missing attributes and invalid symbols."""
+    path_info = DaplaDatasetPathInfo(file)
+    checks = {
+        MISSING_SHORT_NAME: path_info.statistic_short_name,
+        MISSING_DATA_STATE: path_info.dataset_state,
+        MISSING_PERIOD: path_info.contains_data_from,
+        MISSING_DATASET_SHORT_NAME: path_info.dataset_short_name,
+        MISSING_VERSION: path_info.dataset_version,
+        INVALID_SYMBOLS: not _has_invalid_symbols(file),
+    }
+
+    return [message for message, value in checks.items() if not value]
+
+
+async def _validate_file(
+    file: Path,
+    check_file_exists: bool = False,
+) -> ValidationResult:
+    """Check for naming standard violations.
+
+    Returns:
+        A ValidationResult object containing messages and violations
+    """
+    logger.info("Validating file: %s", file)
+    if file.suffix not in SUPPORTED_DATASET_FILE_SUFFIXES:
+        logger.info("Skipping validation on non-dataset file: %s", file)
+        return await _ignored_file_type_result(file)
+
+    result = ValidationResult(success=True, file_path=str(file))
+
+    if check_file_exists and not file.exists():
+        result.add_message(
+            FILE_DOES_NOT_EXIST,
+        )
+
+    result.violations = await asyncio.get_running_loop().run_in_executor(
+        None,
+        lambda: _check_violations(file),
+    )
+
+    if result.violations:
+        result.success = False
+        result.add_message(NAME_STANDARD_VIOLATION)
+    else:
+        result.success = True
+        result.add_message(
+            NAME_STANDARD_SUCCESS,
+        )
+    return result
+
+
+async def _ignored_folder_result(file: Path) -> ValidationResult:
+    r = ValidationResult(success=True, file_path=str(file))
+    r.add_message(PATH_IGNORED)
+    return r
+
+
+async def _ignored_file_type_result(file: Path) -> ValidationResult:
+    r = ValidationResult(success=True, file_path=str(file))
+    r.add_message(FILE_IGNORED)
+    return r
+
+
+async def validate_directory(
+    path: Path,
+) -> AsyncGenerator[AsyncGenerator | asyncio.Task]:
+    """Validate a file or recursively validate all files in a directory."""
+    if set(path.parts).intersection(IGNORED_FOLDERS):
+        logger.info("File path ignored: %s", path)
+        yield asyncio.create_task(_ignored_folder_result(path))
+    elif path.suffix:
+        yield asyncio.create_task(_validate_file(path, check_file_exists=True))
+    else:
+        for obj in await asyncio.get_running_loop().run_in_executor(
+            None,
+            lambda: path.glob("*"),
+        ):
+            if obj.suffix:
+                yield asyncio.create_task(_validate_file(obj), name=obj.name)
+            else:
+                logger.debug("Recursing into: %s", obj)
+                yield validate_directory(obj)
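A quick sketch of how the result and report classes behave, with hand-built results (the file paths and violation text are made up for illustration):

    from dapla_metadata.standards.name_validator import (
        NamingStandardReport,
        ValidationResult,
    )

    ok = ValidationResult(success=True, file_path="buckets/produkt/befolkning/utdata/person_data_p2021_v2.parquet")
    bad = ValidationResult(success=True, file_path="buckets/produkt/data.parquet")
    bad.add_violation("Filnavn mangler versjon")  # flips bad.success to False

    report = NamingStandardReport(validation_results=[ok, bad])
    print(report.success_rate())    # 50.0 -> falls in the 40-70 "average" band
    print(report.generate_report())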
dapla_metadata/standards/standard_validators.py
ADDED

@@ -0,0 +1,98 @@
+import asyncio
+import logging
+import os
+import time
+from collections.abc import AsyncGenerator
+
+from dapla_metadata.datasets.utility.utils import normalize_path
+from dapla_metadata.standards.name_validator import NamingStandardReport
+from dapla_metadata.standards.name_validator import ValidationResult
+from dapla_metadata.standards.name_validator import validate_directory
+
+logger = logging.getLogger(__name__)
+
+
+async def check_naming_standard(
+    file_path: str | os.PathLike[str],
+) -> list[ValidationResult]:
+    """Check whether a given path follows the SSB naming standard.
+
+    This function checks whether the provided `file_path` and subdirectories thereof comply
+    with the naming standard. Currently we only examine '.parquet' files. Other files are ignored.
+
+    Args:
+        file_path: The path to a bucket, directory, or specific file to validate.
+            This can be in the following forms:
+            - A bucket URL in the form 'gs://ssb-dapla-felles-data-produkt-test'
+            - An absolute path to a mounted bucket in the form '/buckets/produkt'
+            - Any subdirectory or file thereof
+            We also accept paths which don't yet exist so that you can test if a path will comply.
+
+    Returns:
+        list[ValidationResult]: A list of validation results,
+            including success status, checked file path, messages, and any detected violations.
+
+    Examples:
+        >>> check_naming_standard("/data/example_file.parquet").success
+        False
+
+        >>> check_naming_standard("/buckets/produkt/datadoc/utdata/person_data_p2021_v2.parquet").success
+        True
+    """
+    results = []
+
+    # Begin validation.
+    # For each file this returns a task which we can wait on to complete.
+    # For each directory this returns another AsyncGenerator which must be unpacked below
+    tasks = [t async for t in validate_directory(normalize_path(str(file_path)))]  # type:ignore [arg-type]
+
+    # 5 minute timeout for safety
+    start_time = time.time()
+    while time.time() < start_time + (5 * 60):
+        for item in tasks:
+            if isinstance(item, AsyncGenerator):
+                # Drill down into lower directories to get the validation tasks from them
+                tasks.remove(item)
+                new_tasks = [t async for t in item]
+                logger.debug("New Tasks: %s %s", len(new_tasks), new_tasks)
+                tasks.extend(
+                    new_tasks,
+                )
+            elif isinstance(item, asyncio.Task):
+                if item.done():
+                    logger.info("Validated %s", item.get_name())
+                    tasks.remove(item)
+                    results.append(item.result())
+
+        logger.debug("Tasks: %s %s", len(tasks), tasks)
+        logger.debug("Results: %s", len(results))
+
+        if len(tasks) == 0:
+            logger.info("Completed validation")
+            break
+
+        # Allow time for other processing to be performed
+        await asyncio.sleep(0.001)
+
+    return results
+
+
+def generate_validation_report(
+    validation_results: list[ValidationResult],
+) -> NamingStandardReport:
+    """Generate and print a formatted naming standard validation report.
+
+    This function takes a list of `ValidationResult` objects, creates a
+    `NamingStandardReport` instance, and prints the generated report.
+
+    Args:
+        validation_results: A list of ValidationResult objects that
+            contain the outcomes of the name standard checks.
+
+    Returns:
+        NamingStandardReport: An instance of `NamingStandardReport` containing
+            the validation results.
+    """
+    report = NamingStandardReport(validation_results=validation_results)
+    print(report.generate_report())  # noqa: T201
+    return report
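Hypothetical end-to-end usage of the new public API (the bucket URL is an example); check_naming_standard is a coroutine, so it is driven with asyncio.run:

    import asyncio

    from dapla_metadata.standards.standard_validators import (
        check_naming_standard,
        generate_validation_report,
    )

    # Validate every dataset file under the bucket, then print the summary.
    results = asyncio.run(check_naming_standard("gs://ssb-dapla-felles-data-produkt-test"))
    report = generate_validation_report(results)  # prints the formatted report
    print(report.num_failures)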
dapla_metadata/standards/utils/__init__.py
ADDED

@@ -0,0 +1 @@
+"""Utils for validating ssb standards."""
dapla_metadata/standards/utils/constants.py
ADDED

@@ -0,0 +1,49 @@
+"""Constants used in validating the SSB name standard."""
+
+from dapla_metadata.datasets.dataset_parser import SUPPORTED_DATASET_FILE_SUFFIXES
+
+SUCCESS = "Suksess"
+
+NAME_STANDARD_SUCCESS = "Filene dine er i samsvar med SSB-navnestandarden"
+
+NAME_STANDARD_VIOLATION = "Det er oppdaget brudd på SSB-navnestandard:"
+
+MISSING_BUCKET_NAME = "Filnavn mangler bøttenavn ref: https://manual.dapla.ssb.no/statistikkere/navnestandard.html#obligatoriske-mapper"
+MISSING_VERSION = "Filnavn mangler versjon ref: https://manual.dapla.ssb.no/statistikkere/navnestandard.html#filnavn"
+MISSING_PERIOD = "Filnavn mangler gyldighetsperiode ref: https://manual.dapla.ssb.no/statistikkere/navnestandard.html#filnavn"
+MISSING_SHORT_NAME = "Kortnavn for statistikk mangler ref: https://manual.dapla.ssb.no/statistikkere/navnestandard.html#obligatoriske-mapper"
+MISSING_DATA_STATE = "Mappe for datatilstand mangler ref: https://manual.dapla.ssb.no/statistikkere/navnestandard.html#obligatoriske-mapper"
+MISSING_DATASET_SHORT_NAME = "Filnavn mangler datasett kortnavn ref: https://manual.dapla.ssb.no/statistikkere/navnestandard.html#filnavn"
+
+INVALID_SYMBOLS = "Filnavn inneholder ulovlige tegn ref: https://manual.dapla.ssb.no/statistikkere/navnestandard.html#filnavn"
+
+PATH_IGNORED = "Ignorert, mappen er ikke underlagt krav til navnestandard."
+FILE_IGNORED = f"Ignorert, kun datasett med {', '.join(SUPPORTED_DATASET_FILE_SUFFIXES.keys())} filendelser valideres foreløpig."
+
+FILE_DOES_NOT_EXIST = "Filen eksisterer ikke. Validerer uansett."
+
+BUCKET_NAME_UNKNOWN = "Kan ikke validere bøttenavn"
+
+SSB_NAMING_STANDARD_REPORT = "SSB navnestandard rapport"
+SSB_NAMING_STANDARD_REPORT_SUCCESS_RATE = "Suksess rate"
+SSB_NAMING_STANDARD_REPORT_RESULT_BEST = "🚀 Fantastisk! Alt bestått! 🎉\n"
+SSB_NAMING_STANDARD_REPORT_RESULT_GOOD = (
+    "✅ Bra jobba! Fortsatt litt rom for forbedring. 😊\n"
+)
+SSB_NAMING_STANDARD_REPORT_RESULT_AVERAGE = (
+    "⚠️ Ikke verst! Men det er noen feil å fikse. 🔧\n"
+)
+SSB_NAMING_STANDARD_REPORT_RESULT_LOW = "❌ Mye å forbedre! Ta en grundig sjekk. 🛠️\n"
+SSB_NAMING_STANDARD_REPORT_RESULT_NO_SCORE = "👀 Ingen filer ble validert\n"
+SSB_NAMING_STANDARD_REPORT_FILES = "Antall filer validert"
+SSB_NAMING_STANDARD_REPORT_SUCCESS = "Antall filer som følger SSB navnestandard"
+SSB_NAMING_STANDARD_REPORT_VIOLATIONS = "Antall filer som bryter SSB navnestandard"
+
+IGNORED_FOLDERS = [
+    "temp",
+    "oppdrag",
+    "konfigurasjon",
+    "logg",
+    "tidsserier",
+    "migrert",
+]
dapla_metadata/variable_definitions/_utils/descriptions.py
CHANGED

@@ -2,15 +2,18 @@
 
 import logging
 from pathlib import Path
+from typing import TYPE_CHECKING
 from typing import cast
 
-import yaml
+import ruamel.yaml
 from pydantic import BaseModel
 from pydantic import Field
-from pydantic.config import JsonDict
 
 from dapla_metadata.variable_definitions._utils.config import get_descriptions_path
 
+if TYPE_CHECKING:
+    from pydantic.config import JsonDict
+
 logger = logging.getLogger(__name__)
 
 
@@ -34,7 +37,7 @@ def load_descriptions(file_path: Path) -> dict:
         dict: Parsed contents of the YAML file.
     """
     with Path.open(file_path, encoding="utf-8") as f:
-        return yaml.
+        return ruamel.yaml.YAML().load(f)
 
 
 def apply_norwegian_descriptions_to_model(
@@ -74,7 +77,7 @@ def apply_norwegian_descriptions_to_model(
             title=field_info.title,
             description=field_info.description,
             json_schema_extra=cast(
-                JsonDict,
+                "JsonDict",
                 {
                     "norwegian_description": new_description,
                     "annotation": field_info.annotation,
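The PyYAML-to-ruamel migration in a nutshell: a reusable YAML object whose load() presumably replaces a PyYAML call (the old line is truncated to "yaml." in this view). A minimal sketch:

    from io import StringIO

    import ruamel.yaml

    yaml = ruamel.yaml.YAML()  # round-trip loader/dumper by default

    data = yaml.load(StringIO("short_name:\n  nb: kortnavn\n"))
    assert data["short_name"]["nb"] == "kortnavn"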
dapla_metadata/variable_definitions/variable_definition.py
CHANGED

@@ -1,9 +1,10 @@
 import logging
 from datetime import date
+from io import StringIO
 from os import PathLike
 from pathlib import Path
 
-import yaml
+import ruamel.yaml
 from pydantic import ConfigDict
 from pydantic import PrivateAttr
 
@@ -122,6 +123,8 @@ class VariableDefinition(CompleteResponse):
                 update_draft=update_draft,
             ),
         )
+        self.__dict__.update(updated)
+
         logger.info(
             "Successfully updated variable definition '%s' with ID '%s'",
             updated.short_name,
@@ -228,6 +231,8 @@ class VariableDefinition(CompleteResponse):
                 valid_from=valid_from,
             ),
         )
+        self.__dict__.update(new_patch)
+
         logger.info(
             "Successfully created patch with patch ID '%s' for variable definition '%s' with ID '%s'",
             new_patch.patch_id,
@@ -301,6 +306,7 @@ class VariableDefinition(CompleteResponse):
                 validity_period=validity_period,
             ),
         )
+        self.__dict__.update(new_validity_period)
 
         logger.info(
             "Successfully created validity period that is valid from '%s' for variable definition '%s' with ID '%s'",
@@ -398,13 +404,17 @@ class VariableDefinition(CompleteResponse):
         return self._convert_to_yaml_output()
 
     def _convert_to_yaml_output(self) -> str:
-        (10 deleted lines not preserved in this diff view)
+        stream = StringIO()
+        with ruamel.yaml.YAML(
+            output=stream,
+        ) as yaml:
+            yaml.default_flow_style = False
+            yaml.allow_unicode = True
+            yaml.dump(
+                self.model_dump(
+                    mode="json",
+                    serialize_as_any=True,
+                    warnings="error",
+                ),
+            )
+        return stream.getvalue()
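The new self.__dict__.update(...) calls make the client-side object reflect the server response in place instead of requiring a re-fetch. A minimal illustration of the pattern with a plain pydantic model (VariableDefinition is itself a pydantic model; Definition here is hypothetical):

    from pydantic import BaseModel

    class Definition(BaseModel):
        short_name: str
        patch_id: int

    local = Definition(short_name="bus", patch_id=1)
    server_response = Definition(short_name="bus", patch_id=2)

    # dict.update accepts an iterable of (key, value) pairs, and iterating a
    # pydantic model yields exactly that, so this mutates `local` in place:
    local.__dict__.update(server_response)
    assert local.patch_id == 2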
{dapla_toolbelt_metadata-0.5.0.dist-info → dapla_toolbelt_metadata-0.6.1.dist-info}/METADATA
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: dapla-toolbelt-metadata
-Version: 0.5.0
+Version: 0.6.1
 Summary: Dapla Toolbelt Metadata
 License: MIT
 Author: Team Metadata
@@ -15,16 +15,16 @@ Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: arrow (>=1.3.0)
 Requires-Dist: beautifulsoup4 (>=4.12.3)
 Requires-Dist: cloudpathlib[gs] (>=0.17.0)
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: google-auth (>=2.38.0)
+Requires-Dist: lxml (>=5.3.1)
 Requires-Dist: pyarrow (>=8.0.0)
 Requires-Dist: pydantic (>=2.5.2)
 Requires-Dist: pyjwt (>=2.8.0)
 Requires-Dist: python-dotenv (>=1.0.1)
 Requires-Dist: requests (>=2.31.0)
-Requires-Dist: ruamel-yaml (>=0.18.10
+Requires-Dist: ruamel-yaml (>=0.18.10)
 Requires-Dist: ssb-datadoc-model (>=6.0.0,<7.0.0)
-Requires-Dist: ssb-klass-python (>=
+Requires-Dist: ssb-klass-python (>=1.0.1)
 Requires-Dist: typing-extensions (>=4.12.2)
 Project-URL: Changelog, https://github.com/statisticsnorway/dapla-toolbelt-metadata/releases
 Project-URL: Documentation, https://statisticsnorway.github.io/dapla-toolbelt-metadata
{dapla_toolbelt_metadata-0.5.0.dist-info → dapla_toolbelt_metadata-0.6.1.dist-info}/RECORD
RENAMED

@@ -1,15 +1,15 @@
-dapla_metadata/__init__.py,sha256=
+dapla_metadata/__init__.py,sha256=LI-qV1Vq1nKw8KkO0uppNwjOoXwd8niQFCx9jECn6Aw,415
 dapla_metadata/_shared/__init__.py,sha256=qUFgnVhBVlPRQP0ePmY76c8FvWRrJ-9c5GvzibwERnQ,103
 dapla_metadata/_shared/config.py,sha256=QqXcmP66AfXF8wi6FMsa7et7kH2k4EJPOF4IELKuQig,3213
 dapla_metadata/_shared/enums.py,sha256=WHkH1d8xw41gOly6au_izZB1_-6XTcKu5rhBWUImjp8,509
 dapla_metadata/_shared/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-dapla_metadata/dapla/__init__.py,sha256=
-dapla_metadata/dapla/user_info.py,sha256=
+dapla_metadata/dapla/__init__.py,sha256=tkapF-YwmruPPrKvN3pEoCZqb7xvJx_ogBM8XyGMuJI,130
+dapla_metadata/dapla/user_info.py,sha256=bENez-ICt9ySR8orYebO68Q3_2LkIW9QTL58DTctmEQ,4833
 dapla_metadata/datasets/__init__.py,sha256=TvzskpdFC6hGcC9_55URT5jr5wNAPzXuISd2UjJWM_8,280
 dapla_metadata/datasets/code_list.py,sha256=gNSJsDoQsOgwr-nivNoV51cf19ar_I_Qd9S8LXhdJVE,9065
 dapla_metadata/datasets/core.py,sha256=gp-zTJyn5Z_9bbmHF749OUx0kpYviiDAP9MDwSZiRW8,21768
-dapla_metadata/datasets/dapla_dataset_path_info.py,sha256=
-dapla_metadata/datasets/dataset_parser.py,sha256=
+dapla_metadata/datasets/dapla_dataset_path_info.py,sha256=zdkVjxlqXMBe7eTAneUrTDP0_fx7JsEQ_0JrKjREhfU,26854
+dapla_metadata/datasets/dataset_parser.py,sha256=bc3KOIDQGgdZMPh3XVHhiKMsY6FxIY9glvGlwTM4g7I,8233
 dapla_metadata/datasets/external_sources/__init__.py,sha256=qvIdXwqyEmXNUCB94ZtZXRzifdW4hiXASFFPtC70f6E,83
 dapla_metadata/datasets/external_sources/external_sources.py,sha256=9eIcOIUbaodNX1w9Tj2wl4U4wUmr5kF1R0i01fKUzGs,2974
 dapla_metadata/datasets/model_backwards_compatibility.py,sha256=0NZM0j3P3ngmssH69t24BKagAxEL3veb_n5TkWK2jaA,19167
@@ -19,7 +19,12 @@ dapla_metadata/datasets/statistic_subject_mapping.py,sha256=QdC22DUBOdRgsfmTTEUr
 dapla_metadata/datasets/utility/__init__.py,sha256=pp6tUcgUbo8iq9OPtFKQrTbLuI3uY7NHptwWSTpasOU,33
 dapla_metadata/datasets/utility/constants.py,sha256=SqZMc1v8rO2b_nRFJR7frVd0TAGvvxzIPEIzkqOuBSw,2444
 dapla_metadata/datasets/utility/enums.py,sha256=SpV4xlmP1YMaJPbmX03hqRLHUOhXIk5gquTeJ8G_5OE,432
-dapla_metadata/datasets/utility/utils.py,sha256=
+dapla_metadata/datasets/utility/utils.py,sha256=belHeJ3a8PWtfWRQoGSsoMPu72VAhryLnlYR7gg9kZM,18361
+dapla_metadata/standards/__init__.py,sha256=n8jnMrudLuScSdfQ4UMJorc-Ptg3Y1-ilT8zAaQnM70,179
+dapla_metadata/standards/name_validator.py,sha256=VercKsO7bNDEfAXy_oxMadL8KLVhxmmTe0T8sNlwzm0,9179
+dapla_metadata/standards/standard_validators.py,sha256=tcCiCI76wUVtMzXA2oCgdauZc0uGgUi11FKu-t7KGwQ,3767
+dapla_metadata/standards/utils/__init__.py,sha256=AiM7JcpFsAgyuCyLDYZo9kI94wvIImMDGoV2lKhS4pE,42
+dapla_metadata/standards/utils/constants.py,sha256=SiWViQo90PXruWeACuW9Y1iNqmFoDVe0dP4WfO29PXU,2498
 dapla_metadata/variable_definitions/__init__.py,sha256=j_Nn5mnlZ2uio9moDFLE2xpALqrYpupIZMlvwbLuEuA,391
 dapla_metadata/variable_definitions/_generated/.openapi-generator/FILES,sha256=hfNllHEkFODP0XbgqZB5Tz2mmEBFeAeMplXXslczo1E,634
 dapla_metadata/variable_definitions/_generated/.openapi-generator/VERSION,sha256=Y6lrqS2bXoujk5K-DCAwRFdRmkCKuTgvlngEx6FY5So,7
@@ -70,15 +75,15 @@ dapla_metadata/variable_definitions/_utils/__init__.py,sha256=qAhRLJoTBqtR3f9xRX
 dapla_metadata/variable_definitions/_utils/_client.py,sha256=v1-9VjrdPI6-sroam5vXMPEV1dQMPsYk7KyGd48HjYw,971
 dapla_metadata/variable_definitions/_utils/config.py,sha256=PZIe0XsrYKsF5SiATCx5EoSn-s9EbYAteiA2k5FxRTw,2475
 dapla_metadata/variable_definitions/_utils/constants.py,sha256=b0HOSPeTbY0Zjs10YIOMDx8MsSQF12ES9tvXlj8VTao,1483
-dapla_metadata/variable_definitions/_utils/descriptions.py,sha256=
+dapla_metadata/variable_definitions/_utils/descriptions.py,sha256=bB5QHNc4eOhmpLQHCty-CP5_aA82chkICifXw430suI,2746
 dapla_metadata/variable_definitions/_utils/files.py,sha256=sK6rYKJLGp3nSySLPM9WX5SFOFSit_envi9knUmZWi0,9253
 dapla_metadata/variable_definitions/_utils/template_files.py,sha256=hCINDyGFYJtFaWP-0Qjbrv0S6GFkPccIDyKD0KcPtuA,3670
 dapla_metadata/variable_definitions/_utils/variable_definition_files.py,sha256=ePlbsrVl1JNMDUomS-ldYOeOilmcjQy0I5RhorShE2o,2785
 dapla_metadata/variable_definitions/exceptions.py,sha256=z6Gtd84FboDu7vWjC3wathIF7I0gF0imtRhwMkr16lY,7851
 dapla_metadata/variable_definitions/resources/vardef_model_descriptions_nb.yaml,sha256=cA1jh6JBTPWluSc2fhC__lMj8wmDrlk3CJODzSY8nKY,4771
 dapla_metadata/variable_definitions/vardef.py,sha256=ZBV8ezGRJ8rsmb2H-ZyVX3-DNnMDPIZsHa7MzlcEkNM,11340
-dapla_metadata/variable_definitions/variable_definition.py,sha256=
-dapla_toolbelt_metadata-0.5.0.dist-info/LICENSE,sha256=
-dapla_toolbelt_metadata-0.5.0.dist-info/METADATA,sha256=
-dapla_toolbelt_metadata-0.5.0.dist-info/WHEEL,sha256=
-dapla_toolbelt_metadata-0.5.0.dist-info/RECORD,,
+dapla_metadata/variable_definitions/variable_definition.py,sha256=Q_XjMKaJhPwoM0vyfgYVWKYNAYFX0caNWWthXmQxeNs,14784
+dapla_toolbelt_metadata-0.6.1.dist-info/LICENSE,sha256=np3IfD5m0ZUofn_kVzDZqliozuiO6wrktw3LRPjyEiI,1073
+dapla_toolbelt_metadata-0.6.1.dist-info/METADATA,sha256=f4K7r9ABjeJAuZ-inim82NKKsBMj5u6Qvl_DynYi_EM,4866
+dapla_toolbelt_metadata-0.6.1.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+dapla_toolbelt_metadata-0.6.1.dist-info/RECORD,,

{dapla_toolbelt_metadata-0.5.0.dist-info → dapla_toolbelt_metadata-0.6.1.dist-info}/LICENSE
RENAMED

File without changes