dapla-toolbelt-metadata 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,188 @@
+ """Handle validation for metadata with pydantic validators and custom warnings."""
+
+ from __future__ import annotations
+
+ import logging
+ import warnings
+ from typing import TYPE_CHECKING
+ from typing import TextIO
+
+ from datadoc_model import model
+ from pydantic import model_validator
+ from typing_extensions import Self
+
+ from dataset.utility.constants import DATE_VALIDATION_MESSAGE
+ from dataset.utility.constants import NUM_OBLIGATORY_DATASET_FIELDS
+ from dataset.utility.constants import NUM_OBLIGATORY_VARIABLES_FIELDS
+ from dataset.utility.constants import OBLIGATORY_METADATA_WARNING
+ from dataset.utility.utils import get_missing_obligatory_dataset_fields
+ from dataset.utility.utils import get_missing_obligatory_variables_fields
+ from dataset.utility.utils import get_timestamp_now
+ from dataset.utility.utils import incorrect_date_order
+ from dataset.utility.utils import num_obligatory_dataset_fields_completed
+ from dataset.utility.utils import num_obligatory_variables_fields_completed
+ from dataset.utility.utils import set_variables_inherit_from_dataset
+
+ if TYPE_CHECKING:
+     from datetime import datetime
+
+ logger = logging.getLogger(__name__)
+
+
+ class ValidateDatadocMetadata(model.DatadocMetadata):
+     """Class that inherits from DatadocMetadata, providing additional validation."""
+
+     @model_validator(mode="after")
+     def check_date_order(self) -> Self:
+         """Validate the order of date fields.
+
+         Check that dataset and variable date fields `contains_data_from` and
+         `contains_data_until` are in chronological order.
+
+         Mode: This validator runs after other validation.
+
+         Returns:
+             The instance of the model after validation.
+
+         Raises:
+             ValueError: If the `contains_data_until` date is earlier than the
+                 `contains_data_from` date.
+         """
+         if self.dataset is not None and incorrect_date_order(
+             self.dataset.contains_data_from,
+             self.dataset.contains_data_until,
+         ):
+             raise ValueError(DATE_VALIDATION_MESSAGE)
+         if self.variables is not None:
+             for v in self.variables:
+                 if incorrect_date_order(v.contains_data_from, v.contains_data_until):
+                     raise ValueError(DATE_VALIDATION_MESSAGE)
+         return self
+
+     @model_validator(mode="after")
+     def check_metadata_created_date(self) -> Self:
+         """Ensure `metadata_created_date` is set for the dataset.
+
+         Sets the current timestamp if `metadata_created_date` is None.
+
+         Mode: This validator runs after other validation.
+
+         Returns:
+             The instance of the model after validation.
+         """
+         timestamp: datetime = get_timestamp_now()  # --check-untyped-defs
+         if self.dataset is not None and self.dataset.metadata_created_date is None:
+             self.dataset.metadata_created_date = timestamp
+         return self
+
+     @model_validator(mode="after")
+     def check_inherit_values(self) -> Self:
+         """Inherit values from dataset to variables if not set.
+
+         Sets values for 'data source', 'temporality type', 'contains data from',
+         and 'contains data until' if they are None.
+
+         Mode: This validator runs after other validation.
+
+         Returns:
+             The instance of the model after validation.
+         """
+         if self.variables and self.dataset is not None:
+             set_variables_inherit_from_dataset(self.dataset, self.variables)
+         return self
+
+     @model_validator(mode="after")
+     def check_obligatory_dataset_metadata(self) -> Self:
+         """Check obligatory dataset fields and issue a warning if any are missing.
+
+         Mode:
+             This validator runs after other validation.
+
+         Returns:
+             The instance of the model after validation.
+
+         Warns:
+             ObligatoryDatasetWarning: If not all obligatory dataset metadata fields
+                 are filled in.
+         """
+         if (
+             self.dataset is not None
+             and num_obligatory_dataset_fields_completed(
+                 self.dataset,
+             )
+             != NUM_OBLIGATORY_DATASET_FIELDS
+         ):
+             warnings.warn(
+                 f"{OBLIGATORY_METADATA_WARNING} {get_missing_obligatory_dataset_fields(self.dataset)}",
+                 ObligatoryDatasetWarning,
+                 stacklevel=2,
+             )
+             logger.warning(
+                 "Type warning: %s.%s %s",
+                 ObligatoryDatasetWarning,
+                 OBLIGATORY_METADATA_WARNING,
+                 get_missing_obligatory_dataset_fields(self.dataset),
+             )
+
+         return self
+
+     @model_validator(mode="after")
+     def check_obligatory_variables_metadata(self) -> Self:
+         """Check obligatory variable fields and issue a warning if any are missing.
+
+         Mode:
+             This validator runs after other validation.
+
+         Returns:
+             The instance of the model after validation.
+
+         Warns:
+             ObligatoryVariableWarning: If not all obligatory variable metadata fields
+                 are filled in.
+         """
+         if self.variables is not None and num_obligatory_variables_fields_completed(
+             self.variables,
+         ) != (NUM_OBLIGATORY_VARIABLES_FIELDS * len(self.variables)):
+             warnings.warn(
+                 f"{OBLIGATORY_METADATA_WARNING} {get_missing_obligatory_variables_fields(self.variables)}",
+                 ObligatoryVariableWarning,
+                 stacklevel=2,
+             )
+             logger.warning(
+                 "Type warning: %s.%s %s",
+                 ObligatoryVariableWarning,
+                 OBLIGATORY_METADATA_WARNING,
+                 get_missing_obligatory_variables_fields(self.variables),
+             )
+
+         return self
+
+
+ class ValidationWarning(UserWarning):
+     """Custom warning for validation purposes."""
+
+
+ class ObligatoryDatasetWarning(UserWarning):
+     """Custom warning for checking obligatory metadata for dataset."""
+
+
+ class ObligatoryVariableWarning(UserWarning):
+     """Custom warning for checking obligatory metadata for variables."""
+
+
+ def custom_warning_handler(  # noqa: PLR0913 removing fields causes incompatible types
+     message: Warning | str,
+     category: type[Warning],
+     filename: str,
+     lineno: int,
+     file: TextIO | None = None,  # noqa: ARG001 removing causes incompatible types
+     line: str | None = None,  # noqa: ARG001 removing causes incompatible types
+ ) -> None:
+     """Handle warnings."""
+     print(  # noqa: T201
+         f"Warning: {message}, Category: {category.__name__}, Filename: {filename}, Line: {lineno}",
+     )
+
+
+ warnings.showwarning = custom_warning_handler
+ warnings.simplefilter("always")
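These validators all fire when a ValidateDatadocMetadata instance is constructed, and the module installs custom_warning_handler at import time so warnings print in a compact one-line form. The sketch below is a minimal, hypothetical usage example: it assumes the datadoc_model Dataset type coerces ISO date strings into dates and that the remaining DatadocMetadata fields are optional; the example values are illustrative only.

import warnings

from datadoc_model import model
from pydantic import ValidationError

# Reversed dates: contains_data_until precedes contains_data_from, so
# check_date_order raises and pydantic surfaces it as a ValidationError.
try:
    ValidateDatadocMetadata(
        dataset=model.Dataset(
            contains_data_from="2024-01-01",
            contains_data_until="2023-01-01",
        ),
        variables=[],
    )
except ValidationError as error:
    print(error)  # wraps the ValueError carrying DATE_VALIDATION_MESSAGE

# Consistent dates: construction succeeds, but the obligatory-metadata
# validator warns about the many dataset fields left unset.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    ValidateDatadocMetadata(
        dataset=model.Dataset(
            contains_data_from="2024-01-01",
            contains_data_until="2024-12-31",
        ),
        variables=[],
    )
print([w.category.__name__ for w in caught])  # ['ObligatoryDatasetWarning']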
dataset/py.typed ADDED
File without changes
@@ -0,0 +1,182 @@
+ from __future__ import annotations
+
+ import logging
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING
+
+ import bs4
+ import requests
+ from bs4 import BeautifulSoup
+ from bs4 import ResultSet
+
+ from dataset.external_sources.external_sources import GetExternalSource
+ from dataset.utility.enums import SupportedLanguages
+
+ if TYPE_CHECKING:
+     from concurrent.futures import ThreadPoolExecutor
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class Subject:
+     """Base class for Primary and Secondary subjects.
+
+     A statistical subject is a related grouping of statistics.
+     """
+
+     titles: dict[str, str]
+     subject_code: str
+
+     def get_title(self, language: SupportedLanguages) -> str:
+         """Get the title in the given language."""
+         try:
+             return self.titles[
+                 (
+                     # Adjust to language codes in the StatisticSubjectMapping structure.
+                     "no"
+                     if language
+                     in [
+                         SupportedLanguages.NORSK_BOKMÅL,
+                         SupportedLanguages.NORSK_NYNORSK,
+                     ]
+                     else "en"
+                 )
+             ]
+         except KeyError:
+             logger.exception(
+                 "Could not find title for subject %s and language: %s",
+                 self,
+                 language.name,
+             )
+             return ""
+
+
+ @dataclass
+ class SecondarySubject(Subject):
+     """Data structure for secondary subjects or 'delemne'."""
+
+     statistic_short_names: list[str]
+
+
+ @dataclass
+ class PrimarySubject(Subject):
+     """Data structure for primary subjects or 'hovedemne'."""
+
+     secondary_subjects: list[SecondarySubject]
+
+
+ class StatisticSubjectMapping(GetExternalSource):
+     """Provide mapping between statistic short name and primary and secondary subject."""
+
+     def __init__(
+         self,
+         executor: ThreadPoolExecutor,
+         source_url: str | None,
+     ) -> None:
+         """Retrieve the statistical structure document from the given URL.
+
+         Initializes the mapping based on values in the statistical structure document sourced at `source_url`.
+
+         Args:
+             executor: The ThreadPoolExecutor which will run the job of fetching the statistical structure document.
+             source_url: The URL from which to fetch the statistical structure document.
+         """
+         self.source_url = source_url
+
+         self._statistic_subject_structure_xml: ResultSet | None = None
+
+         self._primary_subjects: list[PrimarySubject] = []
+
+         super().__init__(executor)
+
+     def get_secondary_subject(self, statistic_short_name: str | None) -> str | None:
+         """Look up the secondary subject for the given statistic short name in the mapping dict.
+
+         Returns the secondary subject string if found, else None.
+         """
+         for p in self.primary_subjects:
+             for s in p.secondary_subjects:
+                 if statistic_short_name in s.statistic_short_names:
+                     logger.debug("Got %s from %s", s, statistic_short_name)
+                     return s.subject_code
+
+         logger.debug("No secondary subject found for %s", statistic_short_name)
+         return None
+
+     @staticmethod
+     def _extract_titles(titles_xml: bs4.element.Tag) -> dict[str, str]:
+         titles = {}
+         for title in titles_xml.find_all("tittel"):
+             titles[title["sprak"]] = title.text
+         return titles
+
+     def _fetch_data_from_external_source(self) -> ResultSet | None:
+         """Fetch the statistical structure document from `source_url`.
+
+         Returns a BeautifulSoup ResultSet.
+         """
+         try:
+             url = str(self.source_url)
+             response = requests.get(url, timeout=30)
+             response.encoding = "utf-8"
+             logger.debug("Got response %s from %s", response, url)
+             soup = BeautifulSoup(response.text, features="xml")
+             return soup.find_all("hovedemne")
+         except requests.exceptions.RequestException:
+             logger.exception(
+                 "Exception while fetching statistical structure",
+             )
+             return None
+
+     def _parse_statistic_subject_structure_xml(
+         self,
+         statistical_structure_xml: ResultSet,
+     ) -> list[PrimarySubject]:
+         primary_subjects: list[PrimarySubject] = []
+         for p in statistical_structure_xml:
+             secondary_subjects: list[SecondarySubject] = [
+                 SecondarySubject(
+                     self._extract_titles(s.titler),
+                     s["emnekode"],
+                     [statistikk["kortnavn"] for statistikk in s.find_all("Statistikk")],
+                 )
+                 for s in p.find_all("delemne")
+             ]
+
+             primary_subjects.append(
+                 PrimarySubject(
+                     self._extract_titles(p.titler),
+                     p["emnekode"],
+                     secondary_subjects,
+                 ),
+             )
+         return primary_subjects
+
+     @property
+     def primary_subjects(self) -> list[PrimarySubject]:
+         """Getter for primary subjects."""
+         if not self._primary_subjects:
+             self._parse_xml_if_loaded()
+         logger.debug("Got %s primary subjects", len(self._primary_subjects))
+         return self._primary_subjects
+
+     def _parse_xml_if_loaded(self) -> bool:
+         """Check whether the XML is loaded, and parse it if so.
+
+         Returns `True` if it is loaded and parsed.
+         """
+         if self.check_if_external_data_is_loaded():
+             self._statistic_subject_structure_xml = self.retrieve_external_data()
+
+             if self._statistic_subject_structure_xml is not None:
+                 self._primary_subjects = self._parse_statistic_subject_structure_xml(
+                     self._statistic_subject_structure_xml,
+                 )
+             logger.debug(
+                 "Thread finished. Parsed %s primary subjects",
+                 len(self._primary_subjects),
+             )
+             return True
+         logger.warning("Thread is not done. Cannot parse xml.")
+         return False
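A sketch of how the mapping might be driven, assuming GetExternalSource submits _fetch_data_from_external_source as a job on the supplied executor; the URL and the statistic short name below are placeholders, not real endpoints or identifiers.

from concurrent.futures import ThreadPoolExecutor

SOURCE_URL = "https://example.com/emnestruktur.xml"  # hypothetical endpoint

with ThreadPoolExecutor(max_workers=1) as executor:
    mapping = StatisticSubjectMapping(executor, SOURCE_URL)
    # Exiting the context manager waits for the fetch job to finish.

# Accessing primary_subjects triggers parsing of the fetched XML.
for primary in mapping.primary_subjects:
    print(primary.subject_code, primary.get_title(SupportedLanguages.ENGLISH))

# Map a statistic short name (illustrative) to its secondary subject code.
print(mapping.get_secondary_subject("some_short_name"))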
dataset/user_info.py ADDED
@@ -0,0 +1,88 @@
+ from __future__ import annotations
+
+ import contextlib
+ import logging
+ from typing import Protocol
+
+ import jwt
+
+ from dataset import config
+ from dataset.utility.enums import DaplaRegion
+ from dataset.utility.enums import DaplaService
+
+ logger = logging.getLogger(__name__)
+
+
+ PLACEHOLDER_EMAIL_ADDRESS = "default_user@ssb.no"
+
+
+ class UserInfo(Protocol):
+     """Information about the current user.
+
+     Implementations may be provided for different platforms or testing.
+     """
+
+     @property
+     def short_email(self) -> str | None:
+         """Get the short email address."""
+         ...
+
+
+ class UnknownUserInfo:
+     """Fallback when no implementation is found."""
+
+     @property
+     def short_email(self) -> str | None:
+         """Unknown email address."""
+         return None
+
+
+ class TestUserInfo:
+     """Information about the current user for local development and testing."""
+
+     @property
+     def short_email(self) -> str | None:
+         """Get the short email address."""
+         return PLACEHOLDER_EMAIL_ADDRESS
+
+
+ class DaplaLabUserInfo:
+     """Information about the current user when running on Dapla Lab."""
+
+     @property
+     def short_email(self) -> str | None:
+         """Get the short email address."""
+         encoded_jwt = config.get_oidc_token()
+         if encoded_jwt:
+             # The JWT has been verified by the platform prior to injection; no need to verify.
+             decoded_jwt = jwt.decode(encoded_jwt, options={"verify_signature": False})
+             with contextlib.suppress(KeyError):
+                 # If the email can't be found in the JWT, fall through and return None.
+                 return decoded_jwt["email"]
+
+         logger.warning(
+             "Could not access JWT from environment. Could not get short email address.",
+         )
+         return None
+
+
+ class JupyterHubUserInfo:
+     """Information about the current user when running on JupyterHub."""
+
+     @property
+     def short_email(self) -> str | None:
+         """Get the short email address."""
+         return config.get_jupyterhub_user()
+
+
+ def get_user_info_for_current_platform() -> UserInfo:
+     """Return the correct implementation of UserInfo for the current platform."""
+     if config.get_dapla_region() == DaplaRegion.DAPLA_LAB:
+         return DaplaLabUserInfo()
+     elif config.get_dapla_service() == DaplaService.JUPYTERLAB:  # noqa: RET505
+         return JupyterHubUserInfo()
+     else:
+         logger.warning(
+             "It was not possible to retrieve user information! Some fields may not be set.",
+         )
+         return UnknownUserInfo()
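A short sketch of how the factory and the Protocol fit together. The region and service values are read from the environment by config, and format_owner is a hypothetical helper introduced only for illustration.

user_info = get_user_info_for_current_platform()
print(user_info.short_email)  # None when the platform cannot be determined


# Any object with a short_email property satisfies the UserInfo Protocol,
# which keeps call sites easy to exercise in tests.
def format_owner(info: UserInfo) -> str:
    return info.short_email or PLACEHOLDER_EMAIL_ADDRESS


assert format_owner(TestUserInfo()) == PLACEHOLDER_EMAIL_ADDRESS
assert format_owner(UnknownUserInfo()) == PLACEHOLDER_EMAIL_ADDRESS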
@@ -0,0 +1 @@
+ """Utility files for Datadoc."""
@@ -0,0 +1,92 @@
+ """Repository for constant values in Datadoc backend."""
+
+ from datadoc_model.model import LanguageStringType
+ from datadoc_model.model import LanguageStringTypeItem
+
+ VALIDATION_ERROR = "Validation error: "
+
+ DATE_VALIDATION_MESSAGE = f"{VALIDATION_ERROR}contains_data_from must be the same or earlier date than contains_data_until"
+
+ OBLIGATORY_METADATA_WARNING = "Obligatory metadata is missing: "
+
+ INCONSISTENCIES_MESSAGE = "Inconsistencies found between extracted and existing metadata. Inconsistencies are:"
+
+ OBLIGATORY_DATASET_METADATA_IDENTIFIERS: list = [
+     "assessment",
+     "dataset_state",
+     "dataset_status",
+     "name",
+     "description",
+     "data_source",
+     "population_description",
+     "version",
+     "version_description",
+     "unit_type",
+     "temporality_type",
+     "subject_field",
+     "spatial_coverage_description",
+     "owner",
+     "contains_data_from",
+     "contains_data_until",
+     "contains_personal_data",
+ ]
+
+ OBLIGATORY_DATASET_METADATA_IDENTIFIERS_MULTILANGUAGE = [
+     "name",
+     "description",
+     "population_description",
+     "version_description",
+     "spatial_coverage_description",
+ ]
+
+ OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS = [
+     "name",
+     "data_type",
+     "variable_role",
+     "is_personal_data",
+ ]
+
+ OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS_MULTILANGUAGE = [
+     "name",
+ ]
+
+ DEFAULT_SPATIAL_COVERAGE_DESCRIPTION = LanguageStringType(
+     [
+         LanguageStringTypeItem(
+             languageCode="nb",
+             languageText="Norge",
+         ),
+         LanguageStringTypeItem(
+             languageCode="nn",
+             languageText="Noreg",
+         ),
+         LanguageStringTypeItem(
+             languageCode="en",
+             languageText="Norway",
+         ),
+     ],
+ )
+
+ NUM_OBLIGATORY_DATASET_FIELDS = len(OBLIGATORY_DATASET_METADATA_IDENTIFIERS)
+
+ NUM_OBLIGATORY_VARIABLES_FIELDS = len(OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS)
+
+ DATASET_FIELDS_FROM_EXISTING_METADATA = [
+     "dataset_status",
+     "name",
+     "description",
+     "data_source",
+     "population_description",
+     "unit_type",
+     "temporality_type",
+     "subject_field",
+     "keyword",
+     "spatial_coverage_description",
+     "contains_personal_data",
+     "use_restriction",
+     "use_restriction_date",
+     "custom_type",
+     "owner",
+ ]
+
+ METADATA_DOCUMENT_FILE_SUFFIX = "__DOC.json"
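The NUM_OBLIGATORY_* counts are what the validators compare against. The sketch below is an illustrative re-implementation of the completeness check they imply; the real counting lives in dataset.utility.utils, so this helper is hypothetical.

def fields_completed(dataset: object) -> int:
    # Count the obligatory fields that have been given a value.
    return sum(
        1
        for identifier in OBLIGATORY_DATASET_METADATA_IDENTIFIERS
        if getattr(dataset, identifier, None) is not None
    )


# A dataset is complete when every obligatory field is set:
# fields_completed(dataset) == NUM_OBLIGATORY_DATASET_FIELDS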
@@ -0,0 +1,35 @@
+ """Enumerations used in Datadoc."""
+
+ from __future__ import annotations
+
+ from enum import Enum
+
+
+ class DaplaRegion(str, Enum):
+     """Dapla platforms/regions."""
+
+     DAPLA_LAB = "DAPLA_LAB"
+     BIP = "BIP"
+     ON_PREM = "ON_PREM"
+     CLOUD_RUN = "CLOUD_RUN"
+
+
+ class DaplaService(str, Enum):
+     """Dapla services."""
+
+     DATADOC = "DATADOC"
+     JUPYTERLAB = "JUPYTERLAB"
+     VS_CODE = "VS_CODE"
+     R_STUDIO = "R_STUDIO"
+     KILDOMATEN = "KILDOMATEN"
+
+
+ class SupportedLanguages(str, Enum):
+     """The list of languages metadata may be recorded in.
+
+     Reference: https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
+     """
+
+     NORSK_BOKMÅL = "nb"
+     NORSK_NYNORSK = "nn"
+     ENGLISH = "en"
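Note that the statistical-structure document only distinguishes Norwegian from English, so Subject.get_title (above) collapses both Norwegian variants to "no". A small sketch of that mapping, with a hypothetical helper name:

def to_structure_language_code(language: SupportedLanguages) -> str:
    # Both Bokmål and Nynorsk map to "no" in the statistical structure document.
    if language in (SupportedLanguages.NORSK_BOKMÅL, SupportedLanguages.NORSK_NYNORSK):
        return "no"
    return "en"


assert to_structure_language_code(SupportedLanguages.NORSK_NYNORSK) == "no"
assert to_structure_language_code(SupportedLanguages.ENGLISH) == "en"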