dapla-toolbelt-metadata 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,188 @@
+ """Handle validation for metadata with pydantic validators and custom warnings."""
+
+ from __future__ import annotations
+
+ import logging
+ import warnings
+ from typing import TYPE_CHECKING
+ from typing import TextIO
+
+ from datadoc_model import model
+ from pydantic import model_validator
+ from typing_extensions import Self
+
+ from dataset.utility.constants import DATE_VALIDATION_MESSAGE
+ from dataset.utility.constants import NUM_OBLIGATORY_DATASET_FIELDS
+ from dataset.utility.constants import NUM_OBLIGATORY_VARIABLES_FIELDS
+ from dataset.utility.constants import OBLIGATORY_METADATA_WARNING
+ from dataset.utility.utils import get_missing_obligatory_dataset_fields
+ from dataset.utility.utils import get_missing_obligatory_variables_fields
+ from dataset.utility.utils import get_timestamp_now
+ from dataset.utility.utils import incorrect_date_order
+ from dataset.utility.utils import num_obligatory_dataset_fields_completed
+ from dataset.utility.utils import num_obligatory_variables_fields_completed
+ from dataset.utility.utils import set_variables_inherit_from_dataset
+
+ if TYPE_CHECKING:
+     from datetime import datetime
+
+ logger = logging.getLogger(__name__)
+
+
+ class ValidateDatadocMetadata(model.DatadocMetadata):
+     """Class that inherits from DatadocMetadata, providing additional validation."""
+
+     @model_validator(mode="after")
+     def check_date_order(self) -> Self:
+         """Validate the order of date fields.
+
+         Check that dataset and variable date fields `contains_data_from` and
+         `contains_data_until` are in chronological order.
+
+         Mode: This validator runs after other validation.
+
+         Returns:
+             The instance of the model after validation.
+
+         Raises:
+             ValueError: If the `contains_data_until` date is earlier than the
+                 `contains_data_from` date.
+         """
+         if self.dataset is not None and incorrect_date_order(
+             self.dataset.contains_data_from,
+             self.dataset.contains_data_until,
+         ):
+             raise ValueError(DATE_VALIDATION_MESSAGE)
+         if self.variables is not None:
+             for v in self.variables:
+                 if incorrect_date_order(v.contains_data_from, v.contains_data_until):
+                     raise ValueError(DATE_VALIDATION_MESSAGE)
+         return self
+
+     @model_validator(mode="after")
+     def check_metadata_created_date(self) -> Self:
+         """Ensure `metadata_created_date` is set for the dataset.
+
+         Sets the current timestamp if `metadata_created_date` is None.
+
+         Mode: This validator runs after other validation.
+
+         Returns:
+             The instance of the model after validation.
+         """
+         timestamp: datetime = get_timestamp_now()  # --check-untyped-defs
+         if self.dataset is not None and self.dataset.metadata_created_date is None:
+             self.dataset.metadata_created_date = timestamp
+         return self
+
+     @model_validator(mode="after")
+     def check_inherit_values(self) -> Self:
+         """Inherit values from dataset to variables if not set.
+
+         Sets values for 'data source', 'temporality type', 'contains data from',
+         and 'contains data until' if they are None.
+
+         Mode: This validator runs after other validation.
+
+         Returns:
+             The instance of the model after validation.
+         """
+         if self.variables and self.dataset is not None:
+             set_variables_inherit_from_dataset(self.dataset, self.variables)
+         return self
+
+     @model_validator(mode="after")
+     def check_obligatory_dataset_metadata(self) -> Self:
+         """Check obligatory dataset fields and issue a warning if any are missing.
+
+         Mode:
+             This validator runs after other validation.
+
+         Returns:
+             The instance of the model after validation.
+
+         Warns:
+             ObligatoryDatasetWarning: If not all obligatory dataset metadata fields
+                 are filled in.
+         """
+         if (
+             self.dataset is not None
+             and num_obligatory_dataset_fields_completed(
+                 self.dataset,
+             )
+             != NUM_OBLIGATORY_DATASET_FIELDS
+         ):
+             warnings.warn(
+                 f"{OBLIGATORY_METADATA_WARNING} {get_missing_obligatory_dataset_fields(self.dataset)}",
+                 ObligatoryDatasetWarning,
+                 stacklevel=2,
+             )
+             logger.warning(
+                 "Type warning: %s.%s %s",
+                 ObligatoryDatasetWarning,
+                 OBLIGATORY_METADATA_WARNING,
+                 get_missing_obligatory_dataset_fields(self.dataset),
+             )
+
+         return self
+
+     @model_validator(mode="after")
+     def check_obligatory_variables_metadata(self) -> Self:
+         """Check obligatory variable fields and issue a warning if any are missing.
+
+         Mode:
+             This validator runs after other validation.
+
+         Returns:
+             The instance of the model after validation.
+
+         Warns:
+             ObligatoryVariableWarning: If not all obligatory variable metadata fields
+                 are filled in.
+         """
+         if self.variables is not None and num_obligatory_variables_fields_completed(
+             self.variables,
+         ) != (NUM_OBLIGATORY_VARIABLES_FIELDS * len(self.variables)):
+             warnings.warn(
+                 f"{OBLIGATORY_METADATA_WARNING} {get_missing_obligatory_variables_fields(self.variables)}",
+                 ObligatoryVariableWarning,
+                 stacklevel=2,
+             )
+             logger.warning(
+                 "Type warning: %s.%s %s",
+                 ObligatoryVariableWarning,
+                 OBLIGATORY_METADATA_WARNING,
+                 get_missing_obligatory_variables_fields(self.variables),
+             )
+
+         return self
+
+
+ class ValidationWarning(UserWarning):
+     """Custom warning for validation purposes."""
+
+
+ class ObligatoryDatasetWarning(UserWarning):
+     """Custom warning for checking obligatory metadata for dataset."""
+
+
+ class ObligatoryVariableWarning(UserWarning):
+     """Custom warning for checking obligatory metadata for variables."""
+
+
+ def custom_warning_handler(  # noqa: PLR0913 removing fields causes incompatible types
+     message: Warning | str,
+     category: type[Warning],
+     filename: str,
+     lineno: int,
+     file: TextIO | None = None,  # noqa: ARG001 removing causes incompatible types
+     line: str | None = None,  # noqa: ARG001 removing causes incompatible types
+ ) -> None:
+     """Handle warnings."""
+     print(  # noqa: T201
+         f"Warning: {message}, Category: {category.__name__}, Filename: {filename}, Line: {lineno}",
+     )
+
+
+ warnings.showwarning = custom_warning_handler
+ warnings.simplefilter("always")
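These validators all fire when a ValidateDatadocMetadata instance is constructed, and the module installs custom_warning_handler at import time so warnings print in a compact one-line form. The sketch below is a minimal, hypothetical usage example: it assumes the datadoc_model Dataset type coerces ISO date strings into dates and that the remaining DatadocMetadata fields are optional; the example values are illustrative only.

import warnings

from datadoc_model import model
from pydantic import ValidationError

# Reversed dates: contains_data_until precedes contains_data_from, so
# check_date_order raises and pydantic surfaces it as a ValidationError.
try:
    ValidateDatadocMetadata(
        dataset=model.Dataset(
            contains_data_from="2024-01-01",
            contains_data_until="2023-01-01",
        ),
        variables=[],
    )
except ValidationError as error:
    print(error)  # wraps the ValueError carrying DATE_VALIDATION_MESSAGE

# Consistent dates: construction succeeds, but the obligatory-metadata
# validator warns about the many dataset fields left unset.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    ValidateDatadocMetadata(
        dataset=model.Dataset(
            contains_data_from="2024-01-01",
            contains_data_until="2024-12-31",
        ),
        variables=[],
    )
print([w.category.__name__ for w in caught])  # ['ObligatoryDatasetWarning']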
dataset/py.typed ADDED
File without changes
@@ -0,0 +1,182 @@
+ from __future__ import annotations
+
+ import logging
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING
+
+ import bs4
+ import requests
+ from bs4 import BeautifulSoup
+ from bs4 import ResultSet
+
+ from dataset.external_sources.external_sources import GetExternalSource
+ from dataset.utility.enums import SupportedLanguages
+
+ if TYPE_CHECKING:
+     from concurrent.futures import ThreadPoolExecutor
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class Subject:
+     """Base class for Primary and Secondary subjects.
+
+     A statistical subject is a related grouping of statistics.
+     """
+
+     titles: dict[str, str]
+     subject_code: str
+
+     def get_title(self, language: SupportedLanguages) -> str:
+         """Get the title in the given language."""
+         try:
+             return self.titles[
+                 (
+                     # Adjust to language codes in the StatisticSubjectMapping structure.
+                     "no"
+                     if language
+                     in [
+                         SupportedLanguages.NORSK_BOKMÅL,
+                         SupportedLanguages.NORSK_NYNORSK,
+                     ]
+                     else "en"
+                 )
+             ]
+         except KeyError:
+             logger.exception(
+                 "Could not find title for subject %s and language: %s",
+                 self,
+                 language.name,
+             )
+             return ""
+
+
+ @dataclass
+ class SecondarySubject(Subject):
+     """Data structure for secondary subjects or 'delemne'."""
+
+     statistic_short_names: list[str]
+
+
+ @dataclass
+ class PrimarySubject(Subject):
+     """Data structure for primary subjects or 'hovedemne'."""
+
+     secondary_subjects: list[SecondarySubject]
+
+
+ class StatisticSubjectMapping(GetExternalSource):
+     """Provide mapping between statistic short name and primary and secondary subject."""
+
+     def __init__(
+         self,
+         executor: ThreadPoolExecutor,
+         source_url: str | None,
+     ) -> None:
+         """Retrieve the statistical structure document from the given URL.
+
+         Initializes the mapping based on values in the statistical structure document sourced at `source_url`.
+
+         Args:
+             executor: The ThreadPoolExecutor which will run the job of fetching the statistical structure document.
+             source_url: The URL from which to fetch the statistical structure document.
+         """
+         self.source_url = source_url
+
+         self._statistic_subject_structure_xml: ResultSet | None = None
+
+         self._primary_subjects: list[PrimarySubject] = []
+
+         super().__init__(executor)
+
+     def get_secondary_subject(self, statistic_short_name: str | None) -> str | None:
+         """Look up the secondary subject for the given statistic short name in the mapping dict.
+
+         Returns the secondary subject string if found, else None.
+         """
+         for p in self.primary_subjects:
+             for s in p.secondary_subjects:
+                 if statistic_short_name in s.statistic_short_names:
+                     logger.debug("Got %s from %s", s, statistic_short_name)
+                     return s.subject_code
+
+         logger.debug("No secondary subject found for %s", statistic_short_name)
+         return None
+
+     @staticmethod
+     def _extract_titles(titles_xml: bs4.element.Tag) -> dict[str, str]:
+         titles = {}
+         for title in titles_xml.find_all("tittel"):
+             titles[title["sprak"]] = title.text
+         return titles
+
+     def _fetch_data_from_external_source(self) -> ResultSet | None:
+         """Fetch the statistical structure document from `source_url`.
+
+         Returns a BeautifulSoup ResultSet.
+         """
+         try:
+             url = str(self.source_url)
+             response = requests.get(url, timeout=30)
+             response.encoding = "utf-8"
+             logger.debug("Got response %s from %s", response, url)
+             soup = BeautifulSoup(response.text, features="xml")
+             return soup.find_all("hovedemne")
+         except requests.exceptions.RequestException:
+             logger.exception(
+                 "Exception while fetching statistical structure",
+             )
+             return None
+
+     def _parse_statistic_subject_structure_xml(
+         self,
+         statistical_structure_xml: ResultSet,
+     ) -> list[PrimarySubject]:
+         primary_subjects: list[PrimarySubject] = []
+         for p in statistical_structure_xml:
+             secondary_subjects: list[SecondarySubject] = [
+                 SecondarySubject(
+                     self._extract_titles(s.titler),
+                     s["emnekode"],
+                     [statistikk["kortnavn"] for statistikk in s.find_all("Statistikk")],
+                 )
+                 for s in p.find_all("delemne")
+             ]
+
+             primary_subjects.append(
+                 PrimarySubject(
+                     self._extract_titles(p.titler),
+                     p["emnekode"],
+                     secondary_subjects,
+                 ),
+             )
+         return primary_subjects
+
+     @property
+     def primary_subjects(self) -> list[PrimarySubject]:
+         """Getter for primary subjects."""
+         if not self._primary_subjects:
+             self._parse_xml_if_loaded()
+         logger.debug("Got %s primary subjects", len(self._primary_subjects))
+         return self._primary_subjects
+
+     def _parse_xml_if_loaded(self) -> bool:
+         """Check whether the XML is loaded, and parse it if so.
+
+         Returns `True` if it is loaded and parsed.
+         """
+         if self.check_if_external_data_is_loaded():
+             self._statistic_subject_structure_xml = self.retrieve_external_data()
+
+             if self._statistic_subject_structure_xml is not None:
+                 self._primary_subjects = self._parse_statistic_subject_structure_xml(
+                     self._statistic_subject_structure_xml,
+                 )
+             logger.debug(
+                 "Thread finished. Parsed %s primary subjects",
+                 len(self._primary_subjects),
+             )
+             return True
+         logger.warning("Thread is not done. Cannot parse xml.")
+         return False
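A sketch of how the mapping might be driven, assuming GetExternalSource submits _fetch_data_from_external_source as a job on the supplied executor; the URL and the statistic short name below are placeholders, not real endpoints or identifiers.

from concurrent.futures import ThreadPoolExecutor

SOURCE_URL = "https://example.com/emnestruktur.xml"  # hypothetical endpoint

with ThreadPoolExecutor(max_workers=1) as executor:
    mapping = StatisticSubjectMapping(executor, SOURCE_URL)
    # Exiting the context manager waits for the fetch job to finish.

# Accessing primary_subjects triggers parsing of the fetched XML.
for primary in mapping.primary_subjects:
    print(primary.subject_code, primary.get_title(SupportedLanguages.ENGLISH))

# Map a statistic short name (illustrative) to its secondary subject code.
print(mapping.get_secondary_subject("some_short_name"))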
dataset/user_info.py ADDED
@@ -0,0 +1,88 @@
+ from __future__ import annotations
+
+ import contextlib
+ import logging
+ from typing import Protocol
+
+ import jwt
+
+ from dataset import config
+ from dataset.utility.enums import DaplaRegion
+ from dataset.utility.enums import DaplaService
+
+ logger = logging.getLogger(__name__)
+
+
+ PLACEHOLDER_EMAIL_ADDRESS = "default_user@ssb.no"
+
+
+ class UserInfo(Protocol):
+     """Information about the current user.
+
+     Implementations may be provided for different platforms or testing.
+     """
+
+     @property
+     def short_email(self) -> str | None:
+         """Get the short email address."""
+         ...
+
+
+ class UnknownUserInfo:
+     """Fallback when no implementation is found."""
+
+     @property
+     def short_email(self) -> str | None:
+         """Unknown email address."""
+         return None
+
+
+ class TestUserInfo:
+     """Information about the current user for local development and testing."""
+
+     @property
+     def short_email(self) -> str | None:
+         """Get the short email address."""
+         return PLACEHOLDER_EMAIL_ADDRESS
+
+
+ class DaplaLabUserInfo:
+     """Information about the current user when running on Dapla Lab."""
+
+     @property
+     def short_email(self) -> str | None:
+         """Get the short email address."""
+         encoded_jwt = config.get_oidc_token()
+         if encoded_jwt:
+             # The JWT has been verified by the platform prior to injection; no need to verify.
+             decoded_jwt = jwt.decode(encoded_jwt, options={"verify_signature": False})
+             with contextlib.suppress(KeyError):
+                 # If the email can't be found in the JWT, fall through and return None.
+                 return decoded_jwt["email"]
+
+         logger.warning(
+             "Could not access JWT from environment. Could not get short email address.",
+         )
+         return None
+
+
+ class JupyterHubUserInfo:
+     """Information about the current user when running on JupyterHub."""
+
+     @property
+     def short_email(self) -> str | None:
+         """Get the short email address."""
+         return config.get_jupyterhub_user()
+
+
+ def get_user_info_for_current_platform() -> UserInfo:
+     """Return the correct implementation of UserInfo for the current platform."""
+     if config.get_dapla_region() == DaplaRegion.DAPLA_LAB:
+         return DaplaLabUserInfo()
+     elif config.get_dapla_service() == DaplaService.JUPYTERLAB:  # noqa: RET505
+         return JupyterHubUserInfo()
+     else:
+         logger.warning(
+             "It was not possible to retrieve user information! Some fields may not be set.",
+         )
+         return UnknownUserInfo()
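A short sketch of how the factory and the Protocol fit together. The region and service values are read from the environment by config, and format_owner is a hypothetical helper introduced only for illustration.

user_info = get_user_info_for_current_platform()
print(user_info.short_email)  # None when the platform cannot be determined


# Any object with a short_email property satisfies the UserInfo Protocol,
# which keeps call sites easy to exercise in tests.
def format_owner(info: UserInfo) -> str:
    return info.short_email or PLACEHOLDER_EMAIL_ADDRESS


assert format_owner(TestUserInfo()) == PLACEHOLDER_EMAIL_ADDRESS
assert format_owner(UnknownUserInfo()) == PLACEHOLDER_EMAIL_ADDRESS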
@@ -0,0 +1 @@
+ """Utility files for Datadoc."""
@@ -0,0 +1,92 @@
+ """Repository for constant values in Datadoc backend."""
+
+ from datadoc_model.model import LanguageStringType
+ from datadoc_model.model import LanguageStringTypeItem
+
+ VALIDATION_ERROR = "Validation error: "
+
+ DATE_VALIDATION_MESSAGE = f"{VALIDATION_ERROR}contains_data_from must be the same or earlier date than contains_data_until"
+
+ OBLIGATORY_METADATA_WARNING = "Obligatory metadata is missing: "
+
+ INCONSISTENCIES_MESSAGE = "Inconsistencies found between extracted and existing metadata. Inconsistencies are:"
+
+ OBLIGATORY_DATASET_METADATA_IDENTIFIERS: list = [
+     "assessment",
+     "dataset_state",
+     "dataset_status",
+     "name",
+     "description",
+     "data_source",
+     "population_description",
+     "version",
+     "version_description",
+     "unit_type",
+     "temporality_type",
+     "subject_field",
+     "spatial_coverage_description",
+     "owner",
+     "contains_data_from",
+     "contains_data_until",
+     "contains_personal_data",
+ ]
+
+ OBLIGATORY_DATASET_METADATA_IDENTIFIERS_MULTILANGUAGE = [
+     "name",
+     "description",
+     "population_description",
+     "version_description",
+     "spatial_coverage_description",
+ ]
+
+ OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS = [
+     "name",
+     "data_type",
+     "variable_role",
+     "is_personal_data",
+ ]
+
+ OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS_MULTILANGUAGE = [
+     "name",
+ ]
+
+ DEFAULT_SPATIAL_COVERAGE_DESCRIPTION = LanguageStringType(
+     [
+         LanguageStringTypeItem(
+             languageCode="nb",
+             languageText="Norge",
+         ),
+         LanguageStringTypeItem(
+             languageCode="nn",
+             languageText="Noreg",
+         ),
+         LanguageStringTypeItem(
+             languageCode="en",
+             languageText="Norway",
+         ),
+     ],
+ )
+
+ NUM_OBLIGATORY_DATASET_FIELDS = len(OBLIGATORY_DATASET_METADATA_IDENTIFIERS)
+
+ NUM_OBLIGATORY_VARIABLES_FIELDS = len(OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS)
+
+ DATASET_FIELDS_FROM_EXISTING_METADATA = [
+     "dataset_status",
+     "name",
+     "description",
+     "data_source",
+     "population_description",
+     "unit_type",
+     "temporality_type",
+     "subject_field",
+     "keyword",
+     "spatial_coverage_description",
+     "contains_personal_data",
+     "use_restriction",
+     "use_restriction_date",
+     "custom_type",
+     "owner",
+ ]
+
+ METADATA_DOCUMENT_FILE_SUFFIX = "__DOC.json"
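The NUM_OBLIGATORY_* counts are what the validators compare against. The sketch below is an illustrative re-implementation of the completeness check they imply; the real counting lives in dataset.utility.utils, so this helper is hypothetical.

def fields_completed(dataset: object) -> int:
    # Count the obligatory fields that have been given a value.
    return sum(
        1
        for identifier in OBLIGATORY_DATASET_METADATA_IDENTIFIERS
        if getattr(dataset, identifier, None) is not None
    )


# A dataset is complete when every obligatory field is set:
# fields_completed(dataset) == NUM_OBLIGATORY_DATASET_FIELDS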
@@ -0,0 +1,35 @@
+ """Enumerations used in Datadoc."""
+
+ from __future__ import annotations
+
+ from enum import Enum
+
+
+ class DaplaRegion(str, Enum):
+     """Dapla platforms/regions."""
+
+     DAPLA_LAB = "DAPLA_LAB"
+     BIP = "BIP"
+     ON_PREM = "ON_PREM"
+     CLOUD_RUN = "CLOUD_RUN"
+
+
+ class DaplaService(str, Enum):
+     """Dapla services."""
+
+     DATADOC = "DATADOC"
+     JUPYTERLAB = "JUPYTERLAB"
+     VS_CODE = "VS_CODE"
+     R_STUDIO = "R_STUDIO"
+     KILDOMATEN = "KILDOMATEN"
+
+
+ class SupportedLanguages(str, Enum):
+     """The list of languages metadata may be recorded in.
+
+     Reference: https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
+     """
+
+     NORSK_BOKMÅL = "nb"
+     NORSK_NYNORSK = "nn"
+     ENGLISH = "en"
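Note that the statistical-structure document only distinguishes Norwegian from English, so Subject.get_title (above) collapses both Norwegian variants to "no". A small sketch of that mapping, with a hypothetical helper name:

def to_structure_language_code(language: SupportedLanguages) -> str:
    # Both Bokmål and Nynorsk map to "no" in the statistical structure document.
    if language in (SupportedLanguages.NORSK_BOKMÅL, SupportedLanguages.NORSK_NYNORSK):
        return "no"
    return "en"


assert to_structure_language_code(SupportedLanguages.NORSK_NYNORSK) == "no"
assert to_structure_language_code(SupportedLanguages.ENGLISH) == "en"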