dapla-toolbelt-metadata 0.2.1__py3-none-any.whl → 0.9.11__py3-none-any.whl

This diff shows the changes between two publicly available versions of the package as published to their respective registries, and is provided for informational purposes only.

Files changed (97)
  1. dapla_metadata/__init__.py +11 -1
  2. dapla_metadata/_shared/__init__.py +1 -0
  3. dapla_metadata/_shared/config.py +109 -0
  4. dapla_metadata/_shared/enums.py +27 -0
  5. dapla_metadata/_shared/py.typed +0 -0
  6. dapla_metadata/dapla/__init__.py +4 -0
  7. dapla_metadata/dapla/user_info.py +138 -0
  8. dapla_metadata/datasets/__init__.py +1 -1
  9. dapla_metadata/datasets/_merge.py +333 -0
  10. dapla_metadata/datasets/code_list.py +5 -6
  11. dapla_metadata/datasets/compatibility/__init__.py +10 -0
  12. dapla_metadata/datasets/compatibility/_handlers.py +363 -0
  13. dapla_metadata/datasets/compatibility/_utils.py +259 -0
  14. dapla_metadata/datasets/compatibility/model_backwards_compatibility.py +135 -0
  15. dapla_metadata/datasets/core.py +136 -182
  16. dapla_metadata/datasets/dapla_dataset_path_info.py +145 -19
  17. dapla_metadata/datasets/dataset_parser.py +41 -28
  18. dapla_metadata/datasets/model_validation.py +29 -20
  19. dapla_metadata/datasets/statistic_subject_mapping.py +5 -1
  20. dapla_metadata/datasets/utility/constants.py +22 -15
  21. dapla_metadata/datasets/utility/enums.py +8 -20
  22. dapla_metadata/datasets/utility/urn.py +234 -0
  23. dapla_metadata/datasets/utility/utils.py +183 -111
  24. dapla_metadata/standards/__init__.py +4 -0
  25. dapla_metadata/standards/name_validator.py +250 -0
  26. dapla_metadata/standards/standard_validators.py +98 -0
  27. dapla_metadata/standards/utils/__init__.py +1 -0
  28. dapla_metadata/standards/utils/constants.py +49 -0
  29. dapla_metadata/variable_definitions/__init__.py +11 -0
  30. dapla_metadata/variable_definitions/_generated/.openapi-generator/FILES +20 -0
  31. dapla_metadata/variable_definitions/_generated/.openapi-generator/VERSION +1 -0
  32. dapla_metadata/variable_definitions/_generated/.openapi-generator-ignore +6 -0
  33. dapla_metadata/variable_definitions/_generated/README.md +148 -0
  34. dapla_metadata/variable_definitions/_generated/__init__.py +0 -0
  35. dapla_metadata/variable_definitions/_generated/vardef_client/__init__.py +47 -0
  36. dapla_metadata/variable_definitions/_generated/vardef_client/api/__init__.py +8 -0
  37. dapla_metadata/variable_definitions/_generated/vardef_client/api/data_migration_api.py +766 -0
  38. dapla_metadata/variable_definitions/_generated/vardef_client/api/draft_variable_definitions_api.py +888 -0
  39. dapla_metadata/variable_definitions/_generated/vardef_client/api/patches_api.py +888 -0
  40. dapla_metadata/variable_definitions/_generated/vardef_client/api/validity_periods_api.py +583 -0
  41. dapla_metadata/variable_definitions/_generated/vardef_client/api/variable_definitions_api.py +613 -0
  42. dapla_metadata/variable_definitions/_generated/vardef_client/api_client.py +779 -0
  43. dapla_metadata/variable_definitions/_generated/vardef_client/api_response.py +27 -0
  44. dapla_metadata/variable_definitions/_generated/vardef_client/configuration.py +474 -0
  45. dapla_metadata/variable_definitions/_generated/vardef_client/docs/CompleteResponse.md +51 -0
  46. dapla_metadata/variable_definitions/_generated/vardef_client/docs/Contact.md +30 -0
  47. dapla_metadata/variable_definitions/_generated/vardef_client/docs/DataMigrationApi.md +90 -0
  48. dapla_metadata/variable_definitions/_generated/vardef_client/docs/Draft.md +42 -0
  49. dapla_metadata/variable_definitions/_generated/vardef_client/docs/DraftVariableDefinitionsApi.md +259 -0
  50. dapla_metadata/variable_definitions/_generated/vardef_client/docs/LanguageStringType.md +31 -0
  51. dapla_metadata/variable_definitions/_generated/vardef_client/docs/Owner.md +31 -0
  52. dapla_metadata/variable_definitions/_generated/vardef_client/docs/Patch.md +43 -0
  53. dapla_metadata/variable_definitions/_generated/vardef_client/docs/PatchesApi.md +249 -0
  54. dapla_metadata/variable_definitions/_generated/vardef_client/docs/PublicApi.md +218 -0
  55. dapla_metadata/variable_definitions/_generated/vardef_client/docs/SupportedLanguages.md +15 -0
  56. dapla_metadata/variable_definitions/_generated/vardef_client/docs/UpdateDraft.md +44 -0
  57. dapla_metadata/variable_definitions/_generated/vardef_client/docs/ValidityPeriod.md +42 -0
  58. dapla_metadata/variable_definitions/_generated/vardef_client/docs/ValidityPeriodsApi.md +236 -0
  59. dapla_metadata/variable_definitions/_generated/vardef_client/docs/VariableDefinitionsApi.md +304 -0
  60. dapla_metadata/variable_definitions/_generated/vardef_client/docs/VariableStatus.md +17 -0
  61. dapla_metadata/variable_definitions/_generated/vardef_client/exceptions.py +193 -0
  62. dapla_metadata/variable_definitions/_generated/vardef_client/models/__init__.py +31 -0
  63. dapla_metadata/variable_definitions/_generated/vardef_client/models/complete_response.py +260 -0
  64. dapla_metadata/variable_definitions/_generated/vardef_client/models/contact.py +94 -0
  65. dapla_metadata/variable_definitions/_generated/vardef_client/models/draft.py +228 -0
  66. dapla_metadata/variable_definitions/_generated/vardef_client/models/get_vardok_vardef_mapping_by_id200_response.py +158 -0
  67. dapla_metadata/variable_definitions/_generated/vardef_client/models/language_string_type.py +101 -0
  68. dapla_metadata/variable_definitions/_generated/vardef_client/models/owner.py +87 -0
  69. dapla_metadata/variable_definitions/_generated/vardef_client/models/patch.py +244 -0
  70. dapla_metadata/variable_definitions/_generated/vardef_client/models/problem.py +118 -0
  71. dapla_metadata/variable_definitions/_generated/vardef_client/models/update_draft.py +274 -0
  72. dapla_metadata/variable_definitions/_generated/vardef_client/models/validity_period.py +225 -0
  73. dapla_metadata/variable_definitions/_generated/vardef_client/models/vardok_id_response.py +81 -0
  74. dapla_metadata/variable_definitions/_generated/vardef_client/models/vardok_vardef_id_pair_response.py +84 -0
  75. dapla_metadata/variable_definitions/_generated/vardef_client/models/variable_status.py +33 -0
  76. dapla_metadata/variable_definitions/_generated/vardef_client/py.typed +0 -0
  77. dapla_metadata/variable_definitions/_generated/vardef_client/rest.py +249 -0
  78. dapla_metadata/variable_definitions/_utils/__init__.py +1 -0
  79. dapla_metadata/variable_definitions/_utils/_client.py +32 -0
  80. dapla_metadata/variable_definitions/_utils/config.py +54 -0
  81. dapla_metadata/variable_definitions/_utils/constants.py +80 -0
  82. dapla_metadata/variable_definitions/_utils/files.py +309 -0
  83. dapla_metadata/variable_definitions/_utils/template_files.py +99 -0
  84. dapla_metadata/variable_definitions/_utils/variable_definition_files.py +143 -0
  85. dapla_metadata/variable_definitions/exceptions.py +255 -0
  86. dapla_metadata/variable_definitions/vardef.py +372 -0
  87. dapla_metadata/variable_definitions/vardok_id.py +48 -0
  88. dapla_metadata/variable_definitions/vardok_vardef_id_pair.py +47 -0
  89. dapla_metadata/variable_definitions/variable_definition.py +422 -0
  90. {dapla_toolbelt_metadata-0.2.1.dist-info → dapla_toolbelt_metadata-0.9.11.dist-info}/METADATA +34 -36
  91. dapla_toolbelt_metadata-0.9.11.dist-info/RECORD +97 -0
  92. {dapla_toolbelt_metadata-0.2.1.dist-info → dapla_toolbelt_metadata-0.9.11.dist-info}/WHEEL +1 -1
  93. dapla_metadata/datasets/config.py +0 -80
  94. dapla_metadata/datasets/model_backwards_compatibility.py +0 -520
  95. dapla_metadata/datasets/user_info.py +0 -88
  96. dapla_toolbelt_metadata-0.2.1.dist-info/RECORD +0 -22
  97. {dapla_toolbelt_metadata-0.2.1.dist-info → dapla_toolbelt_metadata-0.9.11.dist-info/licenses}/LICENSE +0 -0
dapla_metadata/__init__.py
@@ -1,5 +1,15 @@
  """Tools and clients for working with the Dapla Metadata system."""

- import datadoc_model.model as datadoc_model
+ import warnings

+ warnings.filterwarnings(
+     "ignore",
+     message="As the c extension couldn't be imported, `google-crc32c` is using a pure python implementation that is significantly slower.",
+ )
+
+ import datadoc_model.all_optional.model as datadoc_model
+
+ from . import dapla
  from . import datasets
+ from . import standards
+ from . import variable_definitions
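Aside on the ordering above: `warnings.filterwarnings` only affects warnings emitted after the filter is registered, which is presumably why the filter is installed before the `datadoc_model` import that pulls in `google-crc32c`. A minimal, self-contained sketch of the pattern (the warning texts below are made up for illustration):

```python
import warnings

# `message` is a regular expression matched against the start of the warning
# text; the filter must be registered before the code that emits the warning.
warnings.filterwarnings("ignore", message="pure python implementation")

warnings.warn("pure python implementation in use", UserWarning)  # suppressed
warnings.warn("an unrelated warning", UserWarning)  # still shown
```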
dapla_metadata/_shared/__init__.py
@@ -0,0 +1 @@
+ """Utility code intended to be used by multiple packages within this project, but not by end users."""
dapla_metadata/_shared/config.py
@@ -0,0 +1,109 @@
+ """Configuration management for dataset package."""
+
+ from __future__ import annotations
+
+ import logging
+ import os
+ from pathlib import Path
+ from pprint import pformat
+
+ from dotenv import dotenv_values
+ from dotenv import load_dotenv
+
+ from dapla_metadata._shared.enums import DaplaEnvironment
+ from dapla_metadata._shared.enums import DaplaRegion
+ from dapla_metadata._shared.enums import DaplaService
+
+ logger = logging.getLogger(__name__)
+
+ DOT_ENV_FILE_PATH = Path(__file__).parent.joinpath(".env")
+
+ JUPYTERHUB_USER = "JUPYTERHUB_USER"
+ DAPLA_REGION = "DAPLA_REGION"
+ DAPLA_ENVIRONMENT = "DAPLA_ENVIRONMENT"
+ DAPLA_SERVICE = "DAPLA_SERVICE"
+ DAPLA_GROUP_CONTEXT = "DAPLA_GROUP_CONTEXT"
+ OIDC_TOKEN = "OIDC_TOKEN"  # noqa: S105
+
+
+ DATADOC_STATISTICAL_SUBJECT_SOURCE_URL_DEFAULT = (
+     "https://www.ssb.no/xp/_/service/mimir/subjectStructurStatistics"
+ )
+
+
+ env_loaded = False
+
+
+ def _load_dotenv_file() -> None:
+     global env_loaded  # noqa: PLW0603
+     if not env_loaded and DOT_ENV_FILE_PATH.exists():
+         load_dotenv(DOT_ENV_FILE_PATH)
+         env_loaded = True
+         logger.info(
+             "Loaded .env file with config keys: \n%s",
+             pformat(list(dotenv_values(DOT_ENV_FILE_PATH).keys())),
+         )
+
+
+ def get_config_item(item: str, *, raising: bool = False) -> str | None:
+     """Get a config item. Makes sure all access is logged.
+
+     Args:
+         item: The name of the environment variable to obtain.
+         raising: `True` if an exception should be raised when the item isn't present.
+
+     Returns:
+         The set value or `None`
+
+     Raises:
+         OSError: Only if `raising` is True and the item is not found.
+     """
+     _load_dotenv_file()
+     value = os.environ.get(item)
+     if raising and not value:
+         msg = f"Environment variable {item} not defined."
+         raise OSError(msg)
+     logger.debug("Config accessed. %s", item)
+     return value
+
+
+ def get_statistical_subject_source_url() -> str | None:
+     """Get the URL to the statistical subject source."""
+     return (
+         get_config_item("DATADOC_STATISTICAL_SUBJECT_SOURCE_URL")
+         or DATADOC_STATISTICAL_SUBJECT_SOURCE_URL_DEFAULT
+     )
+
+
+ def get_dapla_region() -> DaplaRegion | None:
+     """Get the Dapla region we're running on."""
+     if region := get_config_item(DAPLA_REGION):
+         return DaplaRegion(region)
+
+     return None
+
+
+ def get_dapla_environment() -> DaplaEnvironment | None:
+     """Get the Dapla environment we're running on."""
+     if env := get_config_item(DAPLA_ENVIRONMENT):
+         return DaplaEnvironment(env)
+
+     return None
+
+
+ def get_dapla_service() -> DaplaService | None:
+     """Get the Dapla service we're running on."""
+     if service := get_config_item(DAPLA_SERVICE):
+         return DaplaService(service)
+
+     return None
+
+
+ def get_oidc_token(*, raising: bool = False) -> str | None:
+     """Get the JWT token from the environment."""
+     return get_config_item(OIDC_TOKEN, raising=raising)
+
+
+ def get_dapla_group_context(*, raising: bool = False) -> str | None:
+     """Get the group which the user has chosen to represent."""
+     return get_config_item(DAPLA_GROUP_CONTEXT, raising=raising)
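All configuration flows through `get_config_item`, which lazily loads the bundled `.env` file on first access and logs every lookup; the typed getters wrap it. A usage sketch (the environment values are illustrative, not real deployment values):

```python
import os

from dapla_metadata._shared import config

# Illustrative values; on the platform these are injected into the environment.
os.environ["DAPLA_REGION"] = "DAPLA_LAB"
os.environ["DAPLA_ENVIRONMENT"] = "TEST"

print(config.get_dapla_region())       # DaplaRegion.DAPLA_LAB
print(config.get_dapla_environment())  # DaplaEnvironment.TEST
print(config.get_dapla_service())      # None: DAPLA_SERVICE is not set

# With raising=True, a missing variable raises OSError instead of returning None.
try:
    config.get_oidc_token(raising=True)
except OSError as err:
    print(err)  # Environment variable OIDC_TOKEN not defined.
```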
dapla_metadata/_shared/enums.py
@@ -0,0 +1,27 @@
+ from enum import Enum
+
+
+ class DaplaRegion(str, Enum):
+     """Dapla platforms/regions."""
+
+     DAPLA_LAB = "DAPLA_LAB"
+     ON_PREM = "ON_PREM"
+     CLOUD_RUN = "CLOUD_RUN"
+
+
+ class DaplaEnvironment(str, Enum):
+     """Dapla lifecycle environment."""
+
+     PROD = "PROD"
+     TEST = "TEST"
+     DEV = "DEV"
+
+
+ class DaplaService(str, Enum):
+     """Dapla services."""
+
+     DATADOC = "DATADOC"
+     JUPYTERLAB = "JUPYTERLAB"
+     VS_CODE = "VS_CODE"
+     R_STUDIO = "R_STUDIO"
+     KILDOMATEN = "KILDOMATEN"
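Mixing in `str` lets these enums be constructed directly from raw environment-variable values (as the `get_dapla_*` getters in `config.py` do) while the members still compare equal to plain strings. A small illustration:

```python
from dapla_metadata._shared.enums import DaplaRegion

region = DaplaRegion("DAPLA_LAB")  # construct from the raw env var value
assert region is DaplaRegion.DAPLA_LAB
assert region == "DAPLA_LAB"  # str mixin: equal to the plain string

# DaplaRegion("LOCALHOST") would raise ValueError: unknown values are
# rejected, so a mistyped DAPLA_REGION fails fast rather than silently.
```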
dapla_metadata/_shared/py.typed
File without changes
dapla_metadata/dapla/__init__.py
@@ -0,0 +1,4 @@
+ """Expose information specific to the Dapla platform."""
+
+ from .user_info import DaplaLabUserInfo
+ from .user_info import UserInfo
dapla_metadata/dapla/user_info.py
@@ -0,0 +1,138 @@
+ from __future__ import annotations
+
+ import contextlib
+ import logging
+ from typing import Protocol
+
+ import jwt
+
+ from dapla_metadata._shared import config
+ from dapla_metadata._shared.enums import DaplaRegion
+
+ logger = logging.getLogger(__name__)
+
+
+ class UserInfo(Protocol):
+     """Information about the current user.
+
+     Implementations may be provided for different platforms or testing.
+     """
+
+     @property
+     def short_email(self) -> str | None:
+         """Get the short email address."""
+         ...
+
+     @property
+     def current_group(self) -> str:
+         """Get the group which the user is currently representing."""
+         ...
+
+     @property
+     def current_team(self) -> str:
+         """Get the team which the user is currently representing."""
+         ...
+
+
+ class UnknownUserInfo:
+     """Fallback when no implementation is found."""
+
+     @property
+     def short_email(self) -> str | None:
+         """Unknown email address."""
+         return None
+
+     @property
+     def current_group(self) -> str:
+         """Get the group which the user is currently representing."""
+         return ""
+
+     @property
+     def current_team(self) -> str:
+         """Get the team which the user is currently representing."""
+         return ""
+
+
+ class TestUserInfo:
+     """Information about the current user for local development and testing."""
+
+     PLACEHOLDER_EMAIL_ADDRESS = "default_user@ssb.no"
+     PLACEHOLDER_GROUP = "default-team-developers"
+     PLACEHOLDER_TEAM = "default-team"
+
+     @property
+     def short_email(self) -> str | None:
+         """Get the short email address."""
+         return TestUserInfo.PLACEHOLDER_EMAIL_ADDRESS
+
+     @property
+     def current_group(self) -> str | None:
+         """Get the group which the user is currently representing."""
+         return TestUserInfo.PLACEHOLDER_GROUP
+
+     @property
+     def current_team(self) -> str | None:
+         """Get the team which the user is currently representing."""
+         return TestUserInfo.PLACEHOLDER_TEAM
+
+
+ class DaplaLabUserInfo:
+     """Information about the current user when running on Dapla Lab."""
+
+     @property
+     def short_email(self) -> str | None:
+         """Get the short email address."""
+         encoded_jwt = config.get_oidc_token()
+         if encoded_jwt:
+             # The JWT has been verified by the platform prior to injection, no need to verify.
+             decoded_jwt = jwt.decode(encoded_jwt, options={"verify_signature": False})
+             with contextlib.suppress(KeyError):
+                 # If email can't be found in the JWT, fall through and return None
+                 return decoded_jwt["email"]
+
+         logger.warning(
+             "Could not access JWT from environment. Could not get short email address.",
+         )
+         return None
+
+     @property
+     def current_group(self) -> str:
+         """Get the group which the user is currently representing."""
+         if group := config.get_dapla_group_context():
+             return group
+         msg = "DAPLA_GROUP_CONTEXT environment variable not found"
+         raise OSError(msg)
+
+     @property
+     def current_team(self) -> str:
+         """Get the team which the user is currently representing."""
+         return parse_team_name(self.current_group)
+
+
+ def get_user_info_for_current_platform() -> UserInfo:
+     """Return the correct implementation of UserInfo for the current platform."""
+     if config.get_dapla_region() == DaplaRegion.DAPLA_LAB:
+         return DaplaLabUserInfo()
+     logger.warning(
+         "Was not possible to retrieve user information! Some fields may not be set.",
+     )
+     return UnknownUserInfo()
+
+
+ def parse_team_name(group: str) -> str:
+     """Parses the group to get the current team.
+
+     >>> parse_team_name("dapla-metadata-developers")
+     'dapla-metadata'
+
+     >>> parse_team_name("dapla-metadata-data-admins")
+     'dapla-metadata'
+
+     >>> parse_team_name("dapla-metadata")
+     'dapla'
+
+     >>> parse_team_name("dapla-metadata-not-real-name")
+     'dapla-metadata-not-real'
+     """
+     parts = group.split("-")
+     return "-".join(parts[:-2] if group.endswith("data-admins") else parts[:-1])
dapla_metadata/datasets/__init__.py
@@ -1,6 +1,6 @@
  """Document dataset."""

- from datadoc_model import model
+ from datadoc_model.all_optional import model

  from .core import Datadoc
  from .dapla_dataset_path_info import DaplaDatasetPathInfo
dapla_metadata/datasets/_merge.py
@@ -0,0 +1,333 @@
+ """Code relating to merging metadata from an existing metadata document and metadata extracted from a new dataset.
+
+ This is primarily convenience functionality for users whereby they can programmatically generate metadata without
+ having to manually enter it. This is primarily useful when data is sharded by time (i.e. each dataset applies for
+ a particular period like a month or a year). Assuming there aren't structural changes, the metadata may be reused
+ for all periods.
+
+ It is important to be able to detect changes in the structure of the data and warn users about this so that they can
+ make changes as appropriate.
+ """
+
+ import copy
+ import logging
+ import warnings
+ from collections.abc import Iterable
+ from dataclasses import dataclass
+ from dataclasses import field
+ from pathlib import Path
+ from typing import cast
+
+ import datadoc_model
+ import datadoc_model.all_optional.model as all_optional_model
+ import datadoc_model.required.model as required_model
+ from cloudpathlib import CloudPath
+
+ from dapla_metadata.datasets.dapla_dataset_path_info import DaplaDatasetPathInfo
+ from dapla_metadata.datasets.utility.constants import (
+     DATASET_FIELDS_FROM_EXISTING_METADATA,
+ )
+ from dapla_metadata.datasets.utility.constants import INCONSISTENCIES_MESSAGE
+ from dapla_metadata.datasets.utility.utils import OptionalDatadocMetadataType
+ from dapla_metadata.datasets.utility.utils import VariableListType
+
+ logger = logging.getLogger(__name__)
+
+ BUCKET_NAME_MESSAGE = "Bucket name"
+ DATA_PRODUCT_NAME_MESSAGE = "Data product name"
+ DATASET_STATE_MESSAGE = "Dataset state"
+ DATASET_SHORT_NAME_MESSAGE = "Dataset short name"
+ VARIABLES_ADDITIONAL_MESSAGE = (
+     "Dataset has additional variables than defined in metadata"
+ )
+ VARIABLE_RENAME_MESSAGE = "Variables have been renamed in the dataset"
+ VARIABLE_ORDER_MESSAGE = "The order of variables in the dataset has changed"
+ VARIABLE_DATATYPES_MESSAGE = "Variable datatypes differ"
+ VARIABLES_FEWER_MESSAGE = "Dataset has fewer variables than defined in metadata"
+
+
+ class InconsistentDatasetsWarning(UserWarning):
+     """Existing and new datasets differ significantly from one another."""
+
+
+ class InconsistentDatasetsError(ValueError):
+     """Existing and new datasets differ significantly from one another."""
+
+
+ @dataclass
+ class DatasetConsistencyStatus:
+     """Store the status for different aspects of dataset consistency.
+
+     Attributes:
+         message: Communicates to the user what aspect is inconsistent.
+         success: False if inconsistency is detected.
+         variables: Optionally communicate which variables are affected.
+     """
+
+     message: str
+     success: bool
+     variables: Iterable[str] = field(default_factory=list)
+
+     def __str__(self) -> str:
+         """Format the user message."""
+         message = self.message
+         if self.variables:
+             message += f"\n\tVariables: {self.variables}"
+         return message
+
+
+ def check_dataset_consistency(
+     new_dataset_path: Path | CloudPath,
+     existing_dataset_path: Path | CloudPath,
+ ) -> list[DatasetConsistencyStatus]:
+     """Run consistency tests.
+
+     Args:
+         new_dataset_path: Path to the dataset to be documented.
+         existing_dataset_path: Path stored in the existing metadata.
+
+     Returns:
+         List of consistency check results.
+     """
+     new_dataset_path_info = DaplaDatasetPathInfo(new_dataset_path)
+     existing_dataset_path_info = DaplaDatasetPathInfo(existing_dataset_path)
+     return [
+         DatasetConsistencyStatus(
+             message=BUCKET_NAME_MESSAGE,
+             success=(
+                 new_dataset_path_info.bucket_name
+                 == existing_dataset_path_info.bucket_name
+             ),
+         ),
+         DatasetConsistencyStatus(
+             message=DATA_PRODUCT_NAME_MESSAGE,
+             success=(
+                 new_dataset_path_info.statistic_short_name
+                 == existing_dataset_path_info.statistic_short_name
+             ),
+         ),
+         DatasetConsistencyStatus(
+             message=DATASET_STATE_MESSAGE,
+             success=(
+                 new_dataset_path_info.dataset_state
+                 == existing_dataset_path_info.dataset_state
+             ),
+         ),
+         DatasetConsistencyStatus(
+             message=DATASET_SHORT_NAME_MESSAGE,
+             success=(
+                 new_dataset_path_info.dataset_short_name
+                 == existing_dataset_path_info.dataset_short_name
+             ),
+         ),
+     ]
+
+
+ def check_variables_consistency(
+     extracted_variables: VariableListType,
+     existing_variables: VariableListType,
+ ) -> list[DatasetConsistencyStatus]:
+     """Check for consistency in variables structure.
+
+     Compares the existing metadata and that extracted from the new dataset and provides
+     highly detailed feedback on what is different between them.
+
+     We don't return all the results because that could create conflicting messages and false positives.
+
+     Args:
+         extracted_variables (VariableListType): Variables extracted from the new dataset.
+         existing_variables (VariableListType): Variables already documented in existing metadata
+
+     Returns:
+         list[DatasetConsistencyStatus]: The list of checks and whether they were successful.
+     """
+     extracted_names_set = {v.short_name or "" for v in extracted_variables}
+     existing_names_set = {v.short_name or "" for v in existing_variables}
+     same_length = len(extracted_variables) == len(existing_variables)
+     more_extracted_variables = extracted_names_set.difference(existing_names_set)
+     fewer_extracted_variables = existing_names_set.difference(extracted_names_set)
+     results = []
+     if same_length:
+         if more_extracted_variables:
+             results.append(
+                 DatasetConsistencyStatus(
+                     message=VARIABLE_RENAME_MESSAGE,
+                     variables=more_extracted_variables,
+                     success=not bool(more_extracted_variables),
+                 )
+             )
+         else:
+             results.append(
+                 DatasetConsistencyStatus(
+                     message=VARIABLE_ORDER_MESSAGE,
+                     success=[v.short_name or "" for v in extracted_variables]
+                     == [v.short_name or "" for v in existing_variables],
+                 )
+             )
+             results.append(
+                 DatasetConsistencyStatus(
+                     message=VARIABLE_DATATYPES_MESSAGE,
+                     success=[v.data_type for v in extracted_variables]
+                     == [v.data_type for v in existing_variables],
+                 )
+             )
+     else:
+         results.extend(
+             [
+                 DatasetConsistencyStatus(
+                     message=VARIABLES_ADDITIONAL_MESSAGE,
+                     variables=more_extracted_variables,
+                     success=not bool(more_extracted_variables),
+                 ),
+                 DatasetConsistencyStatus(
+                     message=VARIABLES_FEWER_MESSAGE,
+                     variables=fewer_extracted_variables,
+                     success=not bool(fewer_extracted_variables),
+                 ),
+             ]
+         )
+     return results
+
+
+ def check_ready_to_merge(
+     results: list[DatasetConsistencyStatus], *, errors_as_warnings: bool
+ ) -> None:
+     """Check if the datasets are consistent enough to make a successful merge of metadata.
+
+     Args:
+         results: List if dict with property name and boolean success flag
+         errors_as_warnings: True if failing checks should be raised as warnings, not errors.
+
+     Raises:
+         InconsistentDatasetsError: If inconsistencies are found and `errors_as_warnings == False`
+     """
+     if failures := [result for result in results if not result.success]:
+         messages_list = "\n - ".join(str(f) for f in failures)
+         msg = f"{INCONSISTENCIES_MESSAGE}\n - {messages_list}"
+         if errors_as_warnings:
+             warnings.warn(
+                 message=msg,
+                 category=InconsistentDatasetsWarning,
+                 stacklevel=2,
+             )
+         else:
+             raise InconsistentDatasetsError(
+                 msg,
+             )
+
+
+ def override_dataset_fields(
+     merged_metadata: all_optional_model.DatadocMetadata,
+     existing_metadata: all_optional_model.DatadocMetadata
+     | required_model.DatadocMetadata,
+ ) -> None:
+     """Overrides specific fields in the dataset of `merged_metadata` with values from the dataset of `existing_metadata`.
+
+     This function iterates over a predefined list of fields, `DATASET_FIELDS_FROM_EXISTING_METADATA`,
+     and sets the corresponding fields in the `merged_metadata.dataset` object to the values
+     from the `existing_metadata.dataset` object.
+
+     Args:
+         merged_metadata: An instance of `DatadocMetadata` containing the dataset to be updated.
+         existing_metadata: An instance of `DatadocMetadata` containing the dataset whose values are used to update `merged_metadata.dataset`.
+
+     Returns:
+         `None`.
+     """
+     if merged_metadata.dataset and existing_metadata.dataset:
+         # Override the fields as defined
+         for field in DATASET_FIELDS_FROM_EXISTING_METADATA:
+             setattr(
+                 merged_metadata.dataset,
+                 field,
+                 getattr(existing_metadata.dataset, field),
+             )
+
+
+ def merge_variables(
+     existing_metadata: OptionalDatadocMetadataType,
+     extracted_metadata: all_optional_model.DatadocMetadata,
+     merged_metadata: all_optional_model.DatadocMetadata,
+ ) -> all_optional_model.DatadocMetadata:
+     """Merges variables from the extracted metadata into the existing metadata and updates the merged metadata.
+
+     This function compares the variables from `extracted_metadata` with those in `existing_metadata`.
+     For each variable in `extracted_metadata`, it checks if a variable with the same `short_name` exists
+     in `existing_metadata`. If a match is found, it updates the existing variable with information from
+     `extracted_metadata`. If no match is found, the variable from `extracted_metadata` is directly added to `merged_metadata`.
+
+     Args:
+         existing_metadata: The metadata object containing the current state of variables.
+         extracted_metadata: The metadata object containing new or updated variables to merge.
+         merged_metadata: The metadata object that will contain the result of the merge.
+
+     Returns:
+         all_optional_model.DatadocMetadata: The `merged_metadata` object containing variables from both `existing_metadata`
+         and `extracted_metadata`.
+     """
+     if (
+         existing_metadata is not None
+         and existing_metadata.variables is not None
+         and extracted_metadata is not None
+         and extracted_metadata.variables is not None
+         and merged_metadata.variables is not None
+     ):
+         for extracted in extracted_metadata.variables:
+             existing = next(
+                 (
+                     existing
+                     for existing in existing_metadata.variables
+                     if existing.short_name == extracted.short_name
+                 ),
+                 None,
+             )
+             if existing:
+                 existing.id = (
+                     None  # Set to None so that it will be set assigned a fresh ID later
+                 )
+                 existing.contains_data_from = (
+                     extracted.contains_data_from or existing.contains_data_from
+                 )
+                 existing.contains_data_until = (
+                     extracted.contains_data_until or existing.contains_data_until
+                 )
+                 merged_metadata.variables.append(
+                     cast("datadoc_model.all_optional.model.Variable", existing)
+                 )
+             else:
+                 # If there is no existing metadata for this variable, we just use what we have extracted
+                 merged_metadata.variables.append(extracted)
+     return merged_metadata
+
+
+ def merge_metadata(
+     extracted_metadata: all_optional_model.DatadocMetadata | None,
+     existing_metadata: OptionalDatadocMetadataType,
+ ) -> all_optional_model.DatadocMetadata:
+     if not existing_metadata:
+         logger.warning(
+             "No existing metadata found, no merge to perform. Continuing with extracted metadata.",
+         )
+         return extracted_metadata or all_optional_model.DatadocMetadata()
+
+     if not extracted_metadata:
+         return cast("all_optional_model.DatadocMetadata", existing_metadata)
+
+     # Use the extracted metadata as a base
+     merged_metadata = all_optional_model.DatadocMetadata(
+         dataset=copy.deepcopy(extracted_metadata.dataset),
+         variables=[],
+     )
+
+     override_dataset_fields(
+         merged_metadata=merged_metadata,
+         existing_metadata=existing_metadata,
+     )
+
+     # Merge variables.
+     # For each extracted variable, copy existing metadata into the merged metadata
+     return merge_variables(
+         existing_metadata=existing_metadata,
+         extracted_metadata=extracted_metadata,
+         merged_metadata=merged_metadata,
+     )
dapla_metadata/datasets/code_list.py
@@ -90,10 +90,6 @@ class CodeList(GetExternalSource):
              execution of data fetching.
          classification_id: The ID of the classification to retrieve.
      """
-     self.supported_languages = [
-         SupportedLanguages.NORSK_BOKMÅL,
-         SupportedLanguages.ENGLISH,
-     ]
      self._classifications: list[CodeListItem] = []
      self.classification_id = classification_id
      self.classifications_dataframes: (
@@ -117,12 +113,15 @@ class CodeList(GetExternalSource):
          and returns None.
      """
      classifications_dataframes: dict[SupportedLanguages, pd.DataFrame] = {}
-     for i in self.supported_languages:
+     for i in [
+         SupportedLanguages.NORSK_BOKMÅL,
+         SupportedLanguages.ENGLISH,
+     ]:
          try:
              classifications_dataframes[i] = (
                  KlassClassification(
                      str(self.classification_id),
-                     i,
+                     i.lower(),  # type: ignore [arg-type]
                  )
                  .get_codes()
                  .data
dapla_metadata/datasets/compatibility/__init__.py
@@ -0,0 +1,10 @@
+ """Model Backwards Compatibility.
+
+ This package contains code for upgrading existing metadata documents to the newest version of the model.
+ This is analogous to a Database Migration where the structure of the data has changed and we wish to
+ retain already persisted information.
+
+ """
+
+ from ._utils import is_metadata_in_container_structure
+ from .model_backwards_compatibility import upgrade_metadata
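The two entry points mirror a database-migration setup: `is_metadata_in_container_structure` distinguishes documents wrapped in the newer container format from older flat ones, and `upgrade_metadata` steps a persisted document up to the current model version. Neither signature is shown in this diff, so the sketch below assumes the common dict-in/dict-out shape for such migration entry points:

```python
import json
from pathlib import Path

from dapla_metadata.datasets.compatibility import (
    is_metadata_in_container_structure,
    upgrade_metadata,
)

# Hypothetical document path; "__DOC.json" follows the Datadoc naming convention.
raw = json.loads(Path("person_data_p2021_v1__DOC.json").read_text())

print(is_metadata_in_container_structure(raw))  # assumed: returns a bool
upgraded = upgrade_metadata(raw)  # assumed: parsed document in, upgraded document out
```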