dapla-toolbelt-metadata 0.1.2__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: the registry flags this release of dapla-toolbelt-metadata as potentially problematic.
- {dapla_toolbelt_metadata-0.1.2 → dapla_toolbelt_metadata-0.2.1}/PKG-INFO +9 -20
- {dapla_toolbelt_metadata-0.1.2 → dapla_toolbelt_metadata-0.2.1}/README.md +7 -5
- {dapla_toolbelt_metadata-0.1.2 → dapla_toolbelt_metadata-0.2.1}/pyproject.toml +36 -47
- dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/__init__.py +5 -0
- {dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/code_list.py +2 -2
- {dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/config.py +9 -5
- {dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/core.py +75 -91
- {dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/dataset_parser.py +1 -1
- {dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/model_validation.py +17 -11
- {dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/statistic_subject_mapping.py +9 -8
- {dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/user_info.py +3 -3
- {dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/utility/constants.py +4 -0
- {dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/utility/utils.py +92 -5
- {dapla_toolbelt_metadata-0.1.2 → dapla_toolbelt_metadata-0.2.1}/LICENSE +0 -0
- {dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/__init__.py +0 -0
- {dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/dapla_dataset_path_info.py +0 -0
- {dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/external_sources/__init__.py +0 -0
- {dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/external_sources/external_sources.py +0 -0
- {dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/model_backwards_compatibility.py +0 -0
- {dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/py.typed +0 -0
- {dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/utility/__init__.py +0 -0
- {dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/utility/enums.py +0 -0
{dapla_toolbelt_metadata-0.1.2 → dapla_toolbelt_metadata-0.2.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dapla-toolbelt-metadata
-Version: 0.1.2
+Version: 0.2.1
 Summary: Dapla Toolbelt Metadata
 Home-page: https://github.com/statisticsnorway/dapla-toolbelt-metadata
 License: MIT
@@ -15,30 +15,17 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: arrow (>=1.3.0)
 Requires-Dist: beautifulsoup4 (>=4.12.3)
-Requires-Dist: black (>=24.8.0,<25.0.0)
-Requires-Dist: bs4 (>=0.0.2,<0.0.3)
-Requires-Dist: click (>=8.0.1)
 Requires-Dist: cloudpathlib[gs] (>=0.17.0)
-Requires-Dist: coverage (>=7.6.1,<8.0.0)
 Requires-Dist: dapla-toolbelt (>=1.3.3)
-Requires-Dist: faker (>=26.1.0,<27.0.0)
-Requires-Dist: furo (>=2024.7.18,<2025.0.0)
-Requires-Dist: gunicorn (>=21.2.0)
 Requires-Dist: pandas (>=1.4.2)
-Requires-Dist: pre-commit (>=3.8.0,<4.0.0)
 Requires-Dist: pyarrow (>=8.0.0)
 Requires-Dist: pydantic (>=2.5.2)
-Requires-Dist: pygments (>=2.18.0,<3.0.0)
 Requires-Dist: pyjwt (>=2.8.0)
-Requires-Dist: pytest (>=8.3.2,<9.0.0)
-Requires-Dist: pytest-mock (>=3.14.0,<4.0.0)
 Requires-Dist: python-dotenv (>=1.0.1)
 Requires-Dist: requests (>=2.31.0)
-Requires-Dist: requests-mock (>=1.12.1,<2.0.0)
-Requires-Dist: ruff (>=0.5.6,<0.6.0)
 Requires-Dist: ssb-datadoc-model (>=6.0.0,<7.0.0)
 Requires-Dist: ssb-klass-python (>=0.0.9)
-Requires-Dist:
+Requires-Dist: typing-extensions (>=4.12.2)
 Project-URL: Changelog, https://github.com/statisticsnorway/dapla-toolbelt-metadata/releases
 Project-URL: Documentation, https://statisticsnorway.github.io/dapla-toolbelt-metadata
 Project-URL: Repository, https://github.com/statisticsnorway/dapla-toolbelt-metadata
@@ -71,14 +58,16 @@ Description-Content-Type: text/markdown
 [black]: https://github.com/psf/black
 [poetry]: https://python-poetry.org/
 
+Tools and clients for working with the Dapla Metadata system.
+
 ## Features
 
--
+- Create and update metadata for datasets (Datadoc).
 
-
+### Coming
 
--
--
+- Read, create and update variable definitions.
+- Publish dataset metadata to Statistics Norway's data catalogue.
 
 ## Installation
 
@@ -90,7 +79,7 @@ pip install dapla-toolbelt-metadata
 
 ## Usage
 
-Please see the [Reference Guide] for
+Instructions and examples may be found in the [Dapla Manual](https://manual.dapla.ssb.no/statistikkere/). Please see the [Reference Guide] for API documentation.
 
 ## Contributing
 
{dapla_toolbelt_metadata-0.1.2 → dapla_toolbelt_metadata-0.2.1}/README.md

@@ -25,14 +25,16 @@
 [black]: https://github.com/psf/black
 [poetry]: https://python-poetry.org/
 
+Tools and clients for working with the Dapla Metadata system.
+
 ## Features
 
--
+- Create and update metadata for datasets (Datadoc).
 
-
+### Coming
 
--
--
+- Read, create and update variable definitions.
+- Publish dataset metadata to Statistics Norway's data catalogue.
 
 ## Installation
 
@@ -44,7 +46,7 @@ pip install dapla-toolbelt-metadata
 
 ## Usage
 
-Please see the [Reference Guide] for
+Instructions and examples may be found in the [Dapla Manual](https://manual.dapla.ssb.no/statistikkere/). Please see the [Reference Guide] for API documentation.
 
 ## Contributing
 
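The README now points users to the Dapla Manual for worked examples and to the Reference Guide for API documentation. For orientation only, a minimal Datadoc session under the 0.2.1 package layout might look like the sketch below; the constructor keyword `dataset_path` and the example file path are assumptions based on attribute names visible in core.py further down, not on documented API.

```python
# Hypothetical sketch only: the constructor signature and the example path are
# assumptions; consult the Reference Guide / Dapla Manual for the real API.
from dapla_metadata.datasets.core import Datadoc  # package layout introduced in 0.2.1

# Point Datadoc at a dataset file. Per core.py below, it extracts metadata from the
# file and, if a metadata document already exists, merges the two sets of metadata.
meta = Datadoc(dataset_path="befolkning_p2021_v1.parquet")

# Extracted dataset- and variable-level metadata are exposed as pydantic models.
print(meta.dataset)
print([variable.short_name for variable in meta.variables])
```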
{dapla_toolbelt_metadata-0.1.2 → dapla_toolbelt_metadata-0.2.1}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dapla-toolbelt-metadata"
-version = "0.1.2"
+version = "0.2.1"
 description = "Dapla Toolbelt Metadata"
 authors = ["Team Metadata <metadata@ssb.no>"]
 license = "MIT"
@@ -10,19 +10,17 @@ repository = "https://github.com/statisticsnorway/dapla-toolbelt-metadata"
 documentation = "https://statisticsnorway.github.io/dapla-toolbelt-metadata"
 classifiers = ["Development Status :: 4 - Beta"]
 
-packages = [{ include = "
+packages = [{ include = "dapla_metadata", from = "src" }]
 
 [tool.poetry.urls]
 Changelog = "https://github.com/statisticsnorway/dapla-toolbelt-metadata/releases"
 
 [tool.poetry.dependencies]
-click = ">=8.0.1"
 python = ">=3.10,<4.0"
 pyarrow = ">=8.0.0"
 pydantic = ">=2.5.2"
 pandas = ">=1.4.2"
 dapla-toolbelt = ">=1.3.3"
-gunicorn = ">=21.2.0"
 arrow = ">=1.3.0"
 python-dotenv = ">=1.0.1"
 requests = ">=2.31.0"
@@ -31,27 +29,17 @@ cloudpathlib = { extras = ["gs"], version = ">=0.17.0" }
 pyjwt = ">=2.8.0"
 ssb-klass-python = ">=0.0.9"
 ssb-datadoc-model = "^6.0.0"
-
-pygments = "^2.18.0"
-black = "^24.8.0"
-coverage = "^7.6.1"
-furo = "^2024.7.18"
-pre-commit = "^3.8.0"
-ruff = "^0.5.6"
-pytest = "^8.3.2"
-pytest-mock = "^3.14.0"
-requests-mock = "^1.12.1"
-bs4 = "^0.0.2"
-types-beautifulsoup4 = "^4.12.0.20240511"
+typing-extensions = ">=4.12.2"
 
 [tool.poetry.group.dev.dependencies]
-
-
+black = ">=24.8.0"
+pygments = ">=2.18.0"
 coverage = { extras = ["toml"], version = ">=6.2" }
-
-
-
-
+faker = ">=26.1.0"
+furo = ">=2024.7.18"
+pre-commit = ">=3.8.0"
+ruff = ">=0.5.6"
+pytest = ">=8.3.2"
 sphinx = ">=6.2.1"
 sphinx-autobuild = ">=2021.3.14"
 sphinx-autodoc-typehints = ">=1.24.0"
@@ -63,31 +51,29 @@ mypy = ">=0.950"
 pytest-cov = ">=3.0.0"
 nbstripout = ">=0.5.0"
 python-kacl = "*"
-pytest-mock = "
+pytest-mock = ">=3.14.0"
 deptry = ">=0.12.0"
 pandas-stubs = "*"
 pyarrow-stubs = ">=10.0.1.9"
+requests-mock = ">=1.12.1"
 types-Pygments = "*"
 types-colorama = "*"
 types-setuptools = "*"
-types-beautifulsoup4 = "
-
-faker = "*"
+types-beautifulsoup4 = ">=4.12.0.20240511"
+ipykernel = "^6.29.5"
 
 [tool.pytest.ini_options]
 pythonpath = ["src"]
 
+
 [tool.coverage.paths]
 source = ["src", "*/site-packages"]
 tests = ["tests", "*/tests"]
 
 [tool.coverage.run]
 branch = true
-source = ["
-omit = [
-    "tests/*",
-    "__init__.py",
-]
+source = ["dapla_metadata"]
+omit = ["tests/*", "__init__.py"]
 relative_files = true
 
 [tool.coverage.report]
@@ -95,9 +81,7 @@ show_missing = true
 fail_under = 80
 
 [tool.mypy]
-plugins = [
-    "pydantic.mypy"
-]
+plugins = ["pydantic.mypy"]
 strict = false
 warn_unreachable = true
 pretty = true
@@ -129,10 +113,10 @@ disable_error_code = [
 ]
 
 [tool.ruff]
-force-exclude = true
+force-exclude = true # Apply excludes to pre-commit
 show-fixes = true
 src = ["src", "tests"]
-target-version = "py310"
+target-version = "py310" # Minimum Python version supported
 include = ["*.py", "*.pyi", "**/pyproject.toml", "*.ipynb"]
 extend-exclude = [
     "__pycache__",
@@ -162,26 +146,31 @@ force-single-line = true
 max-complexity = 15
 
 [tool.ruff.lint.pydocstyle]
-convention = "google"
+convention = "google" # You can also use "numpy".
 
 [tool.ruff.lint.pep8-naming]
-classmethod-decorators = [
+classmethod-decorators = [
+    "classmethod",
+    "validator",
+    "root_validator",
+    "pydantic.validator",
+]
 
 [tool.ruff.lint.per-file-ignores]
 "*/__init__.py" = ["F401"]
 "**/tests/*" = [
-    "ANN001",
-    "ANN002",
-    "ANN003",
-    "ANN201",
-    "ANN204",
-    "ANN205",
-    "ANN206",
-    "D100",
+    "ANN001", # type annotations don't add value for test functions
+    "ANN002", # type annotations don't add value for test functions
+    "ANN003", # type annotations don't add value for test functions
+    "ANN201", # type annotations don't add value for test functions
+    "ANN204", # type annotations don't add value for test functions
+    "ANN205", # type annotations don't add value for test functions
+    "ANN206", # type annotations don't add value for test functions
+    "D100", # docstrings are overkill for test functions
    "D101",
    "D102",
    "D103",
-    "S101",
+    "S101", # asserts are encouraged in pytest
 ]
 
 [build-system]
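The `packages` change above is the heart of the release: the code moves from a top-level `dataset` package to a `dapla_metadata.datasets` subpackage, and only `dapla_metadata` is shipped from `src`. Code that imported the old module path needs to switch to the new one, roughly as sketched below (the old spelling is inferred from the renamed file paths in the file list at the top, so treat it as an assumption):

```python
# Before (0.1.2): modules lived in a top-level "dataset" package, e.g. src/dataset/core.py,
# so imports presumably looked like this (inferred, not shown verbatim in the diff):
# from dataset.core import Datadoc

# After (0.2.1): everything sits under the "dapla_metadata" namespace shipped from src/.
from dapla_metadata.datasets.core import Datadoc
from dapla_metadata.datasets.utility.enums import SupportedLanguages
```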
{dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/code_list.py

@@ -4,8 +4,8 @@ import logging
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
-from
-from
+from dapla_metadata.datasets.external_sources.external_sources import GetExternalSource
+from dapla_metadata.datasets.utility.enums import SupportedLanguages
 
 if TYPE_CHECKING:
     from concurrent.futures import ThreadPoolExecutor
{dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/config.py

@@ -10,10 +10,11 @@ from pprint import pformat
 from dotenv import dotenv_values
 from dotenv import load_dotenv
 
-from
-
-
-
+from dapla_metadata.datasets.utility.constants import (
+    DATADOC_STATISTICAL_SUBJECT_SOURCE_URL,
+)
+from dapla_metadata.datasets.utility.enums import DaplaRegion
+from dapla_metadata.datasets.utility.enums import DaplaService
 
 logger = logging.getLogger(__name__)
 
@@ -52,7 +53,10 @@ def get_jupyterhub_user() -> str | None:
 
 def get_statistical_subject_source_url() -> str | None:
     """Get the URL to the statistical subject source."""
-    return
+    return (
+        _get_config_item("DATADOC_STATISTICAL_SUBJECT_SOURCE_URL")
+        or DATADOC_STATISTICAL_SUBJECT_SOURCE_URL
+    )
 
 
 def get_dapla_region() -> DaplaRegion | None:
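`get_statistical_subject_source_url` now falls back to the packaged `DATADOC_STATISTICAL_SUBJECT_SOURCE_URL` constant instead of returning `None` when no config item is set, while an explicitly configured value still wins. The precedence can be restated in isolation as below; the direct `os.environ` lookup and the placeholder default URL are illustrative assumptions, since the real module resolves config items through `_get_config_item` and python-dotenv:

```python
import os

# Placeholder default; the real DATADOC_STATISTICAL_SUBJECT_SOURCE_URL constant
# lives in dapla_metadata.datasets.utility.constants and its value is not shown here.
DEFAULT_SUBJECT_SOURCE_URL = "https://example.com/statistical-subjects.xml"


def get_statistical_subject_source_url() -> str:
    """Return the configured subject source URL, or the packaged default."""
    configured = os.environ.get("DATADOC_STATISTICAL_SUBJECT_SOURCE_URL")
    return configured or DEFAULT_SUBJECT_SOURCE_URL
```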
{dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/core.py

@@ -13,28 +13,37 @@ from typing import TYPE_CHECKING
 from datadoc_model import model
 from datadoc_model.model import DataSetStatus
 
-from
-from
-from
-from
-from
-
-
-from
-from
-from
-from
-
-
-from
-from
-from
-from
-from
-from
-from
-from
-from
+from dapla_metadata.datasets import config
+from dapla_metadata.datasets import user_info
+from dapla_metadata.datasets.dapla_dataset_path_info import DaplaDatasetPathInfo
+from dapla_metadata.datasets.dataset_parser import DatasetParser
+from dapla_metadata.datasets.model_backwards_compatibility import (
+    is_metadata_in_container_structure,
+)
+from dapla_metadata.datasets.model_backwards_compatibility import upgrade_metadata
+from dapla_metadata.datasets.model_validation import ValidateDatadocMetadata
+from dapla_metadata.datasets.statistic_subject_mapping import StatisticSubjectMapping
+from dapla_metadata.datasets.utility.constants import (
+    DEFAULT_SPATIAL_COVERAGE_DESCRIPTION,
+)
+from dapla_metadata.datasets.utility.constants import INCONSISTENCIES_MESSAGE
+from dapla_metadata.datasets.utility.constants import METADATA_DOCUMENT_FILE_SUFFIX
+from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_DATASET_FIELDS
+from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_VARIABLES_FIELDS
+from dapla_metadata.datasets.utility.utils import calculate_percentage
+from dapla_metadata.datasets.utility.utils import derive_assessment_from_state
+from dapla_metadata.datasets.utility.utils import get_timestamp_now
+from dapla_metadata.datasets.utility.utils import merge_variables
+from dapla_metadata.datasets.utility.utils import normalize_path
+from dapla_metadata.datasets.utility.utils import (
+    num_obligatory_dataset_fields_completed,
+)
+from dapla_metadata.datasets.utility.utils import (
+    num_obligatory_variables_fields_completed,
+)
+from dapla_metadata.datasets.utility.utils import override_dataset_fields
+from dapla_metadata.datasets.utility.utils import set_default_values_dataset
+from dapla_metadata.datasets.utility.utils import set_default_values_variables
 
 if TYPE_CHECKING:
     import pathlib
@@ -138,10 +147,11 @@ class Datadoc:
         """
         extracted_metadata: model.DatadocMetadata | None = None
         existing_metadata: model.DatadocMetadata | None = None
-        if self.metadata_document
+        if self.metadata_document and self.metadata_document.exists():
             existing_metadata = self._extract_metadata_from_existing_document(
                 self.metadata_document,
             )
+
         if (
             self.dataset_path is not None
             and self.dataset == model.Dataset()
@@ -157,14 +167,7 @@ class Datadoc:
             and extracted_metadata is not None
             and existing_metadata is not None
         ):
-
-                extracted_metadata.dataset is not None
-                and extracted_metadata.dataset.file_path is not None
-            ):
-                existing_file_path = extracted_metadata.dataset.file_path
-            else:
-                msg = "Could not access existing dataset file path"
-                raise ValueError(msg)
+            existing_file_path = self._get_existing_file_path(extracted_metadata)
             self._check_ready_to_merge(
                 self.dataset_path,
                 Path(existing_file_path),
@@ -181,31 +184,39 @@ class Datadoc:
             self.metadata_document = self.build_metadata_document_path(
                 self.dataset_path,
             )
-
-
-
-
-
-
-
-
-
-
+            self._set_metadata(merged_metadata)
+        else:
+            self._set_metadata(existing_metadata or extracted_metadata)
+        set_default_values_variables(self.variables)
+        set_default_values_dataset(self.dataset)
+        self._create_variables_lookup()
+
+    def _get_existing_file_path(
+        self,
+        extracted_metadata: model.DatadocMetadata | None,
+    ) -> str:
+        if (
+            extracted_metadata is not None
+            and extracted_metadata.dataset is not None
+            and extracted_metadata.dataset.file_path is not None
        ):
-
-
-
-
-
-
+            return extracted_metadata.dataset.file_path
+        msg = "Could not access existing dataset file path"
+        raise ValueError(msg)
+
+    def _set_metadata(
+        self,
+        merged_metadata: model.DatadocMetadata | None,
+    ) -> None:
+        if not merged_metadata or not (
+            merged_metadata.dataset and merged_metadata.variables
        ):
-            self.dataset = extracted_metadata.dataset
-            self.variables = extracted_metadata.variables
-        else:
            msg = "Could not read metadata"
            raise ValueError(msg)
-
-
+        self.dataset = merged_metadata.dataset
+        self.variables = merged_metadata.variables
+
+    def _create_variables_lookup(self) -> None:
        self.variables_lookup = {
            v.short_name: v for v in self.variables if v.short_name
        }
@@ -300,55 +311,28 @@ class Datadoc:
                 "No existing metadata found, no merge to perform. Continuing with extracted metadata.",
             )
             return extracted_metadata or model.DatadocMetadata()
+
         if not extracted_metadata:
             return existing_metadata
+
         # Use the extracted metadata as a base
         merged_metadata = model.DatadocMetadata(
             dataset=copy.deepcopy(extracted_metadata.dataset),
             variables=[],
         )
-
-
-
-
-
-        for field in DATASET_FIELDS_FROM_EXISTING_METADATA:
-            setattr(
-                merged_metadata.dataset,
-                field,
-                getattr(existing_metadata.dataset, field),
-            )
+
+        override_dataset_fields(
+            merged_metadata=merged_metadata,
+            existing_metadata=existing_metadata,
+        )
 
         # Merge variables.
         # For each extracted variable, copy existing metadata into the merged metadata
-
-        existing_metadata
-
-
-
-        ):
-            for extracted in extracted_metadata.variables:
-                existing = next(
-                    (
-                        existing
-                        for existing in existing_metadata.variables
-                        if existing.short_name == extracted.short_name
-                    ),
-                    None,
-                )
-                if existing:
-                    existing.id = None  # Set to None so that it will be set assigned a fresh ID later
-                    existing.contains_data_from = (
-                        extracted.contains_data_from or existing.contains_data_from
-                    )
-                    existing.contains_data_until = (
-                        extracted.contains_data_until or existing.contains_data_until
-                    )
-                    merged_metadata.variables.append(existing)
-                else:
-                    # If there is no existing metadata for this variable, we just use what we have extracted
-                    merged_metadata.variables.append(extracted)
-        return merged_metadata
+        return merge_variables(
+            existing_metadata=existing_metadata,
+            extracted_metadata=extracted_metadata,
+            merged_metadata=merged_metadata,
+        )
 
     def _extract_metadata_from_existing_document(
         self,
{dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/dataset_parser.py

@@ -19,7 +19,7 @@ from datadoc_model.model import LanguageStringTypeItem
 from datadoc_model.model import Variable
 from pyarrow import parquet as pq
 
-from
+from dapla_metadata.datasets.utility.enums import SupportedLanguages
 
 if TYPE_CHECKING:
     import pyarrow as pa
{dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/model_validation.py

@@ -11,17 +11,23 @@ from datadoc_model import model
 from pydantic import model_validator
 from typing_extensions import Self
 
-from
-from
-from
-from
-from
-from
-
-
-from
-from
-from
+from dapla_metadata.datasets.utility.constants import DATE_VALIDATION_MESSAGE
+from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_DATASET_FIELDS
+from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_VARIABLES_FIELDS
+from dapla_metadata.datasets.utility.constants import OBLIGATORY_METADATA_WARNING
+from dapla_metadata.datasets.utility.utils import get_missing_obligatory_dataset_fields
+from dapla_metadata.datasets.utility.utils import (
+    get_missing_obligatory_variables_fields,
+)
+from dapla_metadata.datasets.utility.utils import get_timestamp_now
+from dapla_metadata.datasets.utility.utils import incorrect_date_order
+from dapla_metadata.datasets.utility.utils import (
+    num_obligatory_dataset_fields_completed,
+)
+from dapla_metadata.datasets.utility.utils import (
+    num_obligatory_variables_fields_completed,
+)
+from dapla_metadata.datasets.utility.utils import set_variables_inherit_from_dataset
 
 if TYPE_CHECKING:
     from datetime import datetime
{dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/statistic_subject_mapping.py

@@ -9,8 +9,8 @@ import requests
 from bs4 import BeautifulSoup
 from bs4 import ResultSet
 
-from
-from
+from dapla_metadata.datasets.external_sources.external_sources import GetExternalSource
+from dapla_metadata.datasets.utility.enums import SupportedLanguages
 
 if TYPE_CHECKING:
     from concurrent.futures import ThreadPoolExecutor
@@ -116,17 +116,18 @@ class StatisticSubjectMapping(GetExternalSource):
 
         Returns a BeautifulSoup ResultSet.
         """
+        if not self.source_url:
+            logger.debug("No statistic subject url supplied")
+            return None
+
         try:
-
-            response = requests.get(url, timeout=30)
+            response = requests.get(str(self.source_url), timeout=30)
             response.encoding = "utf-8"
-            logger.debug("Got response %s from %s", response,
+            logger.debug("Got response %s from %s", response, self.source_url)
             soup = BeautifulSoup(response.text, features="xml")
             return soup.find_all("hovedemne")
         except requests.exceptions.RequestException:
-            logger.exception(
-                "Exception while fetching statistical structure ",
-            )
+            logger.exception("Exception while fetching statistical structure")
             return None
 
     def _parse_statistic_subject_structure_xml(
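The fetch in `StatisticSubjectMapping` now returns early when no source URL is configured and casts the URL to `str` before handing it to `requests`. The same guard-then-fetch pattern, lifted out of the class for illustration, looks roughly like this (the function name and argument are assumptions; the request, XML parsing and `hovedemne` lookup mirror the diff above):

```python
import logging

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


def fetch_main_subjects(source_url: str | None) -> list | None:
    """Fetch the statistical structure XML and return its <hovedemne> elements."""
    if not source_url:
        # Mirrors the new guard: skip the network call entirely when unconfigured.
        logger.debug("No statistic subject url supplied")
        return None
    try:
        response = requests.get(str(source_url), timeout=30)
        response.encoding = "utf-8"
        soup = BeautifulSoup(response.text, features="xml")
        return soup.find_all("hovedemne")
    except requests.exceptions.RequestException:
        logger.exception("Exception while fetching statistical structure")
        return None
```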
{dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/user_info.py

@@ -6,9 +6,9 @@ from typing import Protocol
 
 import jwt
 
-from
-from
-from
+from dapla_metadata.datasets import config
+from dapla_metadata.datasets.utility.enums import DaplaRegion
+from dapla_metadata.datasets.utility.enums import DaplaService
 
 logger = logging.getLogger(__name__)
 
{dapla_toolbelt_metadata-0.1.2/src/dataset → dapla_toolbelt_metadata-0.2.1/src/dapla_metadata/datasets}/utility/utils.py

@@ -14,13 +14,20 @@ from datadoc_model.model import Assessment
 from datadoc_model.model import DataSetState
 from datadoc_model.model import VariableRole
 
-from
-
-
+from dapla_metadata.datasets.utility.constants import (
+    DATASET_FIELDS_FROM_EXISTING_METADATA,
+)
+from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_VARIABLES_FIELDS
+from dapla_metadata.datasets.utility.constants import (
+    OBLIGATORY_DATASET_METADATA_IDENTIFIERS,
+)
+from dapla_metadata.datasets.utility.constants import (
     OBLIGATORY_DATASET_METADATA_IDENTIFIERS_MULTILANGUAGE,
 )
-from
-
+from dapla_metadata.datasets.utility.constants import (
+    OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS,
+)
+from dapla_metadata.datasets.utility.constants import (
     OBLIGATORY_VARIABLES_METADATA_IDENTIFIERS_MULTILANGUAGE,
 )
 
@@ -403,3 +410,83 @@ def running_in_notebook() -> bool:
     # interpreters and will throw a NameError. Therefore we're not running
     # in Jupyter.
     return False
+
+
+def override_dataset_fields(
+    merged_metadata: model.DatadocMetadata,
+    existing_metadata: model.DatadocMetadata,
+) -> None:
+    """Overrides specific fields in the dataset of `merged_metadata` with values from the dataset of `existing_metadata`.
+
+    This function iterates over a predefined list of fields, `DATASET_FIELDS_FROM_EXISTING_METADATA`,
+    and sets the corresponding fields in the `merged_metadata.dataset` object to the values
+    from the `existing_metadata.dataset` object.
+
+    Args:
+        merged_metadata: An instance of `DatadocMetadata` containing the dataset to be updated.
+        existing_metadata: An instance of `DatadocMetadata` containing the dataset whose values are used to update `merged_metadata.dataset`.
+
+    Returns:
+        `None`.
+    """
+    if merged_metadata.dataset and existing_metadata.dataset:
+        # Override the fields as defined
+        for field in DATASET_FIELDS_FROM_EXISTING_METADATA:
+            setattr(
+                merged_metadata.dataset,
+                field,
+                getattr(existing_metadata.dataset, field),
+            )
+
+
+def merge_variables(
+    existing_metadata: model.DatadocMetadata,
+    extracted_metadata: model.DatadocMetadata,
+    merged_metadata: model.DatadocMetadata,
+) -> model.DatadocMetadata:
+    """Merges variables from the extracted metadata into the existing metadata and updates the merged metadata.
+
+    This function compares the variables from `extracted_metadata` with those in `existing_metadata`.
+    For each variable in `extracted_metadata`, it checks if a variable with the same `short_name` exists
+    in `existing_metadata`. If a match is found, it updates the existing variable with information from
+    `extracted_metadata`. If no match is found, the variable from `extracted_metadata` is directly added to `merged_metadata`.
+
+    Args:
+        existing_metadata: The metadata object containing the current state of variables.
+        extracted_metadata: The metadata object containing new or updated variables to merge.
+        merged_metadata: The metadata object that will contain the result of the merge.
+
+    Returns:
+        model.DatadocMetadata: The `merged_metadata` object containing variables from both `existing_metadata`
+        and `extracted_metadata`.
+    """
+    if (
+        existing_metadata.variables is not None
+        and extracted_metadata is not None
+        and extracted_metadata.variables is not None
+        and merged_metadata.variables is not None
+    ):
+        for extracted in extracted_metadata.variables:
+            existing = next(
+                (
+                    existing
+                    for existing in existing_metadata.variables
+                    if existing.short_name == extracted.short_name
+                ),
+                None,
+            )
+            if existing:
+                existing.id = (
+                    None  # Set to None so that it will be set assigned a fresh ID later
+                )
+                existing.contains_data_from = (
+                    extracted.contains_data_from or existing.contains_data_from
+                )
+                existing.contains_data_until = (
+                    extracted.contains_data_until or existing.contains_data_until
+                )
+                merged_metadata.variables.append(existing)
+            else:
+                # If there is no existing metadata for this variable, we just use what we have extracted
+                merged_metadata.variables.append(extracted)
+    return merged_metadata
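Together, `override_dataset_fields` and `merge_variables` reproduce the merge logic that previously lived inline in core.py: dataset fields named in `DATASET_FIELDS_FROM_EXISTING_METADATA` are carried over from the existing document, and variables are matched on `short_name`. A toy invocation might look like the sketch below (the variable names are made up, and it assumes `ssb-datadoc-model` is installed):

```python
from datadoc_model import model

from dapla_metadata.datasets.utility.utils import merge_variables

# Metadata read back from an existing metadata document.
existing = model.DatadocMetadata(
    variables=[model.Variable(short_name="income")],
)
# Metadata freshly extracted from the dataset file: one known and one new variable.
extracted = model.DatadocMetadata(
    variables=[model.Variable(short_name="income"), model.Variable(short_name="age")],
)
# The merge target starts from an empty variable list.
merged = model.DatadocMetadata(variables=[])

merge_variables(
    existing_metadata=existing,
    extracted_metadata=extracted,
    merged_metadata=merged,
)

# "income" keeps its existing documentation (with a reset id); "age" comes from the
# extracted metadata unchanged.
print([variable.short_name for variable in merged.variables])
```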
The remaining files listed above with +0 -0 were moved to the new package location without content changes.