dapla-toolbelt-metadata 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dapla-toolbelt-metadata might be problematic. Click here for more details.
- dapla_toolbelt_metadata-0.1.1.dist-info/LICENSE +21 -0
- dapla_toolbelt_metadata-0.1.1.dist-info/METADATA +125 -0
- dapla_toolbelt_metadata-0.1.1.dist-info/RECORD +21 -0
- dapla_toolbelt_metadata-0.1.1.dist-info/WHEEL +4 -0
- dataset/__init__.py +11 -0
- dataset/code_list.py +244 -0
- dataset/config.py +151 -0
- dataset/core.py +543 -0
- dataset/dapla_dataset_path_info.py +685 -0
- dataset/dataset_parser.py +241 -0
- dataset/external_sources/__init__.py +1 -0
- dataset/external_sources/external_sources.py +87 -0
- dataset/model_backwards_compatibility.py +520 -0
- dataset/model_validation.py +188 -0
- dataset/py.typed +0 -0
- dataset/statistic_subject_mapping.py +182 -0
- dataset/user_info.py +88 -0
- dataset/utility/__init__.py +1 -0
- dataset/utility/constants.py +92 -0
- dataset/utility/enums.py +35 -0
- dataset/utility/utils.py +405 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright © 2024 Statistics Norway
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: dapla-toolbelt-metadata
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Dapla Toolbelt Metadata
|
|
5
|
+
Home-page: https://github.com/statisticsnorway/dapla-toolbelt-metadata
|
|
6
|
+
License: MIT
|
|
7
|
+
Author: Team Metadata
|
|
8
|
+
Author-email: metadata@ssb.no
|
|
9
|
+
Requires-Python: >=3.10,<4.0
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Requires-Dist: arrow (>=1.3.0)
|
|
17
|
+
Requires-Dist: beautifulsoup4 (>=4.12.3)
|
|
18
|
+
Requires-Dist: black (>=24.8.0,<25.0.0)
|
|
19
|
+
Requires-Dist: bs4 (>=0.0.2,<0.0.3)
|
|
20
|
+
Requires-Dist: click (>=8.0.1)
|
|
21
|
+
Requires-Dist: cloudpathlib[gs] (>=0.17.0)
|
|
22
|
+
Requires-Dist: coverage (>=7.6.1,<8.0.0)
|
|
23
|
+
Requires-Dist: dapla-toolbelt (>=1.3.3)
|
|
24
|
+
Requires-Dist: faker (>=26.1.0,<27.0.0)
|
|
25
|
+
Requires-Dist: furo (>=2024.7.18,<2025.0.0)
|
|
26
|
+
Requires-Dist: gunicorn (>=21.2.0)
|
|
27
|
+
Requires-Dist: pandas (>=1.4.2)
|
|
28
|
+
Requires-Dist: pre-commit (>=3.8.0,<4.0.0)
|
|
29
|
+
Requires-Dist: pyarrow (>=8.0.0)
|
|
30
|
+
Requires-Dist: pydantic (>=2.5.2)
|
|
31
|
+
Requires-Dist: pygments (>=2.18.0,<3.0.0)
|
|
32
|
+
Requires-Dist: pyjwt (>=2.8.0)
|
|
33
|
+
Requires-Dist: pytest (>=8.3.2,<9.0.0)
|
|
34
|
+
Requires-Dist: pytest-mock (>=3.14.0,<4.0.0)
|
|
35
|
+
Requires-Dist: python-dotenv (>=1.0.1)
|
|
36
|
+
Requires-Dist: requests (>=2.31.0)
|
|
37
|
+
Requires-Dist: requests-mock (>=1.12.1,<2.0.0)
|
|
38
|
+
Requires-Dist: ruff (>=0.5.6,<0.6.0)
|
|
39
|
+
Requires-Dist: ssb-datadoc-model (>=6.0.0,<7.0.0)
|
|
40
|
+
Requires-Dist: ssb-klass-python (>=0.0.9)
|
|
41
|
+
Requires-Dist: types-beautifulsoup4 (>=4.12.0.20240511,<5.0.0.0)
|
|
42
|
+
Project-URL: Changelog, https://github.com/statisticsnorway/dapla-toolbelt-metadata/releases
|
|
43
|
+
Project-URL: Documentation, https://statisticsnorway.github.io/dapla-toolbelt-metadata
|
|
44
|
+
Project-URL: Repository, https://github.com/statisticsnorway/dapla-toolbelt-metadata
|
|
45
|
+
Description-Content-Type: text/markdown
|
|
46
|
+
|
|
47
|
+
# Dapla Toolbelt Metadata
|
|
48
|
+
|
|
49
|
+
[][pypi status]
|
|
50
|
+
[][pypi status]
|
|
51
|
+
[][pypi status]
|
|
52
|
+
[][license]
|
|
53
|
+
|
|
54
|
+
[][documentation]
|
|
55
|
+
[][tests]
|
|
56
|
+
[][sonarcov]
|
|
57
|
+
[][sonarquality]
|
|
58
|
+
|
|
59
|
+
[][pre-commit]
|
|
60
|
+
[][black]
|
|
61
|
+
[](https://github.com/astral-sh/ruff)
|
|
62
|
+
[][poetry]
|
|
63
|
+
|
|
64
|
+
[pypi status]: https://pypi.org/project/dapla-toolbelt-metadata/
|
|
65
|
+
[documentation]: https://statisticsnorway.github.io/dapla-toolbelt-metadata
|
|
66
|
+
[tests]: https://github.com/statisticsnorway/dapla-toolbelt-metadata/actions?workflow=Tests
|
|
67
|
+
|
|
68
|
+
[sonarcov]: https://sonarcloud.io/summary/overall?id=statisticsnorway_dapla-toolbelt-metadata
|
|
69
|
+
[sonarquality]: https://sonarcloud.io/summary/overall?id=statisticsnorway_dapla-toolbelt-metadata
|
|
70
|
+
[pre-commit]: https://github.com/pre-commit/pre-commit
|
|
71
|
+
[black]: https://github.com/psf/black
|
|
72
|
+
[poetry]: https://python-poetry.org/
|
|
73
|
+
|
|
74
|
+
## Features
|
|
75
|
+
|
|
76
|
+
- TODO
|
|
77
|
+
|
|
78
|
+
## Requirements
|
|
79
|
+
|
|
80
|
+
- TODO
|
|
81
|
+
- Python
|
|
82
|
+
|
|
83
|
+
## Installation
|
|
84
|
+
|
|
85
|
+
You can install _Dapla Toolbelt Metadata_ via [pip] from [PyPI]:
|
|
86
|
+
|
|
87
|
+
```console
|
|
88
|
+
pip install dapla-toolbelt-metadata
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Usage
|
|
92
|
+
|
|
93
|
+
Please see the [Reference Guide] for details.
|
|
94
|
+
|
|
95
|
+
## Contributing
|
|
96
|
+
|
|
97
|
+
Contributions are very welcome.
|
|
98
|
+
To learn more, see the [Contributor Guide].
|
|
99
|
+
|
|
100
|
+
## License
|
|
101
|
+
|
|
102
|
+
Distributed under the terms of the [MIT license][license],
|
|
103
|
+
_Dapla Toolbelt Metadata_ is free and open source software.
|
|
104
|
+
|
|
105
|
+
## Issues
|
|
106
|
+
|
|
107
|
+
If you encounter any problems,
|
|
108
|
+
please [file an issue] along with a detailed description.
|
|
109
|
+
|
|
110
|
+
## Credits
|
|
111
|
+
|
|
112
|
+
This project was generated from [Statistics Norway]'s [SSB PyPI Template].
|
|
113
|
+
|
|
114
|
+
[statistics norway]: https://www.ssb.no/en
|
|
115
|
+
[pypi]: https://pypi.org/
|
|
116
|
+
[ssb pypi template]: https://github.com/statisticsnorway/ssb-pypitemplate
|
|
117
|
+
[file an issue]: https://github.com/statisticsnorway/dapla-toolbelt-metadata/issues
|
|
118
|
+
[pip]: https://pip.pypa.io/
|
|
119
|
+
|
|
120
|
+
<!-- github-only -->
|
|
121
|
+
|
|
122
|
+
[license]: https://github.com/statisticsnorway/dapla-toolbelt-metadata/blob/main/LICENSE
|
|
123
|
+
[contributor guide]: https://github.com/statisticsnorway/dapla-toolbelt-metadata/blob/main/CONTRIBUTING.md
|
|
124
|
+
[reference guide]: https://statisticsnorway.github.io/dapla-toolbelt-metadata/reference.html
|
|
125
|
+
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
dataset/__init__.py,sha256=aa6dF2ddfeljw1SElaVqAF6YewuZ0Fxk3q7iebtWR2E,378
|
|
2
|
+
dataset/code_list.py,sha256=SqzassTXL-Gr4TqomXStJFiI5gSN0lXjbmZLDJadMrQ,9050
|
|
3
|
+
dataset/config.py,sha256=EWTmrkLWYKSHVrzEWQO16_CaWrxiUllAnRJFbvEoono,4239
|
|
4
|
+
dataset/core.py,sha256=E7OmFM1iTKlKuvQXuwhucO3z5pKg8fTkE2TmOYFi7_M,22654
|
|
5
|
+
dataset/dapla_dataset_path_info.py,sha256=7wwVwykJUaRbqCZrAMsZsOd1p_xO8bHe5LhNOLE8j6k,21600
|
|
6
|
+
dataset/dataset_parser.py,sha256=AvN4cKaDvP4VwplNR5uvXJdiZh4ippNcFTBll-HhH-4,7949
|
|
7
|
+
dataset/external_sources/__init__.py,sha256=qvIdXwqyEmXNUCB94ZtZXRzifdW4hiXASFFPtC70f6E,83
|
|
8
|
+
dataset/external_sources/external_sources.py,sha256=9eIcOIUbaodNX1w9Tj2wl4U4wUmr5kF1R0i01fKUzGs,2974
|
|
9
|
+
dataset/model_backwards_compatibility.py,sha256=69RKZwOrSyaBQvMCjOZiM-S-clVQu8cIKOUGGpI_87Y,19171
|
|
10
|
+
dataset/model_validation.py,sha256=uj98wiz9SWbJc_He3kGGejy4JIIXM6RKaSccJfmo6wc,6672
|
|
11
|
+
dataset/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
+
dataset/statistic_subject_mapping.py,sha256=aOKQLvVNF4XWqVrDXVFUz-Hj_me6JeWW_uEPAKJvVJk,6145
|
|
13
|
+
dataset/user_info.py,sha256=42PikdAQzC1FLOISC49yZO0IgVMWIq_QgxVD0xixaes,2541
|
|
14
|
+
dataset/utility/__init__.py,sha256=pp6tUcgUbo8iq9OPtFKQrTbLuI3uY7NHptwWSTpasOU,33
|
|
15
|
+
dataset/utility/constants.py,sha256=Wv1LIqq2P7ow6sToNdrTOAIMqvyPxNS2j6ArIB-GMds,2301
|
|
16
|
+
dataset/utility/enums.py,sha256=C-qlB9ZI4Oy3q1ehbuF0GD7lqJJbuaspY_e8BDFu5DU,727
|
|
17
|
+
dataset/utility/utils.py,sha256=j2A6DOgb4MmKaEGd5qW8DHxUsTZrZFLLAsvPW1BQIc0,14269
|
|
18
|
+
dapla_toolbelt_metadata-0.1.1.dist-info/LICENSE,sha256=np3IfD5m0ZUofn_kVzDZqliozuiO6wrktw3LRPjyEiI,1073
|
|
19
|
+
dapla_toolbelt_metadata-0.1.1.dist-info/METADATA,sha256=nmA0eQkasfLsUfoRtwKJBd9vD9m8mR9fC9z-AjZj-lM,5158
|
|
20
|
+
dapla_toolbelt_metadata-0.1.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
21
|
+
dapla_toolbelt_metadata-0.1.1.dist-info/RECORD,,
|
dataset/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Document dataset."""
|
|
2
|
+
|
|
3
|
+
from datadoc_model import model
|
|
4
|
+
|
|
5
|
+
from .core import Datadoc
|
|
6
|
+
from .dapla_dataset_path_info import DaplaDatasetPathInfo
|
|
7
|
+
from .model_validation import ObligatoryDatasetWarning
|
|
8
|
+
from .model_validation import ObligatoryVariableWarning
|
|
9
|
+
from .utility.enums import DaplaRegion
|
|
10
|
+
from .utility.enums import DaplaService
|
|
11
|
+
from .utility.enums import SupportedLanguages
|
dataset/code_list.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
from dataset.external_sources.external_sources import GetExternalSource
|
|
8
|
+
from dataset.utility.enums import SupportedLanguages
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
12
|
+
|
|
13
|
+
import pandas as pd
|
|
14
|
+
from klass.classes.classification import KlassClassification
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class CodeListItem:
    """Data structure for a code list item.

    Attributes:
        titles: A dictionary mapping language codes to titles.
        code: The code associated with the item.
    """

    titles: dict[SupportedLanguages, str]
    code: str

    def get_title(self, language: SupportedLanguages) -> str:
        """Return the title in the specified language.

        Args:
            language: The language code for which to get the title.

        Returns:
            The title in the requested language when available. Otherwise the
            Norwegian Bokmål ("nb") title is used as a fallback for either
            Norwegian language, and the English ("en") title for anything else.
            When no fallback title exists either, an empty string is returned
            and the failure is logged.
        """
        if language in self.titles:
            return self.titles[language]
        # Pick a fallback language: Bokmål covers both Norwegian variants,
        # English covers everything else.
        norwegian_variants = (
            SupportedLanguages.NORSK_BOKMÅL,
            SupportedLanguages.NORSK_NYNORSK,
        )
        fallback = (
            SupportedLanguages.NORSK_BOKMÅL
            if language in norwegian_variants
            else SupportedLanguages.ENGLISH
        )
        try:
            return self.titles[fallback]
        except KeyError:
            logger.exception(
                "Could not find title for subject %s and language: %s",
                self,
                language.name,
            )
            return ""
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class CodeList(GetExternalSource):
    """Class for retrieving classifications from Klass.

    This class fetches a classification given a classification ID
    and supports multiple languages.

    Attributes:
        supported_languages: A list of supported language codes.
        _classifications: A list to store classification items.
        classification_id: The ID of the classification to retrieve.
        classifications_dataframes: A dictionary to store dataframes of
            classifications.
    """

    def __init__(
        self,
        executor: ThreadPoolExecutor,
        classification_id: int | None,
    ) -> None:
        """Initialize the CodeList with the given classification ID and executor.

        Args:
            executor: An instance of ThreadPoolExecutor to manage the asynchronous
                execution of data fetching.
            classification_id: The ID of the classification to retrieve.
        """
        self.supported_languages = [
            SupportedLanguages.NORSK_BOKMÅL,
            SupportedLanguages.ENGLISH,
        ]
        self._classifications: list[CodeListItem] = []
        self.classification_id = classification_id
        self.classifications_dataframes: (
            dict[SupportedLanguages, pd.DataFrame] | None
        ) = None
        # The base class kicks off the asynchronous fetch on the executor.
        super().__init__(executor)

    def _fetch_data_from_external_source(
        self,
    ) -> dict[SupportedLanguages, pd.DataFrame] | None:
        """Fetch the classifications from Klass by classification ID.

        Retrieves the classification codes for every supported language and
        collects them in a dictionary keyed by language.

        Returns:
            A dictionary mapping language codes to pandas DataFrames containing
            the classification data for the given classification ID, or None if
            fetching fails for any language (the exception is logged).
        """
        classifications_dataframes: dict[SupportedLanguages, pd.DataFrame] = {}
        for language in self.supported_languages:
            try:
                classifications_dataframes[language] = (
                    KlassClassification(
                        str(self.classification_id),
                        language,
                    )
                    .get_codes()
                    .data
                )
            except Exception:  # noqa: PERF203
                logger.exception(
                    "Exception while getting classifications from Klass",
                )
                return None
        # BUG FIX: the original returned from inside the loop via an ambiguous
        # `else:` clause, leaving a trailing `return None` unreachable. Return
        # the complete mapping only after all languages have been fetched.
        return classifications_dataframes

    def _extract_titles(
        self,
        dataframes: dict[SupportedLanguages, pd.DataFrame],
    ) -> list[dict[SupportedLanguages, str]]:
        """Extract titles from the dataframes for each supported language.

        Processes the provided dataframes and extracts the title from each row
        for all supported languages, creating a list of dictionaries where each
        dictionary maps language codes to titles.

        Args:
            dataframes: A dictionary mapping language codes to pandas DataFrames
                containing classification data.

        Returns:
            A list of dictionaries, each mapping language codes to titles.
            If a title is not available in a dataframe, the corresponding
            dictionary value will be None.
        """
        languages = list(dataframes)
        # Row count is driven by the Bokmål dataframe; the other languages are
        # assumed to have matching rows — TODO confirm against Klass responses.
        num_rows = len(dataframes[SupportedLanguages.NORSK_BOKMÅL])
        list_of_titles = []
        for row_index in range(num_rows):
            titles = {
                language: (
                    dataframes[language].loc[:, "name"][row_index]
                    if "name" in dataframes[language]
                    else None
                )
                for language in languages
            }
            list_of_titles.append(titles)
        return list_of_titles

    def _create_code_list_from_dataframe(
        self,
        classifications_dataframes: dict[SupportedLanguages, pd.DataFrame],
    ) -> list[CodeListItem]:
        """Create a list of CodeListItem objects from the classification dataframes.

        Extracts titles from the provided dataframes and pairs them with their
        corresponding classification codes.

        Args:
            classifications_dataframes: A dictionary mapping language codes to
                pandas DataFrames containing classification data.

        Returns:
            A list of CodeListItem objects containing classification titles
            and codes.
        """
        classification_names = self._extract_titles(classifications_dataframes)
        bokmaal_dataframe = classifications_dataframes[
            SupportedLanguages.NORSK_BOKMÅL
        ]
        if "code" in bokmaal_dataframe:
            classification_codes: list = bokmaal_dataframe.loc[:, "code"].to_list()
        else:
            # No code column: pad with None so zip still pairs every title.
            classification_codes = [None] * len(classification_names)
        return [
            CodeListItem(titles, code)
            for titles, code in zip(
                classification_names,
                classification_codes,
                strict=False,
            )
        ]

    def _get_classification_dataframe_if_loaded(self) -> bool:
        """Check if the classification data from Klass is loaded.

        Verifies whether the classification data has been loaded. If not, it
        retrieves the data from the external source and populates the
        classifications, logging the outcome.

        Returns:
            True if the data is loaded and classifications are successfully
            extracted, False otherwise.
        """
        if self._classifications:
            # Already populated; nothing to do.
            return True
        self.classifications_dataframes = self.retrieve_external_data()
        if self.classifications_dataframes is not None:
            self._classifications = self._create_code_list_from_dataframe(
                self.classifications_dataframes,
            )
            logger.debug(
                "Thread finished. found %s classifications",
                len(self._classifications),
            )
            return True
        logger.warning(
            "Thread is not done. Cannot get classifications from the dataframe.",
        )
        return False

    @property
    def classifications(self) -> list[CodeListItem]:
        """Get the list of classifications.

        Returns:
            A list of CodeListItem objects.
        """
        self._get_classification_dataframe_if_loaded()
        logger.debug("Got %s classifications subjects", len(self._classifications))
        return self._classifications
|
dataset/config.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""Configuration management for dataset package."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from pprint import pformat
|
|
9
|
+
from typing import Literal
|
|
10
|
+
|
|
11
|
+
from dotenv import dotenv_values
|
|
12
|
+
from dotenv import load_dotenv
|
|
13
|
+
|
|
14
|
+
from dataset.utility.enums import DaplaRegion
|
|
15
|
+
from dataset.utility.enums import DaplaService
|
|
16
|
+
|
|
17
|
+
# NOTE(review): configuring the root logger at import time with DEBUG and
# force=True is unusual for a library module — it overrides any logging setup
# done by the consuming application. Confirm this is intentional.
logging.basicConfig(level=logging.DEBUG, force=True)

logger = logging.getLogger(__name__)

# Optional .env file shipped next to this module; loaded lazily by
# _load_dotenv_file() on first config access.
DOT_ENV_FILE_PATH = Path(__file__).parent.joinpath(".env")

# Names of the environment variables this module reads.
JUPYTERHUB_USER = "JUPYTERHUB_USER"
DAPLA_REGION = "DAPLA_REGION"
DAPLA_SERVICE = "DAPLA_SERVICE"

# Module-level guard so the .env file is only loaded once per process.
env_loaded = False
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _load_dotenv_file() -> None:
    """Load the package-local .env file, at most once per process.

    Does nothing when the file has already been loaded or does not exist.
    """
    global env_loaded  # noqa: PLW0603
    if env_loaded or not DOT_ENV_FILE_PATH.exists():
        return
    load_dotenv(DOT_ENV_FILE_PATH)
    env_loaded = True
    logger.info(
        "Loaded .env file with config keys: \n%s",
        pformat(list(dotenv_values(DOT_ENV_FILE_PATH).keys())),
    )
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _get_config_item(item: str) -> str | None:
    """Get a config item. Makes sure all access is logged."""
    _load_dotenv_file()
    result = os.getenv(item)
    logger.debug("Config accessed. %s", item)
    return result
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def get_jupyterhub_user() -> str | None:
    """Get the JupyterHub user name.

    Reads the ``JUPYTERHUB_USER`` environment variable (or .env entry);
    returns None when it is not set.
    """
    return _get_config_item(JUPYTERHUB_USER)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def get_datadoc_dataset_path() -> str | None:
    """Get the path to the dataset.

    Reads the ``DATADOC_DATASET_PATH`` environment variable; returns None
    when it is not set.
    """
    return _get_config_item("DATADOC_DATASET_PATH")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def get_log_level() -> int:
    """Get the log level.

    Reads ``DATADOC_LOG_LEVEL`` and maps it (case-insensitively) to the
    corresponding stdlib logging level. Falls back to INFO when the variable
    is unset, empty, or holds an unknown level name.

    Returns:
        The numeric logging level, e.g. ``logging.INFO`` (20).
    """
    # Use the stdlib constants instead of duplicating their numeric values.
    log_levels: dict[str, int] = {
        "CRITICAL": logging.CRITICAL,
        "ERROR": logging.ERROR,
        "WARNING": logging.WARNING,
        "INFO": logging.INFO,
        "DEBUG": logging.DEBUG,
    }
    if level_string := _get_config_item("DATADOC_LOG_LEVEL"):
        # dict.get replaces the original try/except KeyError with the same
        # INFO default for unknown names.
        return log_levels.get(level_string.upper(), logging.INFO)
    return logging.INFO
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def get_log_formatter() -> Literal["simple", "json"]:
    """Get log formatter configuration.

    JSON formatting is used when explicitly enabled via
    ``DATADOC_ENABLE_JSON_FORMATTING`` or when a Dapla region is detected;
    otherwise the simple formatter is used.
    """
    json_explicitly_enabled = (
        _get_config_item("DATADOC_ENABLE_JSON_FORMATTING") == "True"
    )
    # Short-circuit: the region lookup only runs when JSON is not already
    # enabled, preserving the original call pattern.
    if json_explicitly_enabled or get_dapla_region() is not None:
        return "json"
    return "simple"
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def get_jupyterhub_service_prefix() -> str | None:
    """Get the JupyterHub service prefix.

    Reads the ``JUPYTERHUB_SERVICE_PREFIX`` environment variable; returns
    None when it is not set.
    """
    return _get_config_item("JUPYTERHUB_SERVICE_PREFIX")
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def get_app_name() -> str:
    """Get the name of the app. Defaults to 'Datadoc'."""
    # Falls back to "Datadoc" when DATADOC_APP_NAME is unset or empty.
    return _get_config_item("DATADOC_APP_NAME") or "Datadoc"
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def get_jupyterhub_http_referrer() -> str | None:
    """Get the JupyterHub http referrer.

    Reads ``JUPYTERHUB_HTTP_REFERER`` (single-R spelling, matching the HTTP
    ``Referer`` header); returns None when it is not set.
    """
    return _get_config_item("JUPYTERHUB_HTTP_REFERER")
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def get_port() -> int:
    """Get the port to run the app on.

    Defaults to 7002 when ``DATADOC_PORT`` is unset or empty.

    Raises:
        ValueError: If ``DATADOC_PORT`` is set to a non-integer value.
    """
    return int(_get_config_item("DATADOC_PORT") or 7002)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def get_statistical_subject_source_url() -> str | None:
    """Get the URL to the statistical subject source.

    Reads ``DATADOC_STATISTICAL_SUBJECT_SOURCE_URL``; returns None when it
    is not set.
    """
    return _get_config_item("DATADOC_STATISTICAL_SUBJECT_SOURCE_URL")
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def get_dapla_region() -> DaplaRegion | None:
    """Get the Dapla region we're running on.

    Returns:
        The region parsed from the ``DAPLA_REGION`` environment variable,
        or None when it is not set.
    """
    region = _get_config_item(DAPLA_REGION)
    return DaplaRegion(region) if region else None
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def get_dapla_service() -> DaplaService | None:
    """Get the Dapla service we're running on.

    Returns:
        The service parsed from the ``DAPLA_SERVICE`` environment variable,
        or None when it is not set.
    """
    service = _get_config_item(DAPLA_SERVICE)
    return DaplaService(service) if service else None
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def get_oidc_token() -> str | None:
    """Get the JWT token from the environment.

    Reads the ``OIDC_TOKEN`` environment variable; returns None when it is
    not set.
    """
    return _get_config_item("OIDC_TOKEN")
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def get_unit_code() -> int:
    """The code for the Unit Type code list in Klass.

    Defaults to 702 when ``DATADOC_UNIT_CODE`` is unset or empty. Always
    returns an int (raises ValueError for non-numeric values), so the
    return annotation is narrowed from ``int | None``.
    """
    return int(_get_config_item("DATADOC_UNIT_CODE") or 702)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def get_measurement_unit_code() -> int:
    """The code for the Measurement Unit code list in Klass.

    Defaults to 303 when ``DATADOC_MEASUREMENT_UNIT`` is unset or empty.
    Always returns an int (raises ValueError for non-numeric values), so the
    return annotation is narrowed from ``int | None``.
    """
    return int(_get_config_item("DATADOC_MEASUREMENT_UNIT") or 303)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def get_organisational_unit_code() -> int:
    """The code for the organisational units code list in Klass.

    Defaults to 83 when ``DATADOC_ORGANISATIONAL_UNIT_CODE`` is unset or
    empty. Always returns an int (raises ValueError for non-numeric values),
    so the return annotation is narrowed from ``int | None``.
    """
    return int(_get_config_item("DATADOC_ORGANISATIONAL_UNIT_CODE") or 83)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def get_data_source_code() -> int:
    """The code for the data source code list in Klass.

    NOTE(review): the original docstring said "organisational units", which
    looks like a copy-paste from get_organisational_unit_code; the env var
    and default (712) refer to the data source list — confirm.

    Defaults to 712 when ``DATADOC_DATA_SOURCE_CODE`` is unset or empty.
    Always returns an int, so the return annotation is narrowed from
    ``int | None``.
    """
    return int(_get_config_item("DATADOC_DATA_SOURCE_CODE") or 712)