dapla-toolbelt-metadata 0.1.1__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registry.
Potentially problematic release.
This version of dapla-toolbelt-metadata might be problematic.
- dapla_toolbelt_metadata-0.1.1.dist-info/LICENSE +21 -0
- dapla_toolbelt_metadata-0.1.1.dist-info/METADATA +125 -0
- dapla_toolbelt_metadata-0.1.1.dist-info/RECORD +21 -0
- dapla_toolbelt_metadata-0.1.1.dist-info/WHEEL +4 -0
- dataset/__init__.py +11 -0
- dataset/code_list.py +244 -0
- dataset/config.py +151 -0
- dataset/core.py +543 -0
- dataset/dapla_dataset_path_info.py +685 -0
- dataset/dataset_parser.py +241 -0
- dataset/external_sources/__init__.py +1 -0
- dataset/external_sources/external_sources.py +87 -0
- dataset/model_backwards_compatibility.py +520 -0
- dataset/model_validation.py +188 -0
- dataset/py.typed +0 -0
- dataset/statistic_subject_mapping.py +182 -0
- dataset/user_info.py +88 -0
- dataset/utility/__init__.py +1 -0
- dataset/utility/constants.py +92 -0
- dataset/utility/enums.py +35 -0
- dataset/utility/utils.py +405 -0
dataset/dataset_parser.py
@@ -0,0 +1,241 @@
"""Abstractions for dataset file formats.

Handles reading in the data and transforming data types to generic metadata types.
"""

from __future__ import annotations

import pathlib  # noqa: TCH003 import is needed for docs build
import re
import typing as t
from abc import ABC
from abc import abstractmethod
from typing import TYPE_CHECKING

import pandas as pd
from datadoc_model.model import DataType
from datadoc_model.model import LanguageStringType
from datadoc_model.model import LanguageStringTypeItem
from datadoc_model.model import Variable
from pyarrow import parquet as pq

from dataset.utility.enums import SupportedLanguages

if TYPE_CHECKING:
    import pyarrow as pa
    from cloudpathlib import CloudPath

KNOWN_INTEGER_TYPES = (
    "int",
    "int_",
    "int8",
    "int16",
    "int32",
    "int64",
    "integer",
    "long",
    "uint",
    "uint8",
    "uint16",
    "uint32",
    "uint64",
)

KNOWN_FLOAT_TYPES = (
    "double",
    "float",
    "float_",
    "float16",
    "float32",
    "float64",
    "decimal",
    "number",
    "numeric",
    "num",
)

KNOWN_STRING_TYPES = (
    "string",
    "str",
    "char",
    "varchar",
    "varchar2",
    "text",
    "txt",
    "bytes",
)

KNOWN_DATETIME_TYPES = (
    "timestamp",
    "timestamp[us]",
    "timestamp[ns]",
    "datetime64",
    "datetime64[ns]",
    "datetime64[us]",
    "date",
    "datetime",
    "time",
)

KNOWN_BOOLEAN_TYPES = ("bool", "bool_", "boolean")


TYPE_CORRESPONDENCE: list[tuple[tuple[str, ...], DataType]] = [
    (KNOWN_INTEGER_TYPES, DataType.INTEGER),
    (KNOWN_FLOAT_TYPES, DataType.FLOAT),
    (KNOWN_STRING_TYPES, DataType.STRING),
    (KNOWN_DATETIME_TYPES, DataType.DATETIME),
    (KNOWN_BOOLEAN_TYPES, DataType.BOOLEAN),
]
TYPE_MAP: dict[str, DataType] = {}
for concrete_type, abstract_type in TYPE_CORRESPONDENCE:
    TYPE_MAP.update({c: abstract_type for c in concrete_type})

TDatasetParser = t.TypeVar("TDatasetParser", bound="DatasetParser")


class DatasetParser(ABC):
    """Abstract base class for all dataset parsers.

    Implements:
    - A static factory method to get the correct implementation for each file extension.
    - A static method for data type conversion.

    Requires implementation by subclasses:
    - A method to extract variables (columns) from the dataset, so they may be documented.
    """

    def __init__(self, dataset: pathlib.Path | CloudPath) -> None:
        """Initialize for a given dataset."""
        self.dataset = dataset

    @staticmethod
    def for_file(dataset: pathlib.Path | CloudPath) -> DatasetParser:
        """Return the correct subclass based on the given dataset file."""
        supported_file_types: dict[
            str,
            type[DatasetParser],
        ] = {
            ".parquet": DatasetParserParquet,
            ".sas7bdat": DatasetParserSas7Bdat,
            ".parquet.gzip": DatasetParserParquet,
        }
        file_type = "Unknown"
        try:
            file_type = dataset.suffix
            # Gzipped parquet files can be read with DatasetParserParquet
            match = re.search(r"(.parquet.gzip)", str(dataset).lower())
            file_type = ".parquet.gzip" if match else file_type
            # Extract the appropriate reader class from the supported_file_types dict and return an instance of it
            reader = supported_file_types[file_type](dataset)
        except IndexError as e:
            # Raised when just one element is returned from split, meaning no file extension was supplied
            msg = f"Could not recognise file type for provided {dataset = }. Supported file types are: {', '.join(supported_file_types.keys())}"
            raise FileNotFoundError(
                msg,
            ) from e
        except KeyError as e:
            # In this case the file type is not supported, so we raise a helpful exception
            msg = f"{file_type = } is not supported. Please open one of the following supported file types: {', '.join(supported_file_types.keys())} or contact the maintainers to request support."
            raise NotImplementedError(
                msg,
            ) from e
        else:
            return reader

    @staticmethod
    def transform_data_type(data_type: str) -> DataType | None:
        """Transform a concrete data type to an abstract data type.

        In statistical metadata, one is not interested in how the data is
        technically stored, but in the meaning of the data type. Because of
        this, we transform known data types to their abstract metadata
        representations.

        If we encounter a data type we don't know, we just ignore it and let
        the user handle it in the GUI.

        Arguments:
            data_type: The concrete data type to map.
        """
        return TYPE_MAP.get(data_type.lower(), None)

    @abstractmethod
    def get_fields(self) -> list[Variable]:
        """Abstract method, must be implemented by subclasses."""


class DatasetParserParquet(DatasetParser):
    """Concrete implementation for parsing parquet files."""

    def __init__(self, dataset: pathlib.Path | CloudPath) -> None:
        """Call the super init method for initialization.

        Args:
            dataset: Path to the dataset to parse.
        """
        super().__init__(dataset)

    def get_fields(self) -> list[Variable]:
        """Extract the fields from this dataset."""
        with self.dataset.open(mode="rb") as f:
            schema: pa.Schema = pq.read_schema(f)  # type: ignore [arg-type]
        return [
            Variable(
                short_name=data_field.name.strip(),
                data_type=self.transform_data_type(str(data_field.type)),
            )
            for data_field in schema
            if data_field.name
            != "__index_level_0__"  # Index columns should not be documented
        ]


class DatasetParserSas7Bdat(DatasetParser):
    """Concrete implementation for parsing SAS7BDAT files."""

    def __init__(self, dataset: pathlib.Path | CloudPath) -> None:
        """Call the super init method for initialization.

        Args:
            dataset: Path to the dataset to parse.
        """
        super().__init__(dataset)

    def get_fields(self) -> list[Variable]:
        """Extract the fields from this dataset."""
        fields = []
        with self.dataset.open(mode="rb") as f:
            # Use an iterator to avoid reading in the entire dataset
            sas_reader = pd.read_sas(f, format="sas7bdat", iterator=True)

            # Get the first row from the iterator
            try:
                row = next(sas_reader)
            except StopIteration as e:
                msg = f"Could not read data from {self.dataset}"
                raise RuntimeError(msg) from e

            # Get all the values from the row and loop through them
            for i, v in enumerate(row.to_numpy().tolist()[0]):
                fields.append(
                    Variable(
                        short_name=sas_reader.columns[i].name,  # type: ignore [attr-defined]
                        # Assume labels are defined in the default language (NORSK_BOKMÅL)
                        # If this is not correct, the user may fix it via the UI
                        name=LanguageStringType(
                            [
                                LanguageStringTypeItem(
                                    languageCode=SupportedLanguages.NORSK_BOKMÅL.value,
                                    languageText=sas_reader.columns[  # type: ignore [attr-defined]
                                        i
                                    ].label,
                                ),
                            ],
                        ),
                        # Access the Python type for the value and transform it to a DataDoc data type
                        data_type=self.transform_data_type(type(v).__name__.lower()),  # type: ignore # noqa: PGH003
                    ),
                )

        return fields
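For orientation, a minimal usage sketch of the factory and type mapping above (not part of the package diff; the file name is hypothetical):

import pathlib

from dataset.dataset_parser import DatasetParser

# Hypothetical local file; .parquet, .parquet.gzip and .sas7bdat are the supported extensions.
dataset_path = pathlib.Path("person_data_v1.parquet")

# for_file() inspects the extension and returns a DatasetParserParquet instance for this path.
parser = DatasetParser.for_file(dataset_path)

# Each Variable carries the column name and the abstract DataType,
# e.g. an "int64" parquet column maps to DataType.INTEGER via TYPE_MAP.
for variable in parser.get_fields():
    print(variable.short_name, variable.data_type)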
dataset/external_sources/__init__.py
@@ -0,0 +1 @@
"""Abstract parent class for interacting with external resources asynchronously."""
dataset/external_sources/external_sources.py
@@ -0,0 +1,87 @@
from __future__ import annotations

import logging
from abc import ABC
from abc import abstractmethod
from typing import TYPE_CHECKING
from typing import Generic
from typing import TypeVar

if TYPE_CHECKING:
    from concurrent.futures import ThreadPoolExecutor

logger = logging.getLogger(__name__)

T = TypeVar("T")


class GetExternalSource(ABC, Generic[T]):
    """Abstract base class for retrieving data from external sources asynchronously.

    This class provides methods to initiate an asynchronous data retrieval
    operation, check its status, and retrieve the result once the operation
    completes. Subclasses must implement the `_fetch_data_from_external_source`
    method to define how data is fetched from the specific external source.
    """

    def __init__(self, executor: ThreadPoolExecutor) -> None:
        """Initialize the GetExternalSource with an executor to manage asynchronous tasks.

        This constructor initializes a future object that will hold the result of the
        asynchronous data fetching operation from an external source.

        Args:
            executor: An instance of ThreadPoolExecutor to manage the asynchronous
                execution of data fetching.
        """
        self.future = executor.submit(
            self._fetch_data_from_external_source,
        )

    def wait_for_external_result(self) -> None:
        """Wait for the thread responsible for loading the external request to finish.

        If there is no future to wait for, it logs a warning and returns immediately.
        """
        if not self.future:
            logger.warning("No future to wait for.")
            return
        self.future.result()

    def check_if_external_data_is_loaded(self) -> bool:
        """Check if the thread getting the external data has finished running.

        Returns:
            True if the data fetching operation is complete, False otherwise.
        """
        if self.future:
            return self.future.done()
        return False

    def retrieve_external_data(self) -> T | None:
        """Retrieve the result of the data fetching operation.

        This method checks if the asynchronous data fetching operation has
        completed. If the operation is finished, it returns the result.
        Otherwise, it returns None.

        Returns:
            The result of the data fetching operation if it is complete or None
            if the operation has not yet finished.
        """
        if self.future:
            return self.future.result()
        return None

    @abstractmethod
    def _fetch_data_from_external_source(self) -> T | None:
        """Handle external data retrieval.

        Abstract method to be implemented in the subclass.
        This method should define the logic for retrieving data from the specific
        external source.

        Returns:
            The data retrieved from the external source.
        """
        raise NotImplementedError
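A rough sketch of how GetExternalSource is meant to be subclassed (not part of the package diff; the GetGreeting class and its return value are hypothetical):

from __future__ import annotations

from concurrent.futures import ThreadPoolExecutor

from dataset.external_sources.external_sources import GetExternalSource


class GetGreeting(GetExternalSource[str]):
    """Hypothetical subclass; a real one would call an HTTP API or read a remote resource."""

    def _fetch_data_from_external_source(self) -> str | None:
        return "hello from an external source"


executor = ThreadPoolExecutor(max_workers=1)
source = GetGreeting(executor)  # __init__ submits the fetch to the executor immediately

source.wait_for_external_result()  # block until the future completes
if source.check_if_external_data_is_loaded():
    print(source.retrieve_external_data())  # "hello from an external source"

Because the fetch is submitted in the constructor, calling code can continue with other work and only call retrieve_external_data() once the value is actually needed.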