dapla-toolbelt-metadata 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dapla-toolbelt-metadata might be problematic.

@@ -0,0 +1,241 @@
+ """Abstractions for dataset file formats.
+
+ Handles reading in the data and transforming data types to generic metadata types.
+ """
+
+ from __future__ import annotations
+
+ import pathlib  # noqa: TCH003 import is needed for docs build
+ import re
+ import typing as t
+ from abc import ABC
+ from abc import abstractmethod
+ from typing import TYPE_CHECKING
+
+ import pandas as pd
+ from datadoc_model.model import DataType
+ from datadoc_model.model import LanguageStringType
+ from datadoc_model.model import LanguageStringTypeItem
+ from datadoc_model.model import Variable
+ from pyarrow import parquet as pq
+
+ from dataset.utility.enums import SupportedLanguages
+
+ if TYPE_CHECKING:
+     import pyarrow as pa
+     from cloudpathlib import CloudPath
+
+ KNOWN_INTEGER_TYPES = (
+     "int",
+     "int_",
+     "int8",
+     "int16",
+     "int32",
+     "int64",
+     "integer",
+     "long",
+     "uint",
+     "uint8",
+     "uint16",
+     "uint32",
+     "uint64",
+ )
+
+ KNOWN_FLOAT_TYPES = (
+     "double",
+     "float",
+     "float_",
+     "float16",
+     "float32",
+     "float64",
+     "decimal",
+     "number",
+     "numeric",
+     "num",
+ )
+
+ KNOWN_STRING_TYPES = (
+     "string",
+     "str",
+     "char",
+     "varchar",
+     "varchar2",
+     "text",
+     "txt",
+     "bytes",
+ )
+
+ KNOWN_DATETIME_TYPES = (
+     "timestamp",
+     "timestamp[us]",
+     "timestamp[ns]",
+     "datetime64",
+     "datetime64[ns]",
+     "datetime64[us]",
+     "date",
+     "datetime",
+     "time",
+ )
+
+ KNOWN_BOOLEAN_TYPES = ("bool", "bool_", "boolean")
+
+
+ TYPE_CORRESPONDENCE: list[tuple[tuple[str, ...], DataType]] = [
+     (KNOWN_INTEGER_TYPES, DataType.INTEGER),
+     (KNOWN_FLOAT_TYPES, DataType.FLOAT),
+     (KNOWN_STRING_TYPES, DataType.STRING),
+     (KNOWN_DATETIME_TYPES, DataType.DATETIME),
+     (KNOWN_BOOLEAN_TYPES, DataType.BOOLEAN),
+ ]
+ TYPE_MAP: dict[str, DataType] = {}
+ for concrete_type, abstract_type in TYPE_CORRESPONDENCE:
+     TYPE_MAP.update({c: abstract_type for c in concrete_type})
+
+ TDatasetParser = t.TypeVar("TDatasetParser", bound="DatasetParser")
+
+
+ class DatasetParser(ABC):
+     """Abstract Base Class for all Dataset parsers.
+
+     Implements:
+     - A static factory method to get the correct implementation for each file extension.
+     - A static method for data type conversion.
+
+     Requires implementation by subclasses:
+     - A method to extract variables (columns) from the dataset, so they may be documented.
+     """
+
+     def __init__(self, dataset: pathlib.Path | CloudPath) -> None:
+         """Initialize for a given dataset."""
+         self.dataset = dataset
+
+     @staticmethod
+     def for_file(dataset: pathlib.Path | CloudPath) -> DatasetParser:
+         """Return the correct subclass based on the given dataset file."""
+         supported_file_types: dict[
+             str,
+             type[DatasetParser],
+         ] = {
+             ".parquet": DatasetParserParquet,
+             ".sas7bdat": DatasetParserSas7Bdat,
+             ".parquet.gzip": DatasetParserParquet,
+         }
+         file_type = "Unknown"
+         try:
+             file_type = dataset.suffix
+             # Gzipped parquet files can be read with DatasetParserParquet
+             match = re.search(r"(\.parquet\.gzip)", str(dataset).lower())
+             file_type = ".parquet.gzip" if match else file_type
+             # Extract the appropriate reader class from the supported_file_types dict and return an instance of it
+             reader = supported_file_types[file_type](dataset)
+         except IndexError as e:
+             # Raised when no file extension could be determined from the path
+             msg = f"Could not recognise file type for provided {dataset = }. Supported file types are: {', '.join(supported_file_types.keys())}"
+             raise FileNotFoundError(
+                 msg,
+             ) from e
+         except KeyError as e:
+             # In this case the file type is not supported, so we raise a helpful exception
+             msg = f"{file_type = } is not supported. Please open one of the following supported file types: {', '.join(supported_file_types.keys())} or contact the maintainers to request support."
+             raise NotImplementedError(
+                 msg,
+             ) from e
+         else:
+             return reader
+
+     @staticmethod
+     def transform_data_type(data_type: str) -> DataType | None:
+         """Transform a concrete data type to an abstract data type.
+
+         In statistical metadata, one is not interested in how the data is
+         technically stored, but in the meaning of the data type. Because of
+         this, we transform known data types to their abstract metadata
+         representations.
+
+         If we encounter a data type we don't know, we just ignore it and let
+         the user handle it in the GUI.
+
+         Arguments:
+             data_type: The concrete data type to map.
+         """
+         return TYPE_MAP.get(data_type.lower(), None)
+
+     @abstractmethod
+     def get_fields(self) -> list[Variable]:
+         """Abstract method, must be implemented by subclasses."""
+
+
+ class DatasetParserParquet(DatasetParser):
+     """Concrete implementation for parsing parquet files."""
+
+     def __init__(self, dataset: pathlib.Path | CloudPath) -> None:
+         """Call the super init method for initialization.
+
+         Args:
+             dataset: Path to the dataset to parse.
+         """
+         super().__init__(dataset)
+
+     def get_fields(self) -> list[Variable]:
+         """Extract the fields from this dataset."""
+         with self.dataset.open(mode="rb") as f:
+             schema: pa.Schema = pq.read_schema(f)  # type: ignore [arg-type]
+         return [
+             Variable(
+                 short_name=data_field.name.strip(),
+                 data_type=self.transform_data_type(str(data_field.type)),
+             )
+             for data_field in schema
+             if data_field.name
+             != "__index_level_0__"  # Index columns should not be documented
+         ]
+
+
+ class DatasetParserSas7Bdat(DatasetParser):
+     """Concrete implementation for parsing SAS7BDAT files."""
+
+     def __init__(self, dataset: pathlib.Path | CloudPath) -> None:
+         """Call the super init method for initialization.
+
+         Args:
+             dataset: Path to the dataset to parse.
+         """
+         super().__init__(dataset)
+
+     def get_fields(self) -> list[Variable]:
+         """Extract the fields from this dataset."""
+         fields = []
+         with self.dataset.open(mode="rb") as f:
+             # Use an iterator to avoid reading in the entire dataset
+             sas_reader = pd.read_sas(f, format="sas7bdat", iterator=True)
+
+             # Get the first row from the iterator
+             try:
+                 row = next(sas_reader)
+             except StopIteration as e:
+                 msg = f"Could not read data from {self.dataset}"
+                 raise RuntimeError(msg) from e
+
+             # Get all the values from the row and loop through them
+             for i, v in enumerate(row.to_numpy().tolist()[0]):
+                 fields.append(
+                     Variable(
+                         short_name=sas_reader.columns[i].name,  # type: ignore [attr-defined]
+                         # Assume labels are defined in the default language (NORSK_BOKMÅL)
+                         # If this is not correct, the user may fix it via the UI
+                         name=LanguageStringType(
+                             [
+                                 LanguageStringTypeItem(
+                                     languageCode=SupportedLanguages.NORSK_BOKMÅL.value,
+                                     languageText=sas_reader.columns[  # type: ignore [attr-defined]
+                                         i
+                                     ].label,
+                                 ),
+                             ],
+                         ),
+                         # Access the python type for the value and transform it to a DataDoc Data type
+                         data_type=self.transform_data_type(type(v).__name__.lower()),  # type: ignore # noqa: PGH003
+                     ),
+                 )
+
+         return fields
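
For orientation, here is a minimal usage sketch of the parser factory defined above. The dataset file name and the printed columns are hypothetical; the behaviour described in the comments follows from the code in this module.

import pathlib

# Hypothetical dataset path; any supported extension (.parquet, .parquet.gzip, .sas7bdat) works.
dataset_path = pathlib.Path("person_data_v1.parquet")

# for_file() picks DatasetParserParquet here; unsupported extensions raise NotImplementedError.
parser = DatasetParser.for_file(dataset_path)

# get_fields() returns one Variable per column, with data_type mapped via TYPE_MAP
# (unknown concrete types map to None and are left for the user to fill in via the GUI).
for variable in parser.get_fields():
    print(variable.short_name, variable.data_type)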
@@ -0,0 +1 @@
+ """Abstract parent class for interacting with external resources asynchronously."""
@@ -0,0 +1,87 @@
+ from __future__ import annotations
+
+ import logging
+ from abc import ABC
+ from abc import abstractmethod
+ from typing import TYPE_CHECKING
+ from typing import Generic
+ from typing import TypeVar
+
+ if TYPE_CHECKING:
+     from concurrent.futures import ThreadPoolExecutor
+
+ logger = logging.getLogger(__name__)
+
+ T = TypeVar("T")
+
+
+ class GetExternalSource(ABC, Generic[T]):
+     """Abstract base class for retrieving data from external sources asynchronously.
+
+     This class provides methods to initiate an asynchronous data retrieval
+     operation, check its status, and retrieve the result once the operation
+     completes. Subclasses must implement the `_fetch_data_from_external_source`
+     method to define how data is fetched from the specific external source.
+     """
+
+     def __init__(self, executor: ThreadPoolExecutor) -> None:
+         """Initialize the GetExternalSource with an executor to manage asynchronous tasks.
+
+         This constructor initializes a future object that will hold the result of the
+         asynchronous data fetching operation from an external source.
+
+         Args:
+             executor: An instance of ThreadPoolExecutor to manage the asynchronous
+                 execution of data fetching.
+         """
+         self.future = executor.submit(
+             self._fetch_data_from_external_source,
+         )
+
+     def wait_for_external_result(self) -> None:
+         """Wait for the thread responsible for loading the external request to finish.
+
+         If there is no future to wait for, it logs a warning and returns immediately.
+         """
+         if not self.future:
+             logger.warning("No future to wait for.")
+             return
+         self.future.result()
+
+     def check_if_external_data_is_loaded(self) -> bool:
+         """Check if the thread getting the external data has finished running.
+
+         Returns:
+             True if the data fetching operation is complete, False otherwise.
+         """
+         if self.future:
+             return self.future.done()
+         return False
+
+     def retrieve_external_data(self) -> T | None:
+         """Retrieve the result of the data fetching operation.
+
+         This method checks if the asynchronous data fetching operation has
+         completed. If the operation is finished, it returns the result.
+         Otherwise, it returns None.
+
+         Returns:
+             The result of the data fetching operation if it is complete or None
+             if the operation has not yet finished.
+         """
+         if self.future:
+             return self.future.result()
+         return None
+
+     @abstractmethod
+     def _fetch_data_from_external_source(self) -> T | None:
+         """Handle external data retrieval.
+
+         Abstract method to be implemented in the subclass.
+         This method should define the logic for retrieving data from the specific
+         external source.
+
+         Returns:
+             The data retrieved from the external source.
+         """
+         raise NotImplementedError
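
To illustrate the intended subclassing pattern, a minimal sketch follows. The ExampleSource class, its static payload, and the single-worker executor are illustrative assumptions, not part of the package; it assumes GetExternalSource from the file above is in scope.

from concurrent.futures import ThreadPoolExecutor


class ExampleSource(GetExternalSource[dict]):
    """Hypothetical subclass that returns a static payload instead of calling a real service."""

    def _fetch_data_from_external_source(self) -> dict | None:
        # A real subclass would perform a network or API request here.
        return {"status": "ok"}


executor = ThreadPoolExecutor(max_workers=1)
source = ExampleSource(executor)   # __init__ submits _fetch_data_from_external_source immediately
source.wait_for_external_result()  # blocks until the future completes
if source.check_if_external_data_is_loaded():
    print(source.retrieve_external_data())  # {'status': 'ok'}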