data-validation-engine 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_validation_engine-0.6.2.dist-info/METADATA +104 -0
- data_validation_engine-0.6.2.dist-info/RECORD +105 -0
- data_validation_engine-0.6.2.dist-info/WHEEL +4 -0
- data_validation_engine-0.6.2.dist-info/licenses/LICENSE +21 -0
- dve/__init__.py +0 -0
- dve/common/__init__.py +0 -0
- dve/common/error_utils.py +189 -0
- dve/core_engine/__init__.py +0 -0
- dve/core_engine/backends/__init__.py +1 -0
- dve/core_engine/backends/base/__init__.py +1 -0
- dve/core_engine/backends/base/auditing.py +618 -0
- dve/core_engine/backends/base/backend.py +240 -0
- dve/core_engine/backends/base/contract.py +454 -0
- dve/core_engine/backends/base/core.py +124 -0
- dve/core_engine/backends/base/reader.py +176 -0
- dve/core_engine/backends/base/reference_data.py +217 -0
- dve/core_engine/backends/base/rules.py +685 -0
- dve/core_engine/backends/base/utilities.py +146 -0
- dve/core_engine/backends/exceptions.py +311 -0
- dve/core_engine/backends/implementations/__init__.py +1 -0
- dve/core_engine/backends/implementations/duckdb/__init__.py +26 -0
- dve/core_engine/backends/implementations/duckdb/auditing.py +234 -0
- dve/core_engine/backends/implementations/duckdb/contract.py +213 -0
- dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +288 -0
- dve/core_engine/backends/implementations/duckdb/readers/__init__.py +13 -0
- dve/core_engine/backends/implementations/duckdb/readers/csv.py +222 -0
- dve/core_engine/backends/implementations/duckdb/readers/json.py +50 -0
- dve/core_engine/backends/implementations/duckdb/readers/xml.py +45 -0
- dve/core_engine/backends/implementations/duckdb/reference_data.py +49 -0
- dve/core_engine/backends/implementations/duckdb/rules.py +534 -0
- dve/core_engine/backends/implementations/duckdb/types.py +47 -0
- dve/core_engine/backends/implementations/duckdb/utilities.py +41 -0
- dve/core_engine/backends/implementations/spark/__init__.py +22 -0
- dve/core_engine/backends/implementations/spark/auditing.py +230 -0
- dve/core_engine/backends/implementations/spark/backend.py +78 -0
- dve/core_engine/backends/implementations/spark/contract.py +241 -0
- dve/core_engine/backends/implementations/spark/readers/__init__.py +15 -0
- dve/core_engine/backends/implementations/spark/readers/csv.py +77 -0
- dve/core_engine/backends/implementations/spark/readers/json.py +66 -0
- dve/core_engine/backends/implementations/spark/readers/xml.py +202 -0
- dve/core_engine/backends/implementations/spark/reference_data.py +42 -0
- dve/core_engine/backends/implementations/spark/rules.py +430 -0
- dve/core_engine/backends/implementations/spark/spark_helpers.py +412 -0
- dve/core_engine/backends/implementations/spark/types.py +21 -0
- dve/core_engine/backends/implementations/spark/utilities.py +144 -0
- dve/core_engine/backends/metadata/__init__.py +47 -0
- dve/core_engine/backends/metadata/contract.py +80 -0
- dve/core_engine/backends/metadata/reporting.py +374 -0
- dve/core_engine/backends/metadata/rules.py +737 -0
- dve/core_engine/backends/readers/__init__.py +41 -0
- dve/core_engine/backends/readers/csv.py +232 -0
- dve/core_engine/backends/readers/utilities.py +21 -0
- dve/core_engine/backends/readers/xml.py +432 -0
- dve/core_engine/backends/readers/xml_linting.py +142 -0
- dve/core_engine/backends/types.py +26 -0
- dve/core_engine/backends/utilities.py +177 -0
- dve/core_engine/configuration/__init__.py +1 -0
- dve/core_engine/configuration/base.py +56 -0
- dve/core_engine/configuration/v1/__init__.py +351 -0
- dve/core_engine/configuration/v1/filters.py +60 -0
- dve/core_engine/configuration/v1/rule_stores/__init__.py +1 -0
- dve/core_engine/configuration/v1/rule_stores/models.py +57 -0
- dve/core_engine/configuration/v1/steps.py +365 -0
- dve/core_engine/constants.py +8 -0
- dve/core_engine/engine.py +265 -0
- dve/core_engine/exceptions.py +29 -0
- dve/core_engine/functions/__init__.py +6 -0
- dve/core_engine/functions/implementations.py +200 -0
- dve/core_engine/loggers.py +57 -0
- dve/core_engine/message.py +512 -0
- dve/core_engine/models.py +196 -0
- dve/core_engine/templating.py +114 -0
- dve/core_engine/type_hints.py +255 -0
- dve/core_engine/validation.py +160 -0
- dve/metadata_parser/__init__.py +2 -0
- dve/metadata_parser/domain_types.py +682 -0
- dve/metadata_parser/exc.py +44 -0
- dve/metadata_parser/function_library.py +64 -0
- dve/metadata_parser/function_wrapper.py +201 -0
- dve/metadata_parser/model_generator.py +119 -0
- dve/metadata_parser/models.py +410 -0
- dve/metadata_parser/utilities.py +54 -0
- dve/parser/__init__.py +1 -0
- dve/parser/exceptions.py +50 -0
- dve/parser/file_handling/__init__.py +31 -0
- dve/parser/file_handling/helpers.py +29 -0
- dve/parser/file_handling/implementations/__init__.py +7 -0
- dve/parser/file_handling/implementations/base.py +97 -0
- dve/parser/file_handling/implementations/dbfs.py +81 -0
- dve/parser/file_handling/implementations/file.py +203 -0
- dve/parser/file_handling/implementations/s3.py +371 -0
- dve/parser/file_handling/log_handler.py +215 -0
- dve/parser/file_handling/service.py +441 -0
- dve/parser/file_handling/utilities.py +53 -0
- dve/parser/type_hints.py +46 -0
- dve/parser/utilities.py +113 -0
- dve/pipeline/__init__.py +0 -0
- dve/pipeline/duckdb_pipeline.py +56 -0
- dve/pipeline/foundry_ddb_pipeline.py +171 -0
- dve/pipeline/pipeline.py +935 -0
- dve/pipeline/spark_pipeline.py +69 -0
- dve/pipeline/utils.py +96 -0
- dve/reporting/__init__.py +1 -0
- dve/reporting/error_report.py +153 -0
- dve/reporting/excel_report.py +319 -0
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
# pylint: disable=protected-access
|
|
2
|
+
# ignore: type[attr-defined]
|
|
3
|
+
|
|
4
|
+
"""Helper objects for duckdb data contract implementation"""
|
|
5
|
+
from collections.abc import Generator, Iterator
|
|
6
|
+
from dataclasses import is_dataclass
|
|
7
|
+
from datetime import date, datetime, time
|
|
8
|
+
from decimal import Decimal
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any, ClassVar, Union
|
|
11
|
+
from urllib.parse import urlparse
|
|
12
|
+
|
|
13
|
+
import duckdb.typing as ddbtyp
|
|
14
|
+
import numpy as np
|
|
15
|
+
from duckdb import DuckDBPyConnection, DuckDBPyRelation
|
|
16
|
+
from duckdb.typing import DuckDBPyType
|
|
17
|
+
from pandas import DataFrame
|
|
18
|
+
from pydantic import BaseModel
|
|
19
|
+
from typing_extensions import Annotated, get_args, get_origin, get_type_hints
|
|
20
|
+
|
|
21
|
+
from dve.core_engine.backends.base.utilities import _get_non_heterogenous_type
|
|
22
|
+
from dve.core_engine.type_hints import URI
|
|
23
|
+
from dve.parser.file_handling.service import LocalFilesystemImplementation, _get_implementation
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class DDBDecimal:
    """Renders a DuckDB DECIMAL type declaration with fixed width and scale.

    Instances stringify to e.g. ``DECIMAL(18,3)``; calling an instance returns
    the same text, mirroring how the other DDB* type helpers are used.
    """

    TYPE_TEXT = "DECIMAL"

    def __init__(self, width: int = 18, scale: int = 3):
        self._width = width
        self._scale = scale

    def __str__(self):
        return f"{DDBDecimal.TYPE_TEXT}({self._width},{self._scale})"

    def __call__(self):
        return str(self)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class DDBList:
    """Renders a DuckDB LIST type declaration (``<element type>[]``)."""

    TYPE_TEXT = "[]"

    def __init__(self, element_type: DuckDBPyType):
        self._element_type = element_type

    def __str__(self):
        return f"{self._element_type}{DDBList.TYPE_TEXT}"

    def __call__(self):
        return str(self)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class DDBStruct:
    """Renders a DuckDB STRUCT type declaration from named sub-elements."""

    TYPE_TEXT = "STRUCT"

    def __init__(self, sub_elements: dict[str, DuckDBPyType]):
        # Copy so later `add_element` calls never mutate the caller's mapping.
        self._sub_elements = dict(sub_elements)

    def add_element(self, field_name: str, data_type: DuckDBPyType):
        """Add another element to the struct"""
        self._sub_elements[field_name] = data_type

    def __str__(self):
        body = ", ".join(
            f"{name} {dtype}" for name, dtype in self._sub_elements.items()
        )
        return f"{DDBStruct.TYPE_TEXT}({body})"

    def __call__(self):
        return str(self)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# NOTE: `Decimal` maps to the *string* "DECIMAL(18,3)" (via `DDBDecimal()()`),
# not a `DuckDBPyType` instance, so the values here are a mix of both.
PYTHON_TYPE_TO_DUCKDB_TYPE: dict[type, DuckDBPyType] = {
    str: ddbtyp.VARCHAR,
    int: ddbtyp.BIGINT,
    bool: ddbtyp.BOOLEAN,
    # NOTE(review): Python floats are double precision; mapping to FLOAT
    # (single precision) narrows them - confirm this is intended.
    float: ddbtyp.FLOAT,
    bytes: ddbtyp.BLOB,
    date: ddbtyp.DATE,
    datetime: ddbtyp.TIMESTAMP,
    Decimal: DDBDecimal()(),  # default precision 18, scale 3
    time: ddbtyp.TIME,
}
"""A mapping of Python types to the equivalent DuckDB types."""
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def table_exists(connection: DuckDBPyConnection, table_name: str) -> bool:
    """Return True when `table_name` appears in SHOW TABLES for `connection`."""
    existing_tables = connection.sql("SHOW TABLES").fetchall()
    return any(row[0] == table_name for row in existing_tables)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def relation_is_empty(relation: DuckDBPyRelation) -> bool:
    """Check if a duckdb relation is empty.

    Bug fix: `count(...)` on a relation returns another *lazy* relation, which
    is always truthy as a Python object, so the previous
    `if relation.limit(1).count("*"):` test reported every relation as
    non-empty. The count must be materialised with `fetchone()` before it can
    be compared. `limit(1)` keeps the scan cheap - we only need to know
    whether at least one row exists.
    """
    (row_count,) = relation.limit(1).count("*").fetchone()  # type: ignore[misc]
    return row_count == 0
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def get_duckdb_type_from_annotation(type_annotation: Any) -> DuckDBPyType:
    """Get a duckdb type from a Python type annotation.

    Supported types are any of the following (this definition is recursive):
    - Supported basic Python types. These are:
      * `str`: VARCHAR
      * `int`: BIGINT
      * `bool`: BOOLEAN
      * `float`: FLOAT
      * `bytes`: BLOB
      * `datetime.date`: DATE
      * `datetime.datetime`: TIMESTAMP
      * `decimal.Decimal`: DECIMAL with precision of 18 and scale of 3
    - A list of supported types (e.g. `List[str]` or `typing.List[str]`).
      This will return a duckdb LIST type (variable length)
    - A `typing.Optional` type or a `typing.Union` of the type and `None` (e.g.
      `typing.Optional[str]`, `typing.Union[List[str], None]`). This will remove the
      'optional' wrapper and return the inner type
    - A subclass of `typing.TypedDict` with values typed using supported types. This
      will parse the value types as DuckDB types and return a duckdb STRUCT.
    - A dataclass or `pydantic.main.ModelMetaClass` with values typed using supported
      types. This will parse the field types as DuckDB types and return a duckdb STRUCT.
    - Any supported type, with a `typing_extensions.Annotated` wrapper.

    Any `ClassVar` types within `TypedDict`s, dataclasses, or `pydantic` models will be
    ignored.

    Raises:
        ValueError: for non-type annotations, bare `list`/`dict`, dict/dataclass
            types with no usable field annotations, or types with no DuckDB
            equivalent.
    """
    type_origin = get_origin(type_annotation)

    # An `Optional` or `Union` type, check to ensure non-heterogeneity.
    if type_origin is Union:
        python_type = _get_non_heterogenous_type(get_args(type_annotation))
        return get_duckdb_type_from_annotation(python_type)

    # Type hint is e.g. `List[str]`, check to ensure non-heterogeneity.
    if type_origin is list or (isinstance(type_origin, type) and issubclass(type_origin, list)):
        element_type = _get_non_heterogenous_type(get_args(type_annotation))
        return DDBList(get_duckdb_type_from_annotation(element_type))()

    # Unwrap `Annotated[...]` and recurse on the underlying type; metadata args
    # are discarded.
    if type_origin is Annotated:
        python_type, *other_args = get_args(type_annotation)  # pylint: disable=unused-variable
        return get_duckdb_type_from_annotation(python_type)
    # Ensure that we have a concrete type at this point.
    if not isinstance(type_annotation, type):
        raise ValueError(f"Unsupported type annotation {type_annotation!r}")

    if (
        # Type hint is a dict subclass, but not dict. Possibly a `TypedDict`.
        (issubclass(type_annotation, dict) and type_annotation is not dict)
        # Type hint is a dataclass.
        or is_dataclass(type_annotation)
        # Type hint is a `pydantic` model.
        or (type_origin is None and issubclass(type_annotation, BaseModel))
    ):
        fields: dict[str, DuckDBPyType] = {}
        for field_name, field_annotation in get_type_hints(type_annotation).items():
            # Technically non-string keys are disallowed, but people are bad.
            if not isinstance(field_name, str):
                raise ValueError(
                    f"Dictionary/Dataclass keys must be strings, got {type_annotation!r}"
                )  # pragma: no cover
            # `ClassVar` fields are class-level constants, not data columns.
            if get_origin(field_annotation) is ClassVar:
                continue

            fields[field_name] = get_duckdb_type_from_annotation(field_annotation)

        if not fields:
            raise ValueError(
                f"No type annotations in dict/dataclass type (got {type_annotation!r})"
            )

        return DDBStruct(fields)()

    if type_annotation is list:
        raise ValueError(
            f"List must have type annotation (e.g. `List[str]`), got {type_annotation!r}"
        )
    if type_annotation is dict or type_origin is dict:
        raise ValueError(f"dict must be `typing.TypedDict` subclass, got {type_annotation!r}")

    # Walk the MRO so subclasses of supported types (e.g. a class deriving from
    # `int` or `str`) resolve to their base type's DuckDB mapping.
    for type_ in type_annotation.mro():
        duck_type = PYTHON_TYPE_TO_DUCKDB_TYPE.get(type_)
        if duck_type:
            return duck_type
    raise ValueError(f"No equivalent DuckDB type for {type_annotation!r}")
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def coerce_inferred_numpy_array_to_list(pandas_df: DataFrame) -> DataFrame:
    """Convert numpy-array cells (inferred when converting from a duckdb
    relation to a pandas dataframe) into plain Python lists - these cause
    issues with pydantic models (i.e. a numpy array can't be validated as a
    list). NaN values are normalised to ``None`` first.

    Fixes over the original:
    - an empty dataframe no longer raises ``IndexError`` from ``iloc[0]``;
    - the sample cell is the first *non-null* value in each column, so an
      array-typed column whose first row is None is still converted.

    Args:
        pandas_df (DataFrame): The dataframe to type check and convert where needed

    Returns:
        DataFrame: the dataframe with array columns converted to list columns.
    """
    pandas_df = pandas_df.replace({np.nan: None})
    if pandas_df.empty:
        # No rows to inspect - nothing to coerce.
        return pandas_df
    for col in pandas_df.columns:
        sample = next((val for val in pandas_df[col] if val is not None), None)
        if isinstance(sample, np.ndarray):
            pandas_df[col] = pandas_df[col].apply(lambda x: x.tolist() if x is not None else x)
    return pandas_df
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _ddb_read_parquet(
    self, path: URI, **kwargs  # pylint: disable=unused-argument
) -> DuckDBPyRelation:
    """Read entity from a parquet file. Due to different behaviours in writing parquet
    files to single files / multiple files in a directory and duckdb being inflexible,
    checks whether a directory was supplied (in which case points at the parquet files
    within) or a single file, in which case retains supplied path.

    Intended to be attached to a class by the `duckdb_read_parquet` decorator,
    hence the `self` parameter; reads `self._connection`.
    """
    # Local URIs are stripped to a plain filesystem path so `Path` checks work.
    if isinstance(_get_implementation(path), LocalFilesystemImplementation):
        path = urlparse(path).path

    # NOTE(review): for non-local URIs `Path(path).is_dir()` is False, so remote
    # paths always take the single-file branch - confirm remote directories of
    # parquet files are never expected here.
    if Path(path).is_dir():
        if not path.endswith("/"):
            path += "/"
        # is a directory - edit glob to include all parquet files within it
        return self._connection.read_parquet(file_glob=f"{path}*.parquet")
    return self._connection.read_parquet(file_glob=path)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def _ddb_write_parquet(  # pylint: disable=unused-argument
    self, entity: Union[Iterator[dict[str, Any]], DuckDBPyRelation], target_location: URI, **kwargs
) -> URI:
    """Method to write parquet files from type cast entities
    following data contract application.

    Intended to be attached to a class by the `duckdb_write_parquet` decorator,
    hence the `self` parameter; uses `self._connection`.

    Returns:
        The `target_location` the parquet file was written to.
    """
    # For local targets, make sure the parent directory exists before writing.
    if isinstance(_get_implementation(target_location), LocalFilesystemImplementation):
        Path(target_location).parent.mkdir(parents=True, exist_ok=True)

    # Materialise generator input into a relation via `unnest` so the single
    # `to_parquet` call below handles both input forms.
    # NOTE(review): this checks `Generator`, not `Iterator` - a non-generator
    # iterator of dicts would fall through to `to_parquet` directly; confirm
    # callers only ever pass generators or relations.
    if isinstance(entity, Generator):
        entity = self._connection.query(
            "select dta.* from (select unnest($data) as dta)", params={"data": list(entity)}
        )

    entity.to_parquet(file_name=target_location, compression="snappy", **kwargs)  # type: ignore
    return target_location
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def duckdb_read_parquet(cls):
    """Class decorator to add read_parquet method for duckdb implementations"""
    setattr(cls, "read_parquet", _ddb_read_parquet)
    return cls
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def duckdb_write_parquet(cls):
    """Class decorator to add write_parquet method for duckdb implementations"""
    setattr(cls, "write_parquet", _ddb_write_parquet)
    return cls
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
# Wrapped in `staticmethod` at module level so that, when assigned onto a class
# by `duckdb_get_entity_count`, attribute access yields the plain function and
# `entity` is not bound as `self`.
@staticmethod  # type: ignore
def _duckdb_get_entity_count(entity: DuckDBPyRelation) -> int:
    """Method to obtain entity count from a persisted parquet entity"""
    return entity.shape[0]
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def duckdb_get_entity_count(cls):
    """Class decorator to count records in an entity supplied"""
    setattr(cls, "get_entity_count", _duckdb_get_entity_count)
    return cls
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def get_all_registered_udfs(connection: DuckDBPyConnection) -> set[str]:
    """Return the names of the registered functions stored on the supplied
    duckdb connection, creating the temp tracking table first if it does not
    already exist.
    """
    connection.sql("CREATE TEMP TABLE IF NOT EXISTS dve_udfs (function_name VARCHAR)")
    registered = connection.sql("SELECT * FROM dve_udfs").fetchall()
    return set(row[0] for row in registered)
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def duckdb_rel_to_dictionaries(
    entity: DuckDBPyRelation, batch_size=1000
) -> Iterator[dict[str, Any]]:
    """Iterator converting DuckDBPyRelation to lists of dictionaries.
    Avoids issues where dates are getting converted to datetimes using polars as intermediate."""
    # TODO - look into float conversion - floats that can't be stored exactly in binary
    # TODO - are given to nearest approximation. Tried Decimal, causes issues in arrays
    # TODO - with templating (as in complex fields, repr used when str called in jinja templating).
    column_names: tuple[str] = tuple(entity.columns)  # type: ignore
    batch = entity.fetchmany(batch_size)
    while batch:
        for record in batch:
            yield dict(zip(column_names, record))
        batch = entity.fetchmany(batch_size)
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Readers for use with duckdb backend"""
|
|
2
|
+
|
|
3
|
+
from .csv import DuckDBCSVReader, DuckDBCSVRepeatingHeaderReader, PolarsToDuckDBCSVReader
|
|
4
|
+
from .json import DuckDBJSONReader
|
|
5
|
+
from .xml import DuckDBXMLStreamReader
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"DuckDBCSVReader",
|
|
9
|
+
"DuckDBCSVRepeatingHeaderReader",
|
|
10
|
+
"DuckDBJSONReader",
|
|
11
|
+
"DuckDBXMLStreamReader",
|
|
12
|
+
"PolarsToDuckDBCSVReader",
|
|
13
|
+
]
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
"""A csv reader to create duckdb relations"""
|
|
2
|
+
|
|
3
|
+
# pylint: disable=arguments-differ
|
|
4
|
+
from collections.abc import Iterator
|
|
5
|
+
from typing import Any, Optional
|
|
6
|
+
|
|
7
|
+
import duckdb as ddb
|
|
8
|
+
import polars as pl
|
|
9
|
+
from duckdb import DuckDBPyConnection, DuckDBPyRelation, default_connection, read_csv
|
|
10
|
+
from pydantic import BaseModel
|
|
11
|
+
|
|
12
|
+
from dve.core_engine.backends.base.reader import BaseFileReader, read_function
|
|
13
|
+
from dve.core_engine.backends.exceptions import EmptyFileError, MessageBearingError
|
|
14
|
+
from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import (
|
|
15
|
+
duckdb_write_parquet,
|
|
16
|
+
get_duckdb_type_from_annotation,
|
|
17
|
+
)
|
|
18
|
+
from dve.core_engine.backends.implementations.duckdb.types import SQLType
|
|
19
|
+
from dve.core_engine.backends.readers.utilities import check_csv_header_expected
|
|
20
|
+
from dve.core_engine.backends.utilities import get_polars_type_from_annotation
|
|
21
|
+
from dve.core_engine.message import FeedbackMessage
|
|
22
|
+
from dve.core_engine.type_hints import URI, EntityName
|
|
23
|
+
from dve.parser.file_handling import get_content_length
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@duckdb_write_parquet
class DuckDBCSVReader(BaseFileReader):
    """A reader for CSV files including the ability to compare the passed model
    to the file header, if it exists.

    Attributes:
        field_check: flag to compare submitted file header to the accompanying
            pydantic model
        field_check_error_code: The error code to provide if the file header
            doesn't contain the expected fields
        field_check_error_message: The error message to provide if the file
            header doesn't contain the expected fields
    """

    # TODO - the read_to_relation should include the schema and determine whether to
    # TODO - stringify or not
    def __init__(
        self,
        *,
        header: bool = True,
        delim: str = ",",
        quotechar: str = '"',
        connection: Optional[DuckDBPyConnection] = None,
        field_check: bool = False,
        field_check_error_code: Optional[str] = "ExpectedVsActualFieldMismatch",
        field_check_error_message: Optional[str] = "The submitted header is missing fields",
        **_,
    ):
        # `**_` deliberately swallows unrecognised keyword arguments so a shared
        # reader-configuration mapping can be passed to any reader type.
        self.header = header
        self.delim = delim
        self.quotechar = quotechar
        # Fall back to duckdb's module-level default connection when none given.
        self._connection = connection if connection else default_connection
        self.field_check = field_check
        self.field_check_error_code = field_check_error_code
        self.field_check_error_message = field_check_error_message

        super().__init__()

    def perform_field_check(
        self, resource: URI, entity_name: str, expected_schema: type[BaseModel]
    ):
        """Check that the header of the CSV aligns with the provided model.

        Raises:
            ValueError: if the reader was configured with `header=False`.
            MessageBearingError: carrying a `FeedbackMessage` naming the missing
                fields, if the header doesn't contain all expected fields.
        """
        if not self.header:
            raise ValueError("Cannot perform field check without a CSV header")

        if missing := check_csv_header_expected(resource, expected_schema, self.delim):
            raise MessageBearingError(
                "The CSV header doesn't match what is expected",
                messages=[
                    FeedbackMessage(
                        entity=entity_name,
                        record=None,
                        failure_type="submission",
                        error_location="Whole File",
                        error_code=self.field_check_error_code,
                        error_message=f"{self.field_check_error_message} - missing fields: {missing}",  # pylint: disable=line-too-long
                    )
                ],
            )

    def read_to_py_iterator(
        self, resource: URI, entity_name: EntityName, schema: type[BaseModel]
    ) -> Iterator[dict[str, Any]]:
        """Creates an iterable object of rows as dictionaries"""
        # Round-trips through polars to obtain named-row iteration.
        yield from self.read_to_relation(resource, entity_name, schema).pl().iter_rows(named=True)

    @read_function(DuckDBPyRelation)
    def read_to_relation(  # pylint: disable=unused-argument
        self, resource: URI, entity_name: EntityName, schema: type[BaseModel]
    ) -> DuckDBPyRelation:
        """Returns a relation object from the source csv.

        Raises:
            EmptyFileError: if the resource has zero content length.
        """
        if get_content_length(resource) == 0:
            raise EmptyFileError(f"File at {resource} is empty.")

        if self.field_check:
            self.perform_field_check(resource, entity_name, schema)

        reader_options: dict[str, Any] = {
            "header": self.header,
            "delimiter": self.delim,
            "quotechar": self.quotechar,
        }

        # Build an explicit column -> SQL type mapping from the pydantic model
        # so duckdb doesn't have to infer column types from the data.
        ddb_schema: dict[str, SQLType] = {
            fld.name: str(get_duckdb_type_from_annotation(fld.annotation))  # type: ignore
            for fld in schema.__fields__.values()
        }

        reader_options["columns"] = ddb_schema
        return read_csv(resource, **reader_options)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class PolarsToDuckDBCSVReader(DuckDBCSVReader):
    """
    Utilises the polars lazy csv reader which is then converted into a DuckDBPyRelation object.

    The primary reason this reader exists is due to the limitation within duckdb csv reader and
    it not being able to read partial content from a csv (i.e. select a, b NOT y).
    """

    @read_function(DuckDBPyRelation)
    def read_to_relation(  # pylint: disable=unused-argument
        self, resource: URI, entity_name: EntityName, schema: type[BaseModel]
    ) -> DuckDBPyRelation:
        """Returns a relation object from the source csv.

        Raises:
            EmptyFileError: if the resource has zero content length.
        """
        if get_content_length(resource) == 0:
            raise EmptyFileError(f"File at {resource} is empty.")

        if self.field_check:
            self.perform_field_check(resource, entity_name, schema)

        reader_options: dict[str, Any] = {
            "has_header": self.header,
            "separator": self.delim,
            "quote_char": self.quotechar,
        }

        # Type each column from the pydantic model rather than letting polars
        # infer, and select only the model's columns (the partial read the
        # plain duckdb reader cannot do).
        polars_types = {
            fld.name: get_polars_type_from_annotation(fld.annotation)  # type: ignore
            for fld in schema.__fields__.values()
        }
        reader_options["dtypes"] = polars_types

        # there is a raise_if_empty arg for 0.18+. Future reference when upgrading. Makes L85
        # redundant
        # `df` is referenced by name inside the SQL below (duckdb resolves local
        # frames by variable name), hence the unused-variable disable.
        df = pl.scan_csv(resource, **reader_options).select(list(polars_types.keys()))  # type: ignore # pylint: disable=W0612

        return ddb.sql("SELECT * FROM df")
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
class DuckDBCSVRepeatingHeaderReader(PolarsToDuckDBCSVReader):
    """A Reader for files with a `.csv` extension and where there are repeating "header" values
    within the file. Header in this case is not the column names at the top of a csv, rather a
    collection of unique records that would usually be structured in another entity. However, due
    to the fact that `csv` is a semi-structured data format, you cannot define complex entities,
    hence the values are then repeated on all rows.

    Example of a repeating header data may look like this...

    | headerCol1 | headerCol2 | headerCol3 | nonHeaderCol1 | nonHeaderCol2 |
    | ---------- | ---------- | ---------- | ------------- | ------------- |
    | shop 1     | clothes    | 2025-01-01 | jeans         | 20.39         |
    | shop 1     | clothes    | 2025-01-01 | shirt         | 14.99         |

    This reader will just pull out the distinct values from the header column. Where there are
    more/less than one distinct value per column, the reader will produce a
    `NonDistinctHeaderError`.

    So using the example above, the expected entity would look like this...
    | headerCol1 | headerCol2 | headerCol3 |
    | ---------- | ---------- | ---------- |
    | shop1      | clothes    | 2025-01-01 |
    """

    def __init__(
        self,
        *,
        non_unique_header_error_code: Optional[str] = "NonUniqueHeader",
        non_unique_header_error_message: Optional[str] = None,
        **kwargs,
    ):
        # Optional overrides for the error emitted when header values differ;
        # a None message falls back to a generated count-based message.
        self._non_unique_header_code = non_unique_header_error_code
        self._non_unique_header_message = non_unique_header_error_message
        super().__init__(**kwargs)

    @read_function(DuckDBPyRelation)
    def read_to_relation(  # pylint: disable=unused-argument
        self, resource: URI, entity_name: EntityName, schema: type[BaseModel]
    ) -> DuckDBPyRelation:
        """Read the header columns and deduplicate down to a single row.

        Raises:
            MessageBearingError: if the distinct header rows number anything
                other than exactly one.
        """
        entity = super().read_to_relation(resource=resource, entity_name=entity_name, schema=schema)
        entity = entity.distinct()
        no_records = entity.shape[0]

        if no_records != 1:
            rows = entity.pl().to_dicts()
            # Transpose the distinct rows: for each column, gather its values
            # across rows and report the columns that were not constant, so
            # the feedback pinpoints exactly which fields differ.
            differing_values = [
                f"{key}: {', '.join(sorted(str(val) for val in values))}"
                for key, *values in zip(rows[0], *map(dict.values, rows))  # type: ignore
                if len(set(values)) > 1
            ]
            raise MessageBearingError(
                "More than one set of Headers found in CSV file",
                messages=[
                    FeedbackMessage(
                        record={entity_name: differing_values},
                        entity="Pre-validation",
                        failure_type="submission",
                        error_message=(
                            f"Found {no_records} distinct combination of header values."
                            if not self._non_unique_header_message
                            else self._non_unique_header_message
                        ),
                        error_location=entity_name,
                        category="Bad file",
                        error_code=self._non_unique_header_code,
                    )
                ],
            )

        return entity
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""A json reader to create duckdb relations"""
|
|
2
|
+
|
|
3
|
+
# pylint: disable=arguments-differ
|
|
4
|
+
from collections.abc import Iterator
|
|
5
|
+
from typing import Any, Optional
|
|
6
|
+
|
|
7
|
+
from duckdb import DuckDBPyRelation, read_json
|
|
8
|
+
from pydantic import BaseModel
|
|
9
|
+
|
|
10
|
+
from dve.core_engine.backends.base.reader import BaseFileReader, read_function
|
|
11
|
+
from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import (
|
|
12
|
+
duckdb_write_parquet,
|
|
13
|
+
get_duckdb_type_from_annotation,
|
|
14
|
+
)
|
|
15
|
+
from dve.core_engine.backends.implementations.duckdb.types import SQLType
|
|
16
|
+
from dve.core_engine.type_hints import URI, EntityName
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@duckdb_write_parquet
class DuckDBJSONReader(BaseFileReader):
    """A reader for JSON files.

    Attributes:
        json_format: value passed to duckdb `read_json`'s `format` option
            (defaults to "array").
    """

    def __init__(
        self,
        *,
        json_format: Optional[str] = "array",
        **_,
    ):
        # `**_` deliberately swallows unrecognised keyword arguments so a shared
        # reader-configuration mapping can be passed to any reader type.
        self._json_format = json_format

        super().__init__()

    def read_to_py_iterator(
        self, resource: URI, entity_name: EntityName, schema: type[BaseModel]
    ) -> Iterator[dict[str, Any]]:
        """Creates an iterable object of rows as dictionaries"""
        # Round-trips through polars to obtain named-row iteration.
        return self.read_to_relation(resource, entity_name, schema).pl().iter_rows(named=True)

    @read_function(DuckDBPyRelation)
    def read_to_relation(  # pylint: disable=unused-argument
        self, resource: URI, entity_name: EntityName, schema: type[BaseModel]
    ) -> DuckDBPyRelation:
        """Returns a relation object from the source json"""

        # Build an explicit column -> SQL type mapping from the pydantic model
        # so duckdb doesn't have to infer column types from the document.
        ddb_schema: dict[str, SQLType] = {
            fld.name: str(get_duckdb_type_from_annotation(fld.annotation))  # type: ignore
            for fld in schema.__fields__.values()
        }

        return read_json(resource, columns=ddb_schema, format=self._json_format)  # type: ignore
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# mypy: disable-error-code="attr-defined"
|
|
2
|
+
"""An xml reader to create duckdb relations"""
|
|
3
|
+
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
import polars as pl
|
|
7
|
+
from duckdb import DuckDBPyConnection, DuckDBPyRelation, default_connection
|
|
8
|
+
from pydantic import BaseModel
|
|
9
|
+
|
|
10
|
+
from dve.core_engine.backends.base.reader import read_function
|
|
11
|
+
from dve.core_engine.backends.exceptions import MessageBearingError
|
|
12
|
+
from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import duckdb_write_parquet
|
|
13
|
+
from dve.core_engine.backends.readers.xml import XMLStreamReader
|
|
14
|
+
from dve.core_engine.backends.utilities import get_polars_type_from_annotation, stringify_model
|
|
15
|
+
from dve.core_engine.type_hints import URI
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@duckdb_write_parquet
class DuckDBXMLStreamReader(XMLStreamReader):
    """Streams XML files into DuckDB relations via a polars LazyFrame."""

    def __init__(self, *, ddb_connection: Optional[DuckDBPyConnection] = None, **kwargs):
        # Fall back to duckdb's module-level default connection when the
        # caller does not supply one.
        # NOTE(review): newer duckdb releases expose `default_connection` as a
        # function rather than a connection object — confirm this matches the
        # pinned duckdb version before upgrading.
        self.ddb_connection = ddb_connection or default_connection
        super().__init__(**kwargs)

    @read_function(DuckDBPyRelation)
    def read_to_relation(self, resource: URI, entity_name: str, schema: type[BaseModel]):
        """Returns a relation object from the source xml"""
        # Optional XSD validation: delegate to xmllint and surface any
        # diagnostics through a message-bearing error.
        if self.xsd_location:
            xsd_failure = self._run_xmllint(file_uri=resource)
            if xsd_failure:
                raise MessageBearingError(
                    "Submitted file failed XSD validation.",
                    messages=[xsd_failure],
                )

        # Map each field of the stringified model to its polars dtype.
        polars_schema: dict[str, pl.DataType] = {}  # type: ignore
        for fld in stringify_model(schema).__fields__.values():
            polars_schema[fld.name] = get_polars_type_from_annotation(fld.annotation)

        # The local variable name `_lazy_frame` is load-bearing: DuckDB's
        # replacement scan resolves it from this frame when running the query
        # below, so it must match the table name in the SQL string exactly.
        _lazy_frame = pl.LazyFrame(
            data=self.read_to_py_iterator(resource, entity_name, schema), schema=polars_schema
        )
        return self.ddb_connection.sql("select * from _lazy_frame")
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""A reference data loader for duckdb."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from duckdb import DuckDBPyConnection, DuckDBPyRelation
|
|
6
|
+
from pyarrow import ipc # type: ignore
|
|
7
|
+
|
|
8
|
+
from dve.core_engine.backends.base.reference_data import (
|
|
9
|
+
BaseRefDataLoader,
|
|
10
|
+
ReferenceConfigUnion,
|
|
11
|
+
ReferenceTable,
|
|
12
|
+
mark_refdata_file_extension,
|
|
13
|
+
)
|
|
14
|
+
from dve.core_engine.type_hints import EntityName
|
|
15
|
+
from dve.parser.type_hints import URI
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# pylint: disable=too-few-public-methods
class DuckDBRefDataLoader(BaseRefDataLoader[DuckDBPyRelation]):
    """A reference data loader using already existing DuckDB tables."""

    connection: DuckDBPyConnection
    """The DuckDB connection for the backend."""
    dataset_config_uri: Optional[URI] = None
    """The location of the dischema file"""

    def __init__(
        self,
        reference_entity_config: dict[EntityName, ReferenceConfigUnion],
        **kwargs,
    ) -> None:
        super().__init__(reference_entity_config, self.dataset_config_uri, **kwargs)

        # `connection` is only annotated on the class with no value, so
        # reading `self.connection` directly would raise a generic
        # AttributeError before the descriptive message below could ever
        # fire. getattr with a default lets the intended error be raised
        # while preserving the original falsy-connection behaviour.
        if not getattr(self, "connection", None):
            raise AttributeError("DuckDBConnection must be specified")

    def load_table(self, config: ReferenceTable) -> DuckDBPyRelation:
        """Load reference entity from a database table"""
        return self.connection.sql(f"select * from {config.fq_table_name}")

    @mark_refdata_file_extension("parquet")
    def load_parquet_file(self, uri: str) -> DuckDBPyRelation:
        """Load a parquet file into a duckdb relation"""
        return self.connection.read_parquet(uri)

    @mark_refdata_file_extension("arrow")
    def load_arrow_file(self, uri: str) -> DuckDBPyRelation:
        """Load an arrow ipc file into a duckdb relation"""
        # Materialize the full IPC stream as a pyarrow Table, then expose it
        # to DuckDB as a relation.
        return self.connection.from_arrow(ipc.open_stream(uri).read_all())  # type:ignore
|