data-validation-engine 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_validation_engine-0.6.2.dist-info/METADATA +104 -0
- data_validation_engine-0.6.2.dist-info/RECORD +105 -0
- data_validation_engine-0.6.2.dist-info/WHEEL +4 -0
- data_validation_engine-0.6.2.dist-info/licenses/LICENSE +21 -0
- dve/__init__.py +0 -0
- dve/common/__init__.py +0 -0
- dve/common/error_utils.py +189 -0
- dve/core_engine/__init__.py +0 -0
- dve/core_engine/backends/__init__.py +1 -0
- dve/core_engine/backends/base/__init__.py +1 -0
- dve/core_engine/backends/base/auditing.py +618 -0
- dve/core_engine/backends/base/backend.py +240 -0
- dve/core_engine/backends/base/contract.py +454 -0
- dve/core_engine/backends/base/core.py +124 -0
- dve/core_engine/backends/base/reader.py +176 -0
- dve/core_engine/backends/base/reference_data.py +217 -0
- dve/core_engine/backends/base/rules.py +685 -0
- dve/core_engine/backends/base/utilities.py +146 -0
- dve/core_engine/backends/exceptions.py +311 -0
- dve/core_engine/backends/implementations/__init__.py +1 -0
- dve/core_engine/backends/implementations/duckdb/__init__.py +26 -0
- dve/core_engine/backends/implementations/duckdb/auditing.py +234 -0
- dve/core_engine/backends/implementations/duckdb/contract.py +213 -0
- dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +288 -0
- dve/core_engine/backends/implementations/duckdb/readers/__init__.py +13 -0
- dve/core_engine/backends/implementations/duckdb/readers/csv.py +222 -0
- dve/core_engine/backends/implementations/duckdb/readers/json.py +50 -0
- dve/core_engine/backends/implementations/duckdb/readers/xml.py +45 -0
- dve/core_engine/backends/implementations/duckdb/reference_data.py +49 -0
- dve/core_engine/backends/implementations/duckdb/rules.py +534 -0
- dve/core_engine/backends/implementations/duckdb/types.py +47 -0
- dve/core_engine/backends/implementations/duckdb/utilities.py +41 -0
- dve/core_engine/backends/implementations/spark/__init__.py +22 -0
- dve/core_engine/backends/implementations/spark/auditing.py +230 -0
- dve/core_engine/backends/implementations/spark/backend.py +78 -0
- dve/core_engine/backends/implementations/spark/contract.py +241 -0
- dve/core_engine/backends/implementations/spark/readers/__init__.py +15 -0
- dve/core_engine/backends/implementations/spark/readers/csv.py +77 -0
- dve/core_engine/backends/implementations/spark/readers/json.py +66 -0
- dve/core_engine/backends/implementations/spark/readers/xml.py +202 -0
- dve/core_engine/backends/implementations/spark/reference_data.py +42 -0
- dve/core_engine/backends/implementations/spark/rules.py +430 -0
- dve/core_engine/backends/implementations/spark/spark_helpers.py +412 -0
- dve/core_engine/backends/implementations/spark/types.py +21 -0
- dve/core_engine/backends/implementations/spark/utilities.py +144 -0
- dve/core_engine/backends/metadata/__init__.py +47 -0
- dve/core_engine/backends/metadata/contract.py +80 -0
- dve/core_engine/backends/metadata/reporting.py +374 -0
- dve/core_engine/backends/metadata/rules.py +737 -0
- dve/core_engine/backends/readers/__init__.py +41 -0
- dve/core_engine/backends/readers/csv.py +232 -0
- dve/core_engine/backends/readers/utilities.py +21 -0
- dve/core_engine/backends/readers/xml.py +432 -0
- dve/core_engine/backends/readers/xml_linting.py +142 -0
- dve/core_engine/backends/types.py +26 -0
- dve/core_engine/backends/utilities.py +177 -0
- dve/core_engine/configuration/__init__.py +1 -0
- dve/core_engine/configuration/base.py +56 -0
- dve/core_engine/configuration/v1/__init__.py +351 -0
- dve/core_engine/configuration/v1/filters.py +60 -0
- dve/core_engine/configuration/v1/rule_stores/__init__.py +1 -0
- dve/core_engine/configuration/v1/rule_stores/models.py +57 -0
- dve/core_engine/configuration/v1/steps.py +365 -0
- dve/core_engine/constants.py +8 -0
- dve/core_engine/engine.py +265 -0
- dve/core_engine/exceptions.py +29 -0
- dve/core_engine/functions/__init__.py +6 -0
- dve/core_engine/functions/implementations.py +200 -0
- dve/core_engine/loggers.py +57 -0
- dve/core_engine/message.py +512 -0
- dve/core_engine/models.py +196 -0
- dve/core_engine/templating.py +114 -0
- dve/core_engine/type_hints.py +255 -0
- dve/core_engine/validation.py +160 -0
- dve/metadata_parser/__init__.py +2 -0
- dve/metadata_parser/domain_types.py +682 -0
- dve/metadata_parser/exc.py +44 -0
- dve/metadata_parser/function_library.py +64 -0
- dve/metadata_parser/function_wrapper.py +201 -0
- dve/metadata_parser/model_generator.py +119 -0
- dve/metadata_parser/models.py +410 -0
- dve/metadata_parser/utilities.py +54 -0
- dve/parser/__init__.py +1 -0
- dve/parser/exceptions.py +50 -0
- dve/parser/file_handling/__init__.py +31 -0
- dve/parser/file_handling/helpers.py +29 -0
- dve/parser/file_handling/implementations/__init__.py +7 -0
- dve/parser/file_handling/implementations/base.py +97 -0
- dve/parser/file_handling/implementations/dbfs.py +81 -0
- dve/parser/file_handling/implementations/file.py +203 -0
- dve/parser/file_handling/implementations/s3.py +371 -0
- dve/parser/file_handling/log_handler.py +215 -0
- dve/parser/file_handling/service.py +441 -0
- dve/parser/file_handling/utilities.py +53 -0
- dve/parser/type_hints.py +46 -0
- dve/parser/utilities.py +113 -0
- dve/pipeline/__init__.py +0 -0
- dve/pipeline/duckdb_pipeline.py +56 -0
- dve/pipeline/foundry_ddb_pipeline.py +171 -0
- dve/pipeline/pipeline.py +935 -0
- dve/pipeline/spark_pipeline.py +69 -0
- dve/pipeline/utils.py +96 -0
- dve/reporting/__init__.py +1 -0
- dve/reporting/error_report.py +153 -0
- dve/reporting/excel_report.py +319 -0
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
"""Auditing definitions for duckdb backend"""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
from typing import Any, Optional, Union
|
|
5
|
+
|
|
6
|
+
import polars as pl
|
|
7
|
+
from duckdb import ColumnExpression, DuckDBPyConnection, DuckDBPyRelation, StarExpression, connect
|
|
8
|
+
from polars.datatypes.classes import DataTypeClass as PolarsType
|
|
9
|
+
|
|
10
|
+
from dve.core_engine.backends.base.auditing import (
|
|
11
|
+
BaseAuditingManager,
|
|
12
|
+
BaseAuditor,
|
|
13
|
+
FilterCriteria,
|
|
14
|
+
OrderCriteria,
|
|
15
|
+
)
|
|
16
|
+
from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import (
|
|
17
|
+
PYTHON_TYPE_TO_DUCKDB_TYPE,
|
|
18
|
+
table_exists,
|
|
19
|
+
)
|
|
20
|
+
from dve.core_engine.backends.utilities import PYTHON_TYPE_TO_POLARS_TYPE
|
|
21
|
+
from dve.core_engine.models import (
|
|
22
|
+
AuditRecord,
|
|
23
|
+
ProcessingStatusRecord,
|
|
24
|
+
SubmissionInfo,
|
|
25
|
+
SubmissionStatisticsRecord,
|
|
26
|
+
TransferRecord,
|
|
27
|
+
)
|
|
28
|
+
from dve.core_engine.type_hints import URI, ExecutorType
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class DDBAuditor(BaseAuditor[DuckDBPyRelation]):
    """An auditor implemented using the python duckdb package.

    Audit records are held in a duckdb table named ``self._name``; the table
    is created during construction when it does not already exist.
    """

    def __init__(
        self,
        record_type: type[AuditRecord],
        database_uri: URI,
        name: str,
        connection: Optional[DuckDBPyConnection] = None,
    ):
        """Set up the auditor and make sure its backing table exists.

        Args:
            record_type: The audit record model handled by this auditor
                (forwarded to the base class).
            database_uri: Location of the duckdb database. Only used to open
                a new connection when `connection` is not supplied.
            name: Name of the auditor (forwarded to the base class).
                NOTE(review): presumably the base class stores this as
                ``self._name``, which is used as the table name - confirm.
            connection: An existing duckdb connection to reuse. When omitted
                a single threaded read/write connection is opened.

        """
        self._db = database_uri
        self._connection: DuckDBPyConnection = (
            connection
            if connection is not None
            else connect(
                database=database_uri,
                config={
                    "access_mode": "READ_WRITE",
                    "default_null_order": "NULLS_LAST",
                    "threads": 1,
                },
            )
        )
        super().__init__(name=name, record_type=record_type)
        if not table_exists(self._connection, self._name):
            self._connection.sql(self.ddb_create_table_sql)

    @property
    def ddb_create_table_sql(self) -> str:
        """Generate create table sql script for auditor.

        One column is emitted per entry in ``self.schema``.
        NOTE(review): a python type missing from PYTHON_TYPE_TO_DUCKDB_TYPE is
        rendered as the literal ``None`` here, whereas `polars_schema` falls
        back to a string type - confirm every schema type is mapped.
        """
        columns = ", ".join(
            f"{fld} {PYTHON_TYPE_TO_DUCKDB_TYPE.get(dtype)}" for fld, dtype in self.schema.items()
        )
        return f"CREATE TABLE {self._name} ({columns})"

    @property
    def polars_schema(self) -> dict[str, PolarsType]:
        """Get polars dataframe schema for auditor.

        Unmapped python types fall back to ``pl.Utf8``.
        """
        return {
            fld: PYTHON_TYPE_TO_POLARS_TYPE.get(dtype, pl.Utf8)  # type: ignore
            for fld, dtype in self.schema.items()
        }

    def get_relation(self) -> DuckDBPyRelation:
        """Get a relation to interact with the auditor duckdb table"""
        return self._connection.table(self._name)

    def combine_filters(self, filter_criteria: list[FilterCriteria]) -> str:
        """Combine multiple filters into a single ``AND``-ed sql predicate"""
        return " AND ".join([self.normalise_filter(filt) for filt in filter_criteria])

    @staticmethod
    def normalise_field(field: str) -> ColumnExpression:  # type: ignore
        """Convert field to duckdb expression"""
        return ColumnExpression(field)

    @staticmethod
    def normalise_order(order_condition: OrderCriteria) -> str:
        """Convert order criteria to duckdb expression"""
        return order_condition.to_sql()

    @staticmethod
    def normalise_filter(filter_condition: FilterCriteria) -> str:
        """Convert filter criteria to duckdb expression"""
        return filter_condition.to_sql()

    def conv_to_records(self, recs: DuckDBPyRelation) -> Iterable[AuditRecord]:
        """Convert the relation to an iterable of the related audit record.

        The relation is materialised as a polars dataframe and each row is
        lazily turned into a ``self._record_type`` instance.
        """
        return (self._record_type(**rec) for rec in recs.pl().iter_rows(named=True))

    def conv_to_entity(self, recs: list[AuditRecord]) -> DuckDBPyRelation:
        """Convert a list of audit records to a relation"""
        # The local name `rec_df` is referenced by name inside the sql text
        # below, so it must not be renamed or removed.
        # pylint: disable=W0612
        rec_df = pl.DataFrame(  # type: ignore
            [rec.dict() for rec in recs],
            schema=self.polars_schema,
        )
        return self._connection.sql("select * from rec_df")

    def add_records(self, records: Iterable[dict[str, Any]]) -> None:
        """Add records to the underlying duckdb table.

        Args:
            records: Row dictionaries, loaded into a polars dataframe using
                `polars_schema` and then inserted column by column.

        """
        # The local name `data_pl_df` is referenced by name inside the sql
        # text below, so it must not be renamed or removed.
        # pylint: disable=W0612
        data_pl_df = pl.DataFrame(  # type: ignore
            records,
            schema=self.polars_schema,
        )
        columns = ", ".join(self.polars_schema)
        self._connection.sql(
            f"INSERT INTO {self._name} ({columns}) SELECT {columns} from data_pl_df"
        )

    def retrieve_records(
        self,
        filter_criteria: Optional[list[FilterCriteria]] = None,
        data: Optional[DuckDBPyRelation] = None,
    ) -> DuckDBPyRelation:
        """Get records from the underlying duckdb table.

        Args:
            filter_criteria: Optional filters, all of which must hold.
            data: Optional relation to filter instead of the auditor table.

        Returns:
            The (optionally filtered) relation.

        """
        # Compare against None explicitly: truth-testing a relation goes
        # through its length, which runs the query and would make an empty
        # (but supplied) relation fall back to the whole auditor table.
        rel = self.get_relation() if data is None else data
        if filter_criteria:
            rel = rel.filter(self.combine_filters(filter_criteria))
        return rel

    def get_most_recent_records(
        self,
        order_criteria: list[OrderCriteria],
        partition_fields: Optional[list[str]] = None,
        pre_filter_criteria: Optional[list[FilterCriteria]] = None,
    ) -> DuckDBPyRelation:
        """Get most recent records, based on the order and partitioning,
        from the underlying duckdb table.

        Args:
            order_criteria: Ordering that puts the most recent record first.
            partition_fields: When given, the first record of every partition
                is returned; otherwise only the first record overall.
            pre_filter_criteria: Optional filters applied before ordering.

        Returns:
            A relation holding the selected record(s).

        """
        # Order terms are comma separated in sql; joining them with " AND "
        # does not produce a valid multi column ordering.
        ordering = ", ".join([self.normalise_order(crit) for crit in order_criteria])
        rel = self.get_relation()
        if pre_filter_criteria:
            rel = rel.filter(self.combine_filters(pre_filter_criteria))
        if not partition_fields:
            return rel.order(ordering).limit(1)

        partitioning = ",".join(partition_fields)
        # Number the rows inside each partition, keep the first one and drop
        # the helper column again.
        return (
            rel.select(
                f"*, row_number() OVER (PARTITION BY {partitioning} ORDER BY {ordering}) as RN"
            )
            .filter("RN = 1")
            .select(StarExpression(exclude=["RN"]))
        )
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class DDBAuditingManager(BaseAuditingManager[DDBAuditor, DuckDBPyRelation]):
    """Auditing manager for duckdb implementation.

    Owns one `DDBAuditor` per audit table (processing status, submission
    info, submission statistics and transfers), all sharing one connection.
    """

    def __init__(
        self,
        database_uri: URI,
        pool: Optional[ExecutorType] = None,
        connection: Optional[DuckDBPyConnection] = None,
    ):
        """Create the auditors on a shared duckdb connection.

        Args:
            database_uri: Location of the duckdb database. Only used to open
                a new connection when `connection` is not supplied.
            pool: Optional executor forwarded to the base class.
            connection: An existing duckdb connection to reuse. When omitted
                a single threaded read/write connection is opened.

        """
        self._database_uri = database_uri
        self._connection: DuckDBPyConnection = (
            connection
            if connection
            else connect(
                database=database_uri,
                config={
                    "access_mode": "READ_WRITE",
                    "default_null_order": "NULLS_LAST",
                    "threads": 1,
                },
            )
        )
        self._pool = pool
        super().__init__(
            processing_status=DDBAuditor(
                record_type=ProcessingStatusRecord,
                database_uri=self._database_uri,
                name="processing_status",
                connection=self._connection,
            ),
            submission_info=DDBAuditor(
                record_type=SubmissionInfo,
                database_uri=self._database_uri,
                name="submission_info",
                connection=self._connection,
            ),
            submission_statistics=DDBAuditor(
                record_type=SubmissionStatisticsRecord,
                database_uri=self._database_uri,
                name="submission_statistics",
                connection=self._connection,
            ),
            transfers=DDBAuditor(
                record_type=TransferRecord,
                database_uri=self._database_uri,
                name="transfers",
                connection=self._connection,
            ),
            pool=self._pool,
        )

    def combine_auditor_information(
        self, left: Union[DDBAuditor, DuckDBPyRelation], right: Union[DDBAuditor, DuckDBPyRelation]
    ) -> DuckDBPyRelation:
        """Inner join two auditors (or relations) on ``submission_id``.

        Args:
            left: Auditor or relation providing the left hand side.
            right: Auditor or relation providing the right hand side.

        Returns:
            A relation with every column of `left`, followed by the columns
            of `right` whose names do not already occur in `left`.

        """
        if isinstance(left, DDBAuditor):
            left = left.get_relation()
        left = left.set_alias("lhs")
        if isinstance(right, DDBAuditor):
            right = right.get_relation()
        right = right.set_alias("rhs")

        # Read the left column names once instead of once per right column.
        left_columns = left.columns
        taken = set(left_columns)
        return left.join(right, condition="submission_id", how="inner").select(
            *[f"lhs.{fld}" for fld in left_columns],
            *[f"rhs.{fld}" for fld in right.columns if fld not in taken],
        )

    @staticmethod
    def conv_to_iterable(recs: Union[DDBAuditor, DuckDBPyRelation]) -> Iterable[dict[str, Any]]:
        """Iterate an auditor's table, or a relation, as row dictionaries.

        The data is materialised as a polars dataframe before iteration.
        """
        recs_rel: DuckDBPyRelation = recs.get_relation() if isinstance(recs, DDBAuditor) else recs
        return recs_rel.pl().iter_rows(named=True)
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
"""An implementation of the data contract in Duck DB."""
|
|
2
|
+
|
|
3
|
+
# pylint: disable=R0903
|
|
4
|
+
import logging
|
|
5
|
+
from collections.abc import Iterator
|
|
6
|
+
from functools import partial
|
|
7
|
+
from typing import Any, Optional
|
|
8
|
+
from uuid import uuid4
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
import polars as pl
|
|
12
|
+
import pyarrow.parquet as pq # type: ignore
|
|
13
|
+
from duckdb import DuckDBPyConnection, DuckDBPyRelation
|
|
14
|
+
from duckdb.typing import DuckDBPyType
|
|
15
|
+
from polars.datatypes.classes import DataTypeClass as PolarsType
|
|
16
|
+
from pydantic import BaseModel
|
|
17
|
+
from pydantic.fields import ModelField
|
|
18
|
+
|
|
19
|
+
import dve.parser.file_handling as fh
|
|
20
|
+
from dve.common.error_utils import (
|
|
21
|
+
BackgroundMessageWriter,
|
|
22
|
+
dump_processing_errors,
|
|
23
|
+
get_feedback_errors_uri,
|
|
24
|
+
)
|
|
25
|
+
from dve.core_engine.backends.base.contract import BaseDataContract
|
|
26
|
+
from dve.core_engine.backends.base.utilities import (
|
|
27
|
+
check_if_parquet_file,
|
|
28
|
+
generate_error_casting_entity_message,
|
|
29
|
+
)
|
|
30
|
+
from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import (
|
|
31
|
+
duckdb_read_parquet,
|
|
32
|
+
duckdb_write_parquet,
|
|
33
|
+
get_duckdb_type_from_annotation,
|
|
34
|
+
relation_is_empty,
|
|
35
|
+
)
|
|
36
|
+
from dve.core_engine.backends.implementations.duckdb.types import DuckDBEntities
|
|
37
|
+
from dve.core_engine.backends.metadata.contract import DataContractMetadata
|
|
38
|
+
from dve.core_engine.backends.types import StageSuccessful
|
|
39
|
+
from dve.core_engine.backends.utilities import get_polars_type_from_annotation, stringify_model
|
|
40
|
+
from dve.core_engine.message import FeedbackMessage
|
|
41
|
+
from dve.core_engine.type_hints import URI, EntityLocations
|
|
42
|
+
from dve.core_engine.validation import RowValidator, apply_row_validator_helper
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class PandasApplyHelper:
    """Adapter that lets a RowValidator be used as a pandas row callback.

    Messages produced for every visited row are accumulated in `errors`.
    """

    def __init__(self, row_validator: RowValidator):
        # Collected messages, in the order the rows were visited.
        self.errors: list[FeedbackMessage] = []
        self.row_validator = row_validator

    def __call__(self, row: pd.Series):
        """Validate a single row, record its messages and hand the row back."""
        validation_result = self.row_validator(row.to_dict())
        # The second element of the validator result holds the messages.
        row_messages = validation_result[1]  # type: ignore
        self.errors.extend(row_messages)
        # The row itself is returned untouched.
        return row
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@duckdb_write_parquet
@duckdb_read_parquet
class DuckDBDataContract(BaseDataContract[DuckDBPyRelation]):
    """An implementation of a data contract in DuckDB.

    This utilises pyarrow to distribute parquet data across python processes and
    a background process to write error messages.

    """

    def __init__(
        self,
        connection: DuckDBPyConnection,
        logger: Optional[logging.Logger] = None,
        debug: bool = False,
        **kwargs: Any,
    ):
        """Create the data contract.

        Args:
            connection: The duckdb connection used to build relations.
            logger: Optional logger forwarded to the base class.
            debug: Whether to enable debug logging. In `apply_data_contract`
                this also forces row counts to be evaluated.
            **kwargs: Further keyword arguments forwarded to the base class.

        """
        # A bool indicating whether to enable debug logging.
        self.debug = debug
        self._connection = connection

        super().__init__(logger, **kwargs)

    @property
    def connection(self) -> DuckDBPyConnection:
        """The duckdb connection"""
        return self._connection

    def _cache_records(self, relation: DuckDBPyRelation, working_dir: URI) -> URI:
        """Write `relation` to a uniquely named parquet file in `working_dir`.

        Returns the URI of the written file.
        """
        # NOTE(review): `write_parquet` is not defined in this class; it is
        # presumably supplied by the `duckdb_write_parquet` decorator - confirm.
        chunk_uri = "/".join((working_dir.rstrip("/"), str(uuid4()))) + ".parquet"
        self.write_parquet(entity=relation, target_location=chunk_uri)
        return chunk_uri

    def create_entity_from_py_iterator(  # pylint: disable=unused-argument
        self, entity_name: URI, records: Iterator[dict[URI, Any]], schema: type[BaseModel]
    ) -> DuckDBPyRelation:
        """Create DuckDB Relation from iterator of records.

        Args:
            entity_name: Name of the entity (unused here).
            records: The records to load.
            schema: Model whose stringified fields define the polars schema.

        """
        polars_schema: dict[str, PolarsType] = {
            fld.name: get_polars_type_from_annotation(fld.type_)
            for fld in stringify_model(schema).__fields__.values()
        }
        # `_lazy_df` is referenced by name inside the sql text below.
        _lazy_df = pl.LazyFrame(records, polars_schema)  # type: ignore # pylint: disable=unused-variable
        return self._connection.sql("select * from _lazy_df")

    @staticmethod
    def generate_ddb_cast_statement(
        column_name: str, dtype: DuckDBPyType, null_flag: bool = False
    ) -> str:
        """Helper method to generate sql statements for casting datatypes (permissively).
        Current duckdb python API doesn't play well with this currently.

        Args:
            column_name: Column to cast; also used as the output alias.
            dtype: Target duckdb type.
            null_flag: When true, emit a typed NULL column instead of casting
                an existing column.

        """
        if not null_flag:
            return f'try_cast("{column_name}" AS {dtype}) AS "{column_name}"'
        return f'cast(NULL AS {dtype}) AS "{column_name}"'

    # pylint: disable=R0914
    def apply_data_contract(
        self,
        working_dir: URI,
        entities: DuckDBEntities,
        entity_locations: EntityLocations,
        contract_metadata: DataContractMetadata,
        key_fields: Optional[dict[str, list[str]]] = None,
    ) -> tuple[DuckDBEntities, URI, StageSuccessful]:
        """Apply the data contract to the duckdb relations.

        Every entity is validated batch by batch from its parquet file, with
        messages handed to a background writer, and is then projected onto
        the contract schema using permissive casts.

        Args:
            working_dir: Directory used for feedback and processing errors.
            entities: Relations keyed by entity name; updated in place.
            entity_locations: Storage location per entity; entries that are
                not parquet files are rewritten as parquet and updated.
            contract_metadata: Provides the schema and row validator of each
                entity.
            key_fields: Optional key fields per entity, forwarded to the
                message writer.

        Returns:
            The updated entities, the feedback errors URI and a flag that is
            False when casting failed for at least one entity.

        Raises:
            ValueError: In debug mode, when the row count changes while
                converting an entity.

        """
        self.logger.info("Applying data contracts")
        feedback_errors_uri: URI = get_feedback_errors_uri(working_dir, "data_contract")

        # check if entities are valid parquet - if not, convert
        for entity, entity_loc in entity_locations.items():
            if not check_if_parquet_file(entity_loc):
                parquet_uri = self.write_parquet(
                    entities[entity], fh.joinuri(fh.get_parent(entity_loc), f"{entity}.parquet")
                )
                entity_locations[entity] = parquet_uri

        successful = True

        with BackgroundMessageWriter(
            working_dir, "data_contract", key_fields=key_fields
        ) as msg_writer:
            for entity_name, relation in entities.items():
                # get dtypes for all fields -> python data types or use with relation
                entity_fields: dict[str, ModelField] = contract_metadata.schemas[
                    entity_name
                ].__fields__
                ddb_schema: dict[str, DuckDBPyType] = {
                    fld.name: get_duckdb_type_from_annotation(fld.annotation)
                    for fld in entity_fields.values()
                }
                polars_schema: dict[str, PolarsType] = {
                    fld.name: get_polars_type_from_annotation(fld.annotation)
                    for fld in entity_fields.values()
                }
                if relation_is_empty(relation):
                    self.logger.warning(f"+ Empty relation for {entity_name}")
                    # `empty_df` is referenced by name inside the sql text below.
                    empty_df = pl.DataFrame([], schema=polars_schema)  # type: ignore # pylint: disable=W0612
                    # Store the typed empty relation; assigning it only to the
                    # loop variable would discard it on `continue`.
                    entities[entity_name] = self._connection.sql("select * from empty_df")
                    continue

                self.logger.info(f"+ Applying contract to: {entity_name}")

                row_validator_helper = partial(
                    apply_row_validator_helper,
                    row_validator=contract_metadata.validators[entity_name],
                )

                batches = pq.ParquetFile(entity_locations[entity_name]).iter_batches(10000)
                msg_count = 0
                for batch in batches:
                    if msgs := row_validator_helper(arrow_batch=batch):
                        msg_writer.write_queue.put(msgs)
                        msg_count += len(msgs)

                self.logger.info(f"Data contract found {msg_count} issues in {entity_name}")

                # Columns missing from the relation are added as typed NULLs.
                casting_statements = [
                    (
                        self.generate_ddb_cast_statement(column, dtype)
                        if column in relation.columns
                        else self.generate_ddb_cast_statement(column, dtype, null_flag=True)
                    )
                    for column, dtype in ddb_schema.items()
                ]
                try:
                    relation = relation.project(", ".join(casting_statements))
                except Exception as err:  # pylint: disable=broad-except
                    # A failed cast is reported and the stage marked as
                    # unsuccessful, but the remaining entities are still processed.
                    successful = False
                    self.logger.error(f"Error in casting relation: {err}")
                    dump_processing_errors(
                        working_dir,
                        "data_contract",
                        [generate_error_casting_entity_message(entity_name)],
                    )
                    continue

                if self.debug:
                    # count will force evaluation - only done in debug
                    pre_convert_row_count = relation.count("*").fetchone()[0]  # type: ignore
                    self.logger.info(f"+ Converting to parquet: ({pre_convert_row_count} rows)")
                else:
                    pre_convert_row_count = 0
                    self.logger.info("+ Converting to parquet")

                entities[entity_name] = relation
                if self.debug:
                    post_convert_row_count = entities[entity_name].count("*").fetchone()[0]  # type: ignore # pylint:disable=line-too-long
                    self.logger.info(f"+ Converted to parquet: ({post_convert_row_count} rows)")
                    if post_convert_row_count != pre_convert_row_count:
                        raise ValueError(
                            f"Row count mismatch for {entity_name}"
                            f" ({pre_convert_row_count} vs {post_convert_row_count})"
                        )
                else:
                    self.logger.info("+ Converted to parquet")

        return entities, feedback_errors_uri, successful
|