data-validation-engine 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_validation_engine-0.6.2.dist-info/METADATA +104 -0
- data_validation_engine-0.6.2.dist-info/RECORD +105 -0
- data_validation_engine-0.6.2.dist-info/WHEEL +4 -0
- data_validation_engine-0.6.2.dist-info/licenses/LICENSE +21 -0
- dve/__init__.py +0 -0
- dve/common/__init__.py +0 -0
- dve/common/error_utils.py +189 -0
- dve/core_engine/__init__.py +0 -0
- dve/core_engine/backends/__init__.py +1 -0
- dve/core_engine/backends/base/__init__.py +1 -0
- dve/core_engine/backends/base/auditing.py +618 -0
- dve/core_engine/backends/base/backend.py +240 -0
- dve/core_engine/backends/base/contract.py +454 -0
- dve/core_engine/backends/base/core.py +124 -0
- dve/core_engine/backends/base/reader.py +176 -0
- dve/core_engine/backends/base/reference_data.py +217 -0
- dve/core_engine/backends/base/rules.py +685 -0
- dve/core_engine/backends/base/utilities.py +146 -0
- dve/core_engine/backends/exceptions.py +311 -0
- dve/core_engine/backends/implementations/__init__.py +1 -0
- dve/core_engine/backends/implementations/duckdb/__init__.py +26 -0
- dve/core_engine/backends/implementations/duckdb/auditing.py +234 -0
- dve/core_engine/backends/implementations/duckdb/contract.py +213 -0
- dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +288 -0
- dve/core_engine/backends/implementations/duckdb/readers/__init__.py +13 -0
- dve/core_engine/backends/implementations/duckdb/readers/csv.py +222 -0
- dve/core_engine/backends/implementations/duckdb/readers/json.py +50 -0
- dve/core_engine/backends/implementations/duckdb/readers/xml.py +45 -0
- dve/core_engine/backends/implementations/duckdb/reference_data.py +49 -0
- dve/core_engine/backends/implementations/duckdb/rules.py +534 -0
- dve/core_engine/backends/implementations/duckdb/types.py +47 -0
- dve/core_engine/backends/implementations/duckdb/utilities.py +41 -0
- dve/core_engine/backends/implementations/spark/__init__.py +22 -0
- dve/core_engine/backends/implementations/spark/auditing.py +230 -0
- dve/core_engine/backends/implementations/spark/backend.py +78 -0
- dve/core_engine/backends/implementations/spark/contract.py +241 -0
- dve/core_engine/backends/implementations/spark/readers/__init__.py +15 -0
- dve/core_engine/backends/implementations/spark/readers/csv.py +77 -0
- dve/core_engine/backends/implementations/spark/readers/json.py +66 -0
- dve/core_engine/backends/implementations/spark/readers/xml.py +202 -0
- dve/core_engine/backends/implementations/spark/reference_data.py +42 -0
- dve/core_engine/backends/implementations/spark/rules.py +430 -0
- dve/core_engine/backends/implementations/spark/spark_helpers.py +412 -0
- dve/core_engine/backends/implementations/spark/types.py +21 -0
- dve/core_engine/backends/implementations/spark/utilities.py +144 -0
- dve/core_engine/backends/metadata/__init__.py +47 -0
- dve/core_engine/backends/metadata/contract.py +80 -0
- dve/core_engine/backends/metadata/reporting.py +374 -0
- dve/core_engine/backends/metadata/rules.py +737 -0
- dve/core_engine/backends/readers/__init__.py +41 -0
- dve/core_engine/backends/readers/csv.py +232 -0
- dve/core_engine/backends/readers/utilities.py +21 -0
- dve/core_engine/backends/readers/xml.py +432 -0
- dve/core_engine/backends/readers/xml_linting.py +142 -0
- dve/core_engine/backends/types.py +26 -0
- dve/core_engine/backends/utilities.py +177 -0
- dve/core_engine/configuration/__init__.py +1 -0
- dve/core_engine/configuration/base.py +56 -0
- dve/core_engine/configuration/v1/__init__.py +351 -0
- dve/core_engine/configuration/v1/filters.py +60 -0
- dve/core_engine/configuration/v1/rule_stores/__init__.py +1 -0
- dve/core_engine/configuration/v1/rule_stores/models.py +57 -0
- dve/core_engine/configuration/v1/steps.py +365 -0
- dve/core_engine/constants.py +8 -0
- dve/core_engine/engine.py +265 -0
- dve/core_engine/exceptions.py +29 -0
- dve/core_engine/functions/__init__.py +6 -0
- dve/core_engine/functions/implementations.py +200 -0
- dve/core_engine/loggers.py +57 -0
- dve/core_engine/message.py +512 -0
- dve/core_engine/models.py +196 -0
- dve/core_engine/templating.py +114 -0
- dve/core_engine/type_hints.py +255 -0
- dve/core_engine/validation.py +160 -0
- dve/metadata_parser/__init__.py +2 -0
- dve/metadata_parser/domain_types.py +682 -0
- dve/metadata_parser/exc.py +44 -0
- dve/metadata_parser/function_library.py +64 -0
- dve/metadata_parser/function_wrapper.py +201 -0
- dve/metadata_parser/model_generator.py +119 -0
- dve/metadata_parser/models.py +410 -0
- dve/metadata_parser/utilities.py +54 -0
- dve/parser/__init__.py +1 -0
- dve/parser/exceptions.py +50 -0
- dve/parser/file_handling/__init__.py +31 -0
- dve/parser/file_handling/helpers.py +29 -0
- dve/parser/file_handling/implementations/__init__.py +7 -0
- dve/parser/file_handling/implementations/base.py +97 -0
- dve/parser/file_handling/implementations/dbfs.py +81 -0
- dve/parser/file_handling/implementations/file.py +203 -0
- dve/parser/file_handling/implementations/s3.py +371 -0
- dve/parser/file_handling/log_handler.py +215 -0
- dve/parser/file_handling/service.py +441 -0
- dve/parser/file_handling/utilities.py +53 -0
- dve/parser/type_hints.py +46 -0
- dve/parser/utilities.py +113 -0
- dve/pipeline/__init__.py +0 -0
- dve/pipeline/duckdb_pipeline.py +56 -0
- dve/pipeline/foundry_ddb_pipeline.py +171 -0
- dve/pipeline/pipeline.py +935 -0
- dve/pipeline/spark_pipeline.py +69 -0
- dve/pipeline/utils.py +96 -0
- dve/reporting/__init__.py +1 -0
- dve/reporting/error_report.py +153 -0
- dve/reporting/excel_report.py +319 -0
|
@@ -0,0 +1,534 @@
|
|
|
1
|
+
"""Business rule definitions for duckdb backend"""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Callable
|
|
4
|
+
from typing import get_type_hints
|
|
5
|
+
from uuid import uuid4
|
|
6
|
+
|
|
7
|
+
from duckdb import (
|
|
8
|
+
ColumnExpression,
|
|
9
|
+
ConstantExpression,
|
|
10
|
+
DuckDBPyConnection,
|
|
11
|
+
DuckDBPyRelation,
|
|
12
|
+
StarExpression,
|
|
13
|
+
)
|
|
14
|
+
from duckdb.typing import DuckDBPyType
|
|
15
|
+
|
|
16
|
+
from dve.core_engine.backends.base.rules import (
|
|
17
|
+
BaseStepImplementations,
|
|
18
|
+
ColumnAddition,
|
|
19
|
+
ColumnRemoval,
|
|
20
|
+
SelectColumns,
|
|
21
|
+
)
|
|
22
|
+
from dve.core_engine.backends.exceptions import ConstraintError
|
|
23
|
+
from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import (
|
|
24
|
+
DDBStruct,
|
|
25
|
+
duckdb_read_parquet,
|
|
26
|
+
duckdb_rel_to_dictionaries,
|
|
27
|
+
duckdb_write_parquet,
|
|
28
|
+
get_all_registered_udfs,
|
|
29
|
+
get_duckdb_type_from_annotation,
|
|
30
|
+
)
|
|
31
|
+
from dve.core_engine.backends.implementations.duckdb.types import (
|
|
32
|
+
DuckDBEntities,
|
|
33
|
+
Joined,
|
|
34
|
+
Source,
|
|
35
|
+
Target,
|
|
36
|
+
)
|
|
37
|
+
from dve.core_engine.backends.implementations.duckdb.utilities import parse_multiple_expressions
|
|
38
|
+
from dve.core_engine.backends.metadata.rules import (
|
|
39
|
+
AbstractConditionalJoin,
|
|
40
|
+
AbstractNewColumnConditionalJoin,
|
|
41
|
+
Aggregation,
|
|
42
|
+
AntiJoin,
|
|
43
|
+
ConfirmJoinHasMatch,
|
|
44
|
+
HeaderJoin,
|
|
45
|
+
ImmediateFilter,
|
|
46
|
+
InnerJoin,
|
|
47
|
+
LeftJoin,
|
|
48
|
+
Notification,
|
|
49
|
+
OneToOneJoin,
|
|
50
|
+
OrphanIdentification,
|
|
51
|
+
SemiJoin,
|
|
52
|
+
TableUnion,
|
|
53
|
+
)
|
|
54
|
+
from dve.core_engine.constants import ROWID_COLUMN_NAME
|
|
55
|
+
from dve.core_engine.functions import implementations as functions
|
|
56
|
+
from dve.core_engine.message import FeedbackMessage
|
|
57
|
+
from dve.core_engine.templating import template_object
|
|
58
|
+
from dve.core_engine.type_hints import Messages
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@duckdb_write_parquet
@duckdb_read_parquet
class DuckDBStepImplementations(BaseStepImplementations[DuckDBPyRelation]):
    """An implementation of transformation steps in duckdb.

    Each step method takes the mutable ``entities`` mapping plus a step
    ``config`` object, writes its result back into ``entities`` (under
    ``config.new_entity_name`` when supplied, otherwise overwriting
    ``config.entity_name``) and returns any ``FeedbackMessage`` objects
    produced while applying the step.
    """

    def __init__(self, connection: DuckDBPyConnection, **kwargs):
        # Keep a handle on the connection so steps can run ad-hoc SQL
        # (e.g. re-materialising a polars frame after a pivot).
        self._connection = connection
        self.registered_functions = get_all_registered_udfs(self._connection)
        super().__init__(**kwargs)

    @property
    def connection(self) -> DuckDBPyConnection:
        """The duckdb connection"""
        return self._connection

    @classmethod
    def register_udfs(  # type: ignore
        cls, connection: DuckDBPyConnection, **kwargs
    ):  # pylint: disable=arguments-differ
        """Method to register all custom dve functions for use during business rules application

        Registers every public callable from
        ``dve.core_engine.functions.implementations`` that is not already
        known to the connection, records the new names in the ``dve_udfs``
        tracking table, and returns a constructed instance.
        """
        _registered_functions: set[str] = get_all_registered_udfs(connection)
        # All callables defined (not merely imported) in the implementations module.
        _available_functions: dict[str, Callable] = {
            func_name: func
            for func_name, func in vars(functions).items()
            if callable(func) and func.__module__ == "dve.core_engine.functions.implementations"
        }

        _unregistered_functions: set[str] = set(_available_functions).difference(
            _registered_functions
        )

        for function_name in _unregistered_functions:
            _func = _available_functions.get(function_name)
            # Map the Python return annotation onto a duckdb SQL type.
            _annotations = get_type_hints(_func)
            _return_type: DuckDBPyType = get_duckdb_type_from_annotation(_annotations["return"])

            connection.create_function(
                function_name,
                _func,  # type: ignore
                return_type=str(_return_type),  # type: ignore
            )
        if _unregistered_functions:
            # BUG FIX: each tuple was previously rendered as "('name',)" — a
            # Python tuple repr whose trailing comma is not valid SQL — and the
            # keyword "VALUES" had no trailing space before the first tuple.
            _sql = "INSERT INTO dve_udfs (function_name) VALUES " + ",".join(
                f"('{f_name}')" for f_name in _unregistered_functions
            )
            connection.sql(_sql)
        return cls(connection=connection, **kwargs)

    @staticmethod
    def add_row_id(entity: DuckDBPyRelation) -> DuckDBPyRelation:
        """Adds a row identifier to the Relation (no-op if already present)."""
        if ROWID_COLUMN_NAME not in entity.columns:
            entity = entity.project(f"*, ROW_NUMBER() OVER () as {ROWID_COLUMN_NAME}")
        return entity

    @staticmethod
    def drop_row_id(entity: DuckDBPyRelation) -> DuckDBPyRelation:
        """Drops the row identifier from a Relation (no-op if absent)."""
        if ROWID_COLUMN_NAME in entity.columns:
            entity = entity.select(StarExpression(exclude=[ROWID_COLUMN_NAME]))
        return entity

    def add(self, entities: DuckDBEntities, *, config: ColumnAddition) -> Messages:
        """A transformation step which adds a column to an entity."""
        entity: DuckDBPyRelation = entities[config.entity_name]
        entity = entity.select(f"*, {config.expression} as {config.column_name}")
        entities[config.new_entity_name or config.entity_name] = entity
        return []

    def remove(self, entities: DuckDBEntities, *, config: ColumnRemoval) -> Messages:
        """A transformation step which removes a column from an entity."""
        entity: DuckDBPyRelation = entities[config.entity_name]
        entity = entity.select(StarExpression(exclude=[config.column_name]))
        entities[config.new_entity_name or config.entity_name] = entity
        return []

    def select(self, entities: DuckDBEntities, *, config: SelectColumns) -> Messages:
        """A transformation step which selects columns from an entity."""
        entity: DuckDBPyRelation = entities[config.entity_name]
        entity = entity.select(", ".join(parse_multiple_expressions(config.columns)))
        if config.distinct:
            entity = entity.distinct()
        # Single assignment (previously assigned twice when distinct was set).
        entities[config.new_entity_name or config.entity_name] = entity
        return []

    def group_by(self, entities: DuckDBEntities, *, config: Aggregation) -> Messages:
        """A transformation step which performs an aggregation on an entity.

        When ``config.pivot_column`` is set the aggregation is delegated to
        polars' ``pivot`` and the result re-materialised via the connection;
        otherwise a plain duckdb ``aggregate`` is used.
        """

        def _add_cnst_field(rel: DuckDBPyRelation) -> tuple[str, DuckDBPyRelation]:
            """Add a constant field for use as an index to allow for pivoting with no group"""
            fld_name = f"fld_{uuid4().hex[0:8]}"
            return fld_name, rel.select(
                StarExpression(exclude=[]), ConstantExpression(1).alias(fld_name)
            )

        entity: DuckDBPyRelation = entities[config.entity_name]

        group_cols = parse_multiple_expressions(config.group_by)
        agg_cols = parse_multiple_expressions(config.agg_columns)

        if config.pivot_column:
            # BUG FIX: const_fld was previously only bound when group_cols was
            # empty, but referenced unconditionally below — raising NameError
            # for any pivot with group-by columns.
            const_fld = None
            if not group_cols:
                const_fld, entity = _add_cnst_field(entity)
            group_pl = entity.pl().pivot(
                columns=[config.pivot_column],
                values=agg_cols,
                index=(group_cols or [const_fld]),
                aggregate_function=config.agg_function,
            )
            # Drop the synthetic index column if one was introduced.
            if const_fld is not None and const_fld in group_pl.columns:
                group_pl = group_pl.drop(const_fld)
            if config.pivot_values:
                group_pl = group_pl.select(group_cols + config.pivot_values)
            # Re-materialise the polars frame as a duckdb relation (duckdb
            # resolves `group_pl` from the enclosing scope).
            group = self._connection.sql("select * from group_pl")
        else:
            group = entity.aggregate(", ".join(group_cols + agg_cols))

        entities[config.new_entity_name or config.entity_name] = group
        return []

    def _resolve_join_name_conflicts(
        self, source_rel: Source, joined_rel: Joined, config: AbstractNewColumnConditionalJoin
    ) -> Joined:
        """Resolve name conflicts in joined DataFrames."""
        # Need to ensure we keep source columns but these can be overridden by
        # new computed keys.
        # Start with new computed names (in ddb - earlier columns take preference)
        columns = parse_multiple_expressions(config.new_columns)
        columns.extend(
            parse_multiple_expressions(
                [f"{config.entity_name}.{column_name}" for column_name in source_rel.columns]
            )
        )
        # Select them from the join. There may be duplicates here for overridden fields.
        result_all_cols = joined_rel.select(", ".join(columns))

        # Now need to handle the existence of dupes.
        # we keep any specified in config.new_columns as duckdb can only access most left-hand
        # field where name clashes.
        # Need to be careful with case sensitivity - duckdb (by default) ignores case.
        temp_column_names = []
        concrete_to_temp_mapping = {}
        case_mapping = {}
        for index, column_name in enumerate(result_all_cols.columns):
            temp_name = str(index)

            # Duckdb case insensitive by default
            # iterate through column names and keep only earliest casing
            uppercase_name = column_name.upper()
            if uppercase_name not in case_mapping:
                case_mapping[uppercase_name] = column_name
                concrete_to_temp_mapping[uppercase_name] = temp_name

            # Store the temp name, and the mapping between uppercase name and temp name.
            temp_column_names.append(temp_name)

        # Rename with the indices, so we can deduplicate column names.
        result_temp_names = result_all_cols.select(
            *[
                ColumnExpression(column_name).alias(str(index))
                for index, column_name in enumerate(result_all_cols.columns)
            ]
        )
        # Keep only the earliest column for each (case insensitive) column name.
        earliest_temp_names = result_temp_names.select(
            *[ColumnExpression(temp_name) for temp_name in concrete_to_temp_mapping.values()]
        )
        # Rename those fields to their 'proper' names (respect user-supplied case).
        return earliest_temp_names.select(
            *[
                ColumnExpression(temp_name).alias(case_mapping[upp_name])
                for upp_name, temp_name in concrete_to_temp_mapping.items()
            ]
        )

    def _perform_join(
        self, entities: DuckDBEntities, config: AbstractConditionalJoin
    ) -> tuple[Source, Target, Joined]:
        """Perform a conditional join between source and target, returning the
        source, target and joined DataFrames.

        The join flavour is derived from the concrete config type; anything
        that is not an inner/semi/anti join falls back to a left join.
        """
        source_rel: DuckDBPyRelation = entities[config.entity_name]
        source_rel = source_rel.set_alias(config.entity_name)
        target_rel: DuckDBPyRelation = entities[config.target_name]
        target_rel = target_rel.set_alias(config.target_name)

        if isinstance(config, InnerJoin):
            join_type = "inner"
        elif isinstance(config, SemiJoin):
            join_type = "semi"
        elif isinstance(config, AntiJoin):
            join_type = "anti"
        else:
            join_type = "left"

        joined_rel = source_rel.join(target_rel, condition=config.join_condition, how=join_type)

        return source_rel, target_rel, joined_rel

    def has_match(self, entities: DuckDBEntities, *, config: ConfirmJoinHasMatch) -> Messages:
        """Add a boolean column to a source entity, indicating whether it matches
        a target for the given condition.

        :raises ConstraintError: when ``perform_integrity_check`` is set and
            a source record matched more than one target record.
        """
        source_rel, _, joined_rel = self._perform_join(entities, config)
        entity = joined_rel.select(
            f"*, COALESCE({config.join_condition}, FALSE) AS {config.column_name}"
        )

        if config.perform_integrity_check:
            # A left join fans out on multiple matches, so counts diverging
            # means some source row matched more than one target row.
            joined_count = joined_rel.count("*").fetchone()[0]  # type: ignore
            source_count = source_rel.count("*").fetchone()[0]  # type: ignore
            if joined_count != source_count:
                raise ConstraintError(
                    f"Multiple matches for some records from {config.entity_name!r} for "
                    + f"condition {config.join_condition!r}",
                    constraint=(
                        f"records in source entity ({config.entity_name!r}) must match at most "
                        + f"a single record in the target ({config.target_name})"
                    ),
                )

        entities[config.new_entity_name or config.entity_name] = entity
        return []

    def left_join(self, entities: DuckDBEntities, *, config: LeftJoin) -> Messages:
        """Perform a left join from a source entity to a target table, updating
        the source entity or creating a new joined entity.

        """
        source_rel, _, joined_rel = self._perform_join(entities, config)

        entities[config.new_entity_name or config.entity_name] = self._resolve_join_name_conflicts(
            source_rel, joined_rel, config
        )
        return []

    def inner_join(self, entities: DuckDBEntities, *, config: InnerJoin) -> Messages:
        """Perform an inner join from a source entity to a target table, updating
        the source entity or creating a new joined entity.

        """
        source_rel, _, joined_rel = self._perform_join(entities, config)
        entities[config.new_entity_name or config.entity_name] = self._resolve_join_name_conflicts(
            source_rel, joined_rel, config
        )

        return []

    def one_to_one_join(self, entities: DuckDBEntities, *, config: OneToOneJoin) -> Messages:
        """Perform a join from a source entity to a target table, updating
        the source entity or creating a new joined entity.

        This will be a left join that enforces a one-to-one relationship.

        :raises ConstraintError: when ``perform_integrity_check`` is set and
            the join changed the row count (i.e. a one-to-many match).
        """
        # Capture the source before left_join overwrites the entities entry.
        source_rel: DuckDBPyRelation = entities[config.entity_name]
        messages = self.left_join(entities, config=config)
        joined_rel: DuckDBPyRelation = entities[config.new_entity_name or config.entity_name]

        if config.perform_integrity_check:
            if (
                joined_rel.count("*").fetchone()[0] != source_rel.count("*").fetchone()[0]  # type: ignore # pylint: disable=line-too-long
            ):
                raise ConstraintError(
                    f"Multiple matches for some records from {config.entity_name!r} for "
                    + f"condition {config.join_condition!r}",
                    constraint=(
                        f"records in source entity ({config.entity_name!r}) must match at most "
                        + f"a single record in the target ({config.target_name})"
                    ),
                )
        return messages

    def semi_join(self, entities: DuckDBEntities, *, config: SemiJoin) -> Messages:
        """Perform a semi join from a source entity to a target table, updating
        the source entity or creating a new joined entity.

        """
        _, _, joined_rel = self._perform_join(entities, config)

        entities[config.new_entity_name or config.entity_name] = joined_rel
        return []

    def anti_join(self, entities: DuckDBEntities, *, config: AntiJoin) -> Messages:
        """Perform an anti join from a source entity to a target table, updating
        the source entity or creating a new joined entity.

        """
        _, _, joined_rel = self._perform_join(entities, config)

        entities[config.new_entity_name or config.entity_name] = joined_rel
        return []

    def join_header(self, entities: DuckDBEntities, *, config: HeaderJoin) -> Messages:
        """Add a 'header' entity to each row in the source entity. The header entity
        must contain only a single record.

        :raises ConstraintError: when the header entity does not contain
            exactly one record.
        """
        source_rel: DuckDBPyRelation = entities[config.entity_name]
        source_rel = source_rel.set_alias(config.entity_name)
        target_rel: DuckDBPyRelation = entities[config.target_name]
        target_rel = target_rel.set_alias(config.target_name)

        # Materialise the header rows so we can validate cardinality.
        target_rows = target_rel.pl().to_struct("header").to_list()
        n_target_rows = len(target_rows)
        if n_target_rows != 1:
            raise ConstraintError(
                f"Unable to join header {config.target_name!r} to {config.entity_name!r} "
                + f"as it contains multiple entries (expected 1, got {n_target_rows})",
                constraint=(
                    f"Header entity {config.target_name!r} must contain a single record "
                    + f"(contains {n_target_rows} records)"
                ),
            )

        # Build a STRUCT type mirroring the header's schema so the constant
        # header value is attached with correct field types.
        target_schema = DDBStruct(dict(zip(target_rel.columns, target_rel.dtypes)))()

        joined_rel = source_rel.select(
            StarExpression(exclude=[]),
            ConstantExpression(target_rows[0]).cast(target_schema).alias(config.header_column_name),
        )

        entities[config.new_entity_name or config.entity_name] = joined_rel
        return []

    def identify_orphans(
        self, entities: DuckDBEntities, *, config: OrphanIdentification
    ) -> Messages:
        """Identify records in an entity which don't have at least one corresponding
        match in the target. A new boolean column will be added to `entity` ('IsOrphaned')
        indicating whether the condition matched.

        If there is already an 'IsOrphaned' column in the entity, this will be set to the
        logical OR of its current value and the value it would have been set to otherwise.

        """
        source_rel: DuckDBPyRelation = entities[config.entity_name]
        source_rel = source_rel.set_alias(config.entity_name)
        target_rel: DuckDBPyRelation = entities[config.target_name]
        target_rel = target_rel.set_alias(config.target_name)

        # Synthetic unique key so we can re-join the per-row match flag back on.
        key_name = f"key_{uuid4().hex}"
        source_rel = source_rel.select(f"*, row_number() over () as {key_name}").set_alias(
            config.entity_name
        )
        # Constant marker on the target: after a left join, its count per key
        # is zero exactly when no target record matched.
        match_name = f"matched_{uuid4().hex}"
        target_rel = target_rel.select(
            StarExpression(exclude=[]), ConstantExpression(1).alias(match_name)
        ).set_alias(config.target_name)

        joined_rel: DuckDBPyRelation = source_rel.join(
            target_rel, condition=config.join_condition, how="left"
        ).aggregate(f"{key_name}, coalesce(count({match_name})==0, TRUE) AS IsOrphaned")

        if "IsOrphaned" not in source_rel.columns:
            result: DuckDBPyRelation = source_rel.join(
                joined_rel, condition=key_name, how="left"
            ).select(StarExpression(exclude=[key_name]))
        else:
            # Keep both IsOrphaned columns around so they can be OR-ed below.
            result = source_rel.set_alias("source").join(
                joined_rel.set_alias("joined"),
                condition=f"source.{key_name} = joined.{key_name}",
                how="left",
            )

        columns = {name: f"source.{name}" for name in source_rel.columns}
        if "IsOrphaned" in source_rel.columns:
            columns["IsOrphaned"] = ColumnExpression("source.IsOrphaned") | ColumnExpression("joined.IsOrphaned")  # type: ignore # pylint: disable=line-too-long
        columns.pop(key_name, None)

        result = result.select(
            ",".join([f"{column} as {name}" for name, column in columns.items()])
        )

        entities[config.new_entity_name or config.entity_name] = result
        return []

    def union(self, entities: DuckDBEntities, *, config: TableUnion) -> Messages:
        """Union two entities together, taking the columns from each by name.

        Where columns have the same name, they must be the same type or coerceable.
        Where column casing differs, the casing from the `source` entity will be kept.

        Column order will be preserved, with columns from `source` taken first and extra
        columns in `target` added in order afterwards.

        """
        source_rel: DuckDBPyRelation = entities[config.entity_name]
        source_rel = source_rel.set_alias(config.entity_name)
        target_rel: DuckDBPyRelation = entities[config.target_name]
        target_rel = target_rel.set_alias(config.target_name)

        # Ensure all keys are present in both (match case-insensitively,
        # preferring the source's casing).
        source_names = {column_name.upper(): column_name for column_name in source_rel.columns}
        target_names = {column_name.upper(): column_name for column_name in target_rel.columns}

        all_names = list(source_names.keys())
        for name in target_names:
            if name not in source_names:
                all_names.append(name)

        source_columns, target_columns = [], []
        for uppercase_name in all_names:
            source_name = source_names.get(uppercase_name)
            target_name = target_names.get(uppercase_name)

            if source_name and target_name:
                source_col = ColumnExpression(source_name)
                target_col = ColumnExpression(target_name).alias(source_name)
            elif source_name:
                # Column only in source: pad target with NULLs.
                source_col = ColumnExpression(source_name)
                target_col = ConstantExpression(None).alias(source_name)
            elif target_name:
                # Column only in target: pad source with NULLs.
                source_col = ConstantExpression(None).alias(target_name)
                target_col = ColumnExpression(target_name)
            else:
                continue

            source_columns.append(source_col)
            target_columns.append(target_col)

        source_rel = source_rel.select(*source_columns)
        target_rel = target_rel.select(*target_columns)
        entities[config.new_entity_name or config.entity_name] = source_rel.union(target_rel)
        return []

    def filter(self, entities: DuckDBEntities, *, config: ImmediateFilter) -> Messages:
        """Filter an entity immediately, and do not emit any messages.

        The synchronised filter stage will be implemented separately.

        """
        entity = entities[config.entity_name]
        entity = entity.filter(config.expression)
        entities[config.new_entity_name or config.entity_name] = entity
        return []

    def notify(self, entities: DuckDBEntities, *, config: Notification) -> Messages:
        """Emit a notification based on an expression. Where the expression is truthy,
        a nofication should be emitted according to the reporting config.

        This is not intended to be used directly, but is used in the implementation of
        the sync filters.

        """
        messages: Messages = []
        entity = entities[config.entity_name]

        matched = entity.filter(config.expression)
        if config.excluded_columns:
            matched = matched.select(StarExpression(exclude=config.excluded_columns))

        for record in duckdb_rel_to_dictionaries(matched):
            # NOTE: only templates using values directly accessible in record - nothing nested
            # more complex extraction done in reporting module
            messages.append(
                FeedbackMessage(
                    entity=config.reporting.reporting_entity_override or config.entity_name,
                    original_entity=config.entity_name,
                    record=record,  # type: ignore
                    error_location=config.reporting.legacy_location,
                    error_message=template_object(config.reporting.message, record),  # type: ignore
                    failure_type=config.reporting.legacy_error_type,
                    error_type=config.reporting.legacy_error_type,
                    error_code=config.reporting.code,
                    reporting_field=config.reporting.legacy_reporting_field,
                    reporting_field_name=config.reporting.reporting_field_override,
                    is_informational=config.reporting.emit in ("warning", "info"),
                    category=config.reporting.category,
                )
            )
        return messages
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Types used in Spark implementations."""
|
|
2
|
+
|
|
3
|
+
# pylint: disable=C0103
|
|
4
|
+
from collections.abc import MutableMapping
|
|
5
|
+
|
|
6
|
+
from duckdb import DuckDBPyRelation
|
|
7
|
+
from typing_extensions import Literal
|
|
8
|
+
|
|
9
|
+
from dve.core_engine.type_hints import EntityName
|
|
10
|
+
|
|
11
|
+
SQLType = Literal[
|
|
12
|
+
"BIGINT",
|
|
13
|
+
"BIT",
|
|
14
|
+
"BLOB",
|
|
15
|
+
"BOOLEAN",
|
|
16
|
+
"DATE",
|
|
17
|
+
"DECIMAL",
|
|
18
|
+
"DOUBLE",
|
|
19
|
+
"HUGEINT",
|
|
20
|
+
"INTEGER",
|
|
21
|
+
"INTERVAL",
|
|
22
|
+
"REAL",
|
|
23
|
+
"SMALLINT",
|
|
24
|
+
"TIME",
|
|
25
|
+
"UBIGINT",
|
|
26
|
+
"UHUGEINT",
|
|
27
|
+
"UINTEGER",
|
|
28
|
+
"USMALLINT",
|
|
29
|
+
"UTINYINT",
|
|
30
|
+
"UUID",
|
|
31
|
+
"VARCHAR",
|
|
32
|
+
]
|
|
33
|
+
"""SQL types recognised in duckdb"""
|
|
34
|
+
|
|
35
|
+
Source = DuckDBPyRelation
|
|
36
|
+
"""The source entity for a join. This will be aliased to the source entity name."""
|
|
37
|
+
Target = DuckDBPyRelation
|
|
38
|
+
"""The target entity for a join. This will be aliased to the target entity name."""
|
|
39
|
+
Joined = DuckDBPyRelation
|
|
40
|
+
"""
|
|
41
|
+
The joined entity.
|
|
42
|
+
|
|
43
|
+
This will be able to reference source and target columns by their aliased names.
|
|
44
|
+
|
|
45
|
+
"""
|
|
46
|
+
DuckDBEntities = MutableMapping[EntityName, DuckDBPyRelation]
|
|
47
|
+
"""The type of a mapping of entity name to Spark entity."""
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Utility objects for use with duckdb backend"""
|
|
2
|
+
|
|
3
|
+
import itertools
|
|
4
|
+
|
|
5
|
+
from dve.core_engine.backends.base.utilities import _split_multiexpr_string
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def parse_multiple_expressions(expressions) -> list[str]:
    """Normalise a user-supplied expression container into a flat list.

    Accepts a mapping (expression -> alias), a list of expression strings,
    or a single multi-expression string; anything else yields an empty list.
    """
    if isinstance(expressions, str):
        return multiexpr_string_to_columns(expressions)
    if isinstance(expressions, dict):
        return expr_mapping_to_columns(expressions)
    if isinstance(expressions, list):
        return expr_array_to_columns(expressions)
    return []
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def expr_mapping_to_columns(expressions: dict) -> list[str]:
    """Render an expression->alias mapping as aliased duckdb column expressions."""
    return [f"{expr} as {alias}" for expr, alias in expressions.items()]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def expr_array_to_columns(expressions: list[str]) -> list[str]:
    """Flatten a list of (possibly multi-expression) strings into single expressions."""
    return [
        column
        for expression in expressions
        for column in _split_multiexpr_string(expression)
    ]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def multiexpr_string_to_columns(expressions: str) -> list[str]:
    """Split string containing multiple expressions to list of duck db
    column expressions
    """
    return expr_array_to_columns(_split_multiexpr_string(expressions))
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Implementation of the Apache Spark backend."""
|
|
2
|
+
|
|
3
|
+
from dve.core_engine.backends.readers import register_reader
|
|
4
|
+
|
|
5
|
+
from .backend import SparkBackend
|
|
6
|
+
from .contract import SparkDataContract
|
|
7
|
+
from .readers import SparkCSVReader, SparkJSONReader, SparkXMLReader, SparkXMLStreamReader
|
|
8
|
+
from .reference_data import SparkRefDataLoader
|
|
9
|
+
from .rules import SparkStepImplementations
|
|
10
|
+
|
|
11
|
+
register_reader(SparkCSVReader)
|
|
12
|
+
register_reader(SparkJSONReader)
|
|
13
|
+
register_reader(SparkXMLReader)
|
|
14
|
+
register_reader(SparkXMLStreamReader)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"SparkBackend",
|
|
19
|
+
"SparkDataContract",
|
|
20
|
+
"SparkRefDataLoader",
|
|
21
|
+
"SparkStepImplementations",
|
|
22
|
+
]
|