data-validation-engine 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. data_validation_engine-0.6.2.dist-info/METADATA +104 -0
  2. data_validation_engine-0.6.2.dist-info/RECORD +105 -0
  3. data_validation_engine-0.6.2.dist-info/WHEEL +4 -0
  4. data_validation_engine-0.6.2.dist-info/licenses/LICENSE +21 -0
  5. dve/__init__.py +0 -0
  6. dve/common/__init__.py +0 -0
  7. dve/common/error_utils.py +189 -0
  8. dve/core_engine/__init__.py +0 -0
  9. dve/core_engine/backends/__init__.py +1 -0
  10. dve/core_engine/backends/base/__init__.py +1 -0
  11. dve/core_engine/backends/base/auditing.py +618 -0
  12. dve/core_engine/backends/base/backend.py +240 -0
  13. dve/core_engine/backends/base/contract.py +454 -0
  14. dve/core_engine/backends/base/core.py +124 -0
  15. dve/core_engine/backends/base/reader.py +176 -0
  16. dve/core_engine/backends/base/reference_data.py +217 -0
  17. dve/core_engine/backends/base/rules.py +685 -0
  18. dve/core_engine/backends/base/utilities.py +146 -0
  19. dve/core_engine/backends/exceptions.py +311 -0
  20. dve/core_engine/backends/implementations/__init__.py +1 -0
  21. dve/core_engine/backends/implementations/duckdb/__init__.py +26 -0
  22. dve/core_engine/backends/implementations/duckdb/auditing.py +234 -0
  23. dve/core_engine/backends/implementations/duckdb/contract.py +213 -0
  24. dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +288 -0
  25. dve/core_engine/backends/implementations/duckdb/readers/__init__.py +13 -0
  26. dve/core_engine/backends/implementations/duckdb/readers/csv.py +222 -0
  27. dve/core_engine/backends/implementations/duckdb/readers/json.py +50 -0
  28. dve/core_engine/backends/implementations/duckdb/readers/xml.py +45 -0
  29. dve/core_engine/backends/implementations/duckdb/reference_data.py +49 -0
  30. dve/core_engine/backends/implementations/duckdb/rules.py +534 -0
  31. dve/core_engine/backends/implementations/duckdb/types.py +47 -0
  32. dve/core_engine/backends/implementations/duckdb/utilities.py +41 -0
  33. dve/core_engine/backends/implementations/spark/__init__.py +22 -0
  34. dve/core_engine/backends/implementations/spark/auditing.py +230 -0
  35. dve/core_engine/backends/implementations/spark/backend.py +78 -0
  36. dve/core_engine/backends/implementations/spark/contract.py +241 -0
  37. dve/core_engine/backends/implementations/spark/readers/__init__.py +15 -0
  38. dve/core_engine/backends/implementations/spark/readers/csv.py +77 -0
  39. dve/core_engine/backends/implementations/spark/readers/json.py +66 -0
  40. dve/core_engine/backends/implementations/spark/readers/xml.py +202 -0
  41. dve/core_engine/backends/implementations/spark/reference_data.py +42 -0
  42. dve/core_engine/backends/implementations/spark/rules.py +430 -0
  43. dve/core_engine/backends/implementations/spark/spark_helpers.py +412 -0
  44. dve/core_engine/backends/implementations/spark/types.py +21 -0
  45. dve/core_engine/backends/implementations/spark/utilities.py +144 -0
  46. dve/core_engine/backends/metadata/__init__.py +47 -0
  47. dve/core_engine/backends/metadata/contract.py +80 -0
  48. dve/core_engine/backends/metadata/reporting.py +374 -0
  49. dve/core_engine/backends/metadata/rules.py +737 -0
  50. dve/core_engine/backends/readers/__init__.py +41 -0
  51. dve/core_engine/backends/readers/csv.py +232 -0
  52. dve/core_engine/backends/readers/utilities.py +21 -0
  53. dve/core_engine/backends/readers/xml.py +432 -0
  54. dve/core_engine/backends/readers/xml_linting.py +142 -0
  55. dve/core_engine/backends/types.py +26 -0
  56. dve/core_engine/backends/utilities.py +177 -0
  57. dve/core_engine/configuration/__init__.py +1 -0
  58. dve/core_engine/configuration/base.py +56 -0
  59. dve/core_engine/configuration/v1/__init__.py +351 -0
  60. dve/core_engine/configuration/v1/filters.py +60 -0
  61. dve/core_engine/configuration/v1/rule_stores/__init__.py +1 -0
  62. dve/core_engine/configuration/v1/rule_stores/models.py +57 -0
  63. dve/core_engine/configuration/v1/steps.py +365 -0
  64. dve/core_engine/constants.py +8 -0
  65. dve/core_engine/engine.py +265 -0
  66. dve/core_engine/exceptions.py +29 -0
  67. dve/core_engine/functions/__init__.py +6 -0
  68. dve/core_engine/functions/implementations.py +200 -0
  69. dve/core_engine/loggers.py +57 -0
  70. dve/core_engine/message.py +512 -0
  71. dve/core_engine/models.py +196 -0
  72. dve/core_engine/templating.py +114 -0
  73. dve/core_engine/type_hints.py +255 -0
  74. dve/core_engine/validation.py +160 -0
  75. dve/metadata_parser/__init__.py +2 -0
  76. dve/metadata_parser/domain_types.py +682 -0
  77. dve/metadata_parser/exc.py +44 -0
  78. dve/metadata_parser/function_library.py +64 -0
  79. dve/metadata_parser/function_wrapper.py +201 -0
  80. dve/metadata_parser/model_generator.py +119 -0
  81. dve/metadata_parser/models.py +410 -0
  82. dve/metadata_parser/utilities.py +54 -0
  83. dve/parser/__init__.py +1 -0
  84. dve/parser/exceptions.py +50 -0
  85. dve/parser/file_handling/__init__.py +31 -0
  86. dve/parser/file_handling/helpers.py +29 -0
  87. dve/parser/file_handling/implementations/__init__.py +7 -0
  88. dve/parser/file_handling/implementations/base.py +97 -0
  89. dve/parser/file_handling/implementations/dbfs.py +81 -0
  90. dve/parser/file_handling/implementations/file.py +203 -0
  91. dve/parser/file_handling/implementations/s3.py +371 -0
  92. dve/parser/file_handling/log_handler.py +215 -0
  93. dve/parser/file_handling/service.py +441 -0
  94. dve/parser/file_handling/utilities.py +53 -0
  95. dve/parser/type_hints.py +46 -0
  96. dve/parser/utilities.py +113 -0
  97. dve/pipeline/__init__.py +0 -0
  98. dve/pipeline/duckdb_pipeline.py +56 -0
  99. dve/pipeline/foundry_ddb_pipeline.py +171 -0
  100. dve/pipeline/pipeline.py +935 -0
  101. dve/pipeline/spark_pipeline.py +69 -0
  102. dve/pipeline/utils.py +96 -0
  103. dve/reporting/__init__.py +1 -0
  104. dve/reporting/error_report.py +153 -0
  105. dve/reporting/excel_report.py +319 -0
@@ -0,0 +1,534 @@
1
+ """Business rule definitions for duckdb backend"""
2
+
3
+ from collections.abc import Callable
4
+ from typing import get_type_hints
5
+ from uuid import uuid4
6
+
7
+ from duckdb import (
8
+ ColumnExpression,
9
+ ConstantExpression,
10
+ DuckDBPyConnection,
11
+ DuckDBPyRelation,
12
+ StarExpression,
13
+ )
14
+ from duckdb.typing import DuckDBPyType
15
+
16
+ from dve.core_engine.backends.base.rules import (
17
+ BaseStepImplementations,
18
+ ColumnAddition,
19
+ ColumnRemoval,
20
+ SelectColumns,
21
+ )
22
+ from dve.core_engine.backends.exceptions import ConstraintError
23
+ from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import (
24
+ DDBStruct,
25
+ duckdb_read_parquet,
26
+ duckdb_rel_to_dictionaries,
27
+ duckdb_write_parquet,
28
+ get_all_registered_udfs,
29
+ get_duckdb_type_from_annotation,
30
+ )
31
+ from dve.core_engine.backends.implementations.duckdb.types import (
32
+ DuckDBEntities,
33
+ Joined,
34
+ Source,
35
+ Target,
36
+ )
37
+ from dve.core_engine.backends.implementations.duckdb.utilities import parse_multiple_expressions
38
+ from dve.core_engine.backends.metadata.rules import (
39
+ AbstractConditionalJoin,
40
+ AbstractNewColumnConditionalJoin,
41
+ Aggregation,
42
+ AntiJoin,
43
+ ConfirmJoinHasMatch,
44
+ HeaderJoin,
45
+ ImmediateFilter,
46
+ InnerJoin,
47
+ LeftJoin,
48
+ Notification,
49
+ OneToOneJoin,
50
+ OrphanIdentification,
51
+ SemiJoin,
52
+ TableUnion,
53
+ )
54
+ from dve.core_engine.constants import ROWID_COLUMN_NAME
55
+ from dve.core_engine.functions import implementations as functions
56
+ from dve.core_engine.message import FeedbackMessage
57
+ from dve.core_engine.templating import template_object
58
+ from dve.core_engine.type_hints import Messages
59
+
60
+
61
@duckdb_write_parquet
@duckdb_read_parquet
class DuckDBStepImplementations(BaseStepImplementations[DuckDBPyRelation]):
    """An implementation of transformation steps in duckdb."""

    def __init__(self, connection: DuckDBPyConnection, **kwargs):
        """Bind a duckdb connection and record the UDFs it already exposes."""
        self._connection = connection
        # Names of UDFs already registered on this connection.
        self.registered_functions = get_all_registered_udfs(self._connection)
        super().__init__(**kwargs)

    @property
    def connection(self) -> DuckDBPyConnection:
        """The duckdb connection"""
        return self._connection
75
+
76
+ @classmethod
77
+ def register_udfs( # type: ignore
78
+ cls, connection: DuckDBPyConnection, **kwargs
79
+ ): # pylint: disable=arguments-differ
80
+ """Method to register all custom dve functions for use during business rules application"""
81
+ _registered_functions: set[str] = get_all_registered_udfs(connection)
82
+ _available_functions: dict[str, Callable] = {
83
+ func_name: func
84
+ for func_name, func in vars(functions).items()
85
+ if callable(func) and func.__module__ == "dve.core_engine.functions.implementations"
86
+ }
87
+
88
+ _unregistered_functions: set[str] = set(_available_functions).difference(
89
+ _registered_functions
90
+ )
91
+
92
+ for function_name in _unregistered_functions:
93
+ _func = _available_functions.get(function_name)
94
+ _annotations = get_type_hints(_func)
95
+ _return_type: DuckDBPyType = get_duckdb_type_from_annotation(_annotations["return"])
96
+
97
+ connection.create_function(
98
+ function_name,
99
+ _func, # type: ignore
100
+ return_type=str(_return_type), # type: ignore
101
+ )
102
+ if _unregistered_functions:
103
+ _sql = "INSERT INTO dve_udfs (function_name) VALUES" + ",".join(
104
+ [f"('{f_name}',)" for f_name in _unregistered_functions]
105
+ ) # pylint: disable=line-too-long
106
+ connection.sql(_sql)
107
+ return cls(connection=connection, **kwargs)
108
+
109
+ @staticmethod
110
+ def add_row_id(entity: DuckDBPyRelation) -> DuckDBPyRelation:
111
+ """Adds a row identifier to the Relation"""
112
+ if ROWID_COLUMN_NAME not in entity.columns:
113
+ entity = entity.project(f"*, ROW_NUMBER() OVER () as {ROWID_COLUMN_NAME}")
114
+ return entity
115
+
116
+ @staticmethod
117
+ def drop_row_id(entity: DuckDBPyRelation) -> DuckDBPyRelation:
118
+ """Drops the row identiifer from a Relation"""
119
+ if ROWID_COLUMN_NAME in entity.columns:
120
+ entity = entity.select(StarExpression(exclude=[ROWID_COLUMN_NAME]))
121
+ return entity
122
+
123
+ def add(self, entities: DuckDBEntities, *, config: ColumnAddition) -> Messages:
124
+ """A transformation step which adds a column to an entity."""
125
+ entity: DuckDBPyRelation = entities[config.entity_name]
126
+ entity = entity.select(f"*, {config.expression} as {config.column_name}")
127
+ entities[config.new_entity_name or config.entity_name] = entity
128
+ return []
129
+
130
+ def remove(self, entities: DuckDBEntities, *, config: ColumnRemoval) -> Messages:
131
+ """A transformation step which removes a column from an entity."""
132
+ entity: DuckDBPyRelation = entities[config.entity_name]
133
+ entity = entity.select(StarExpression(exclude=[config.column_name]))
134
+ entities[config.new_entity_name or config.entity_name] = entity
135
+ return []
136
+
137
+ def select(self, entities: DuckDBEntities, *, config: SelectColumns) -> Messages:
138
+ """A transformation step which selects columns from an entity."""
139
+ entity: DuckDBPyRelation = entities[config.entity_name]
140
+ entity = entity.select(", ".join(parse_multiple_expressions(config.columns)))
141
+ entities[config.new_entity_name or config.entity_name] = entity
142
+ if config.distinct:
143
+ entity = entity.distinct()
144
+ entities[config.new_entity_name or config.entity_name] = entity
145
+ return []
146
+
147
+ def group_by(self, entities: DuckDBEntities, *, config: Aggregation) -> Messages:
148
+ """A transformation step which performs an aggregation on an entity."""
149
+
150
+ def _add_cnst_field(rel: DuckDBPyRelation) -> tuple[str, DuckDBPyRelation]:
151
+ """Add a constant field for use as an index to allow for pivoting with no group"""
152
+ fld_name = f"fld_{uuid4().hex[0:8]}"
153
+ return fld_name, rel.select(
154
+ StarExpression(exclude=[]), ConstantExpression(1).alias(fld_name)
155
+ )
156
+
157
+ entity: DuckDBPyRelation = entities[config.entity_name]
158
+
159
+ group_cols = parse_multiple_expressions(config.group_by)
160
+ agg_cols = parse_multiple_expressions(config.agg_columns)
161
+
162
+ if config.pivot_column:
163
+ if not group_cols:
164
+ const_fld, entity = _add_cnst_field(entity)
165
+ group_pl = entity.pl().pivot(
166
+ columns=[config.pivot_column],
167
+ values=agg_cols,
168
+ index=(group_cols or [const_fld]),
169
+ aggregate_function=config.agg_function,
170
+ )
171
+ if const_fld in group_pl.columns:
172
+ group_pl = group_pl.drop(const_fld)
173
+ if config.pivot_values:
174
+ group_pl = group_pl.select(group_cols + config.pivot_values)
175
+ group = self._connection.sql("select * from group_pl")
176
+ else:
177
+ group = entity.aggregate(", ".join(group_cols + agg_cols))
178
+
179
+ entities[config.new_entity_name or config.entity_name] = group
180
+ return []
181
+
182
+ def _resolve_join_name_conflicts(
183
+ self, source_rel: Source, joined_rel: Joined, config: AbstractNewColumnConditionalJoin
184
+ ) -> Joined:
185
+ """Resolve name conflicts in joined DataFrames."""
186
+ # Need to ensure we keep source columns but these can be overridden by
187
+ # new computed keys.
188
+ # Start with new computed names (in ddb - earlier columns take preference)
189
+ columns = parse_multiple_expressions(config.new_columns)
190
+ columns.extend(
191
+ parse_multiple_expressions(
192
+ [f"{config.entity_name}.{column_name}" for column_name in source_rel.columns]
193
+ )
194
+ )
195
+ # Select them from the join. There may be duplicates here for overridden fields.
196
+ result_all_cols = joined_rel.select(", ".join(columns))
197
+
198
+ # Now need to handle the existence of dupes.
199
+ # we keep any specified in config.new_columns as duckdb can only access most left-hand
200
+ # field where name clashes.
201
+ # Need to be careful with case sensitivity - duckdb (by default) ignores case.
202
+ temp_column_names = []
203
+ concrete_to_temp_mapping = {}
204
+ case_mapping = {}
205
+ for index, column_name in enumerate(result_all_cols.columns):
206
+ temp_name = str(index)
207
+
208
+ # Duckdb case insensitive by default
209
+ # iterate through column names and keep only earliest casing
210
+ uppercase_name = column_name.upper()
211
+ if uppercase_name not in case_mapping:
212
+ case_mapping[uppercase_name] = column_name
213
+ concrete_to_temp_mapping[uppercase_name] = temp_name
214
+
215
+ # Store the temp name, and the mapping between uppercase name and temp name.
216
+ temp_column_names.append(temp_name)
217
+
218
+ # Rename with the indices, so we can deduplicate column names.
219
+ result_temp_names = result_all_cols.select(
220
+ *[
221
+ ColumnExpression(column_name).alias(str(index))
222
+ for index, column_name in enumerate(result_all_cols.columns)
223
+ ]
224
+ )
225
+ # Keep only the earliest column for each (case insensitive) column name.
226
+ earliest_temp_names = result_temp_names.select(
227
+ *[ColumnExpression(temp_name) for temp_name in concrete_to_temp_mapping.values()]
228
+ )
229
+ # Rename those fields to their 'proper' names (respect user-supplied case).
230
+ return earliest_temp_names.select(
231
+ *[
232
+ ColumnExpression(temp_name).alias(case_mapping[upp_name])
233
+ for upp_name, temp_name in concrete_to_temp_mapping.items()
234
+ ]
235
+ )
236
+
237
+ def _perform_join(
238
+ self, entities: DuckDBEntities, config: AbstractConditionalJoin
239
+ ) -> tuple[Source, Target, Joined]:
240
+ """Perform a conditional join between source and target, returning the
241
+ source, target and joined DataFrames.
242
+
243
+ """
244
+ source_rel: DuckDBPyRelation = entities[config.entity_name]
245
+ source_rel = source_rel.set_alias(config.entity_name)
246
+ target_rel: DuckDBPyRelation = entities[config.target_name]
247
+ target_rel = target_rel.set_alias(config.target_name)
248
+
249
+ if isinstance(config, InnerJoin):
250
+ join_type = "inner"
251
+ elif isinstance(config, SemiJoin):
252
+ join_type = "semi"
253
+ elif isinstance(config, AntiJoin):
254
+ join_type = "anti"
255
+ else:
256
+ join_type = "left"
257
+
258
+ joined_rel = source_rel.join(target_rel, condition=config.join_condition, how=join_type)
259
+
260
+ return source_rel, target_rel, joined_rel
261
+
262
+ def has_match(self, entities: DuckDBEntities, *, config: ConfirmJoinHasMatch) -> Messages:
263
+ """Add a boolean column to a source entity, indicating whether it matches
264
+ a target for the given condition.
265
+ """
266
+ source_rel, _, joined_rel = self._perform_join(entities, config)
267
+ entity = joined_rel.select(
268
+ f"*, COALESCE({config.join_condition}, FALSE) AS {config.column_name}"
269
+ )
270
+
271
+ if config.perform_integrity_check:
272
+ joined_count = joined_rel.count("*").fetchone()[0] # type: ignore
273
+ source_count = source_rel.count("*").fetchone()[0] # type: ignore
274
+ if joined_count != source_count:
275
+ raise ConstraintError(
276
+ f"Multiple matches for some records from {config.entity_name!r} for "
277
+ + f"condition {config.join_condition!r}",
278
+ constraint=(
279
+ f"records in source entity ({config.entity_name!r}) must match at most "
280
+ + f"a single record in the target ({config.target_name})"
281
+ ),
282
+ )
283
+
284
+ entities[config.new_entity_name or config.entity_name] = entity
285
+ return []
286
+
287
+ def left_join(self, entities: DuckDBEntities, *, config: LeftJoin) -> Messages:
288
+ """Perform a left join from a source entity to a target table, updating
289
+ the source entity or creating a new joined entity.
290
+
291
+ """
292
+ source_rel, _, joined_rel = self._perform_join(entities, config)
293
+
294
+ entities[config.new_entity_name or config.entity_name] = self._resolve_join_name_conflicts(
295
+ source_rel, joined_rel, config
296
+ )
297
+ return []
298
+
299
+ def inner_join(self, entities: DuckDBEntities, *, config: InnerJoin) -> Messages:
300
+ """Perform an inner join from a source entity to a target table, updating
301
+ the source entity or creating a new joined entity.
302
+
303
+ """
304
+ source_rel, _, joined_rel = self._perform_join(entities, config)
305
+ entities[config.new_entity_name or config.entity_name] = self._resolve_join_name_conflicts(
306
+ source_rel, joined_rel, config
307
+ )
308
+
309
+ return []
310
+
311
+ def one_to_one_join(self, entities: DuckDBEntities, *, config: OneToOneJoin) -> Messages:
312
+ """Perform a join from a source entity to a target table, updating
313
+ the source entity or creating a new joined entity.
314
+
315
+ This will be a left join that enforces a one-to-one relationship.
316
+
317
+ """
318
+ source_rel: DuckDBPyRelation = entities[config.entity_name]
319
+ messages = self.left_join(entities, config=config)
320
+ joined_rel: DuckDBPyRelation = entities[config.new_entity_name or config.entity_name]
321
+
322
+ if config.perform_integrity_check:
323
+ if (
324
+ joined_rel.count("*").fetchone()[0] != source_rel.count("*").fetchone()[0] # type: ignore # pylint: disable=line-too-long
325
+ ):
326
+ raise ConstraintError(
327
+ f"Multiple matches for some records from {config.entity_name!r} for "
328
+ + f"condition {config.join_condition!r}",
329
+ constraint=(
330
+ f"records in source entity ({config.entity_name!r}) must match at most "
331
+ + f"a single record in the target ({config.target_name})"
332
+ ),
333
+ )
334
+ return messages
335
+
336
+ def semi_join(self, entities: DuckDBEntities, *, config: SemiJoin) -> Messages:
337
+ """Perform a semi join from a source entity to a target table, updating
338
+ the source entity or creating a new joined entity.
339
+
340
+ """
341
+ _, _, joined_rel = self._perform_join(entities, config)
342
+
343
+ entities[config.new_entity_name or config.entity_name] = joined_rel
344
+ return []
345
+
346
+ def anti_join(self, entities: DuckDBEntities, *, config: AntiJoin) -> Messages:
347
+ """Perform an anti join from a source entity to a target table, updating
348
+ the source entity or creating a new joined entity.
349
+
350
+ """
351
+ _, _, joined_rel = self._perform_join(entities, config)
352
+
353
+ entities[config.new_entity_name or config.entity_name] = joined_rel
354
+ return []
355
+
356
+ def join_header(self, entities: DuckDBEntities, *, config: HeaderJoin) -> Messages:
357
+ """Add a 'header' entity to each row in the source entity. The header entity
358
+ must contain only a single record.
359
+
360
+ """
361
+ source_rel: DuckDBPyRelation = entities[config.entity_name]
362
+ source_rel = source_rel.set_alias(config.entity_name)
363
+ target_rel: DuckDBPyRelation = entities[config.target_name]
364
+ target_rel = target_rel.set_alias(config.target_name)
365
+
366
+ target_rows = target_rel.pl().to_struct("header").to_list()
367
+ n_target_rows = len(target_rows)
368
+ if n_target_rows != 1:
369
+ raise ConstraintError(
370
+ f"Unable to join header {config.target_name!r} to {config.entity_name!r} "
371
+ + f"as it contains multiple entries (expected 1, got {n_target_rows})",
372
+ constraint=(
373
+ f"Header entity {config.target_name!r} must contain a single record "
374
+ + f"(contains {n_target_rows} records)"
375
+ ),
376
+ )
377
+
378
+ target_schema = DDBStruct(dict(zip(target_rel.columns, target_rel.dtypes)))()
379
+
380
+ joined_rel = source_rel.select(
381
+ StarExpression(exclude=[]),
382
+ ConstantExpression(target_rows[0]).cast(target_schema).alias(config.header_column_name),
383
+ )
384
+
385
+ entities[config.new_entity_name or config.entity_name] = joined_rel
386
+ return []
387
+
388
+ def identify_orphans(
389
+ self, entities: DuckDBEntities, *, config: OrphanIdentification
390
+ ) -> Messages:
391
+ """Identify records in an entity which don't have at least one corresponding
392
+ match in the target. A new boolean column will be added to `entity` ('IsOrphaned')
393
+ indicating whether the condition matched.
394
+
395
+ If there is already an 'IsOrphaned' column in the entity, this will be set to the
396
+ logical OR of its current value and the value it would have been set to otherwise.
397
+
398
+ """
399
+ source_rel: DuckDBPyRelation = entities[config.entity_name]
400
+ source_rel = source_rel.set_alias(config.entity_name)
401
+ target_rel: DuckDBPyRelation = entities[config.target_name]
402
+ target_rel = target_rel.set_alias(config.target_name)
403
+
404
+ key_name = f"key_{uuid4().hex}"
405
+ source_rel = source_rel.select(f"*, row_number() over () as {key_name}").set_alias(
406
+ config.entity_name
407
+ )
408
+ match_name = f"matched_{uuid4().hex}"
409
+ target_rel = target_rel.select(
410
+ StarExpression(exclude=[]), ConstantExpression(1).alias(match_name)
411
+ ).set_alias(config.target_name)
412
+
413
+ joined_rel: DuckDBPyRelation = source_rel.join(
414
+ target_rel, condition=config.join_condition, how="left"
415
+ ).aggregate(f"{key_name}, coalesce(count({match_name})==0, TRUE) AS IsOrphaned")
416
+
417
+ if "IsOrphaned" not in source_rel.columns:
418
+ result: DuckDBPyRelation = source_rel.join(
419
+ joined_rel, condition=key_name, how="left"
420
+ ).select(StarExpression(exclude=[key_name]))
421
+ else:
422
+ result = source_rel.set_alias("source").join(
423
+ joined_rel.set_alias("joined"),
424
+ condition=f"source.{key_name} = joined.{key_name}",
425
+ how="left",
426
+ )
427
+
428
+ columns = {name: f"source.{name}" for name in source_rel.columns}
429
+ if "IsOrphaned" in source_rel.columns:
430
+ columns["IsOrphaned"] = ColumnExpression("source.IsOrphaned") | ColumnExpression("joined.IsOrphaned") # type: ignore # pylint: disable=line-too-long
431
+ columns.pop(key_name, None)
432
+
433
+ result = result.select(
434
+ ",".join([f"{column} as {name}" for name, column in columns.items()])
435
+ )
436
+
437
+ entities[config.new_entity_name or config.entity_name] = result
438
+ return []
439
+
440
+ def union(self, entities: DuckDBEntities, *, config: TableUnion) -> Messages:
441
+ """Union two entities together, taking the columns from each by name.
442
+
443
+ Where columns have the same name, they must be the same type or coerceable.
444
+ Where column casing differs, the casing from the `source` entity will be kept.
445
+
446
+ Column order will be preserved, with columns from `source` taken first and extra
447
+ columns in `target` added in order afterwards.
448
+
449
+ """
450
+ source_rel: DuckDBPyRelation = entities[config.entity_name]
451
+ source_rel = source_rel.set_alias(config.entity_name)
452
+ target_rel: DuckDBPyRelation = entities[config.target_name]
453
+ target_rel = target_rel.set_alias(config.target_name)
454
+
455
+ # Ensure all keys are present in both
456
+ source_names = {column_name.upper(): column_name for column_name in source_rel.columns}
457
+ target_names = {column_name.upper(): column_name for column_name in target_rel.columns}
458
+
459
+ all_names = list(source_names.keys())
460
+ for name in target_names:
461
+ if name not in source_names:
462
+ all_names.append(name)
463
+
464
+ source_columns, target_columns = [], []
465
+ for uppercase_name in all_names:
466
+ source_name = source_names.get(uppercase_name)
467
+ target_name = target_names.get(uppercase_name)
468
+
469
+ if source_name and target_name:
470
+ source_col = ColumnExpression(source_name)
471
+ target_col = ColumnExpression(target_name).alias(source_name)
472
+ elif source_name:
473
+ source_col = ColumnExpression(source_name)
474
+ target_col = ConstantExpression(None).alias(source_name)
475
+ elif target_name:
476
+ source_col = ConstantExpression(None).alias(target_name)
477
+ target_col = ColumnExpression(target_name)
478
+ else:
479
+ continue
480
+
481
+ source_columns.append(source_col)
482
+ target_columns.append(target_col)
483
+
484
+ source_rel = source_rel.select(*source_columns)
485
+ target_rel = target_rel.select(*target_columns)
486
+ entities[config.new_entity_name or config.entity_name] = source_rel.union(target_rel)
487
+ return []
488
+
489
+ def filter(self, entities: DuckDBEntities, *, config: ImmediateFilter) -> Messages:
490
+ """Filter an entity immediately, and do not emit any messages.
491
+
492
+ The synchronised filter stage will be implemented separately.
493
+
494
+ """
495
+ entity = entities[config.entity_name]
496
+ entity = entity.filter(config.expression)
497
+ entities[config.new_entity_name or config.entity_name] = entity
498
+ return []
499
+
500
+ def notify(self, entities: DuckDBEntities, *, config: Notification) -> Messages:
501
+ """Emit a notification based on an expression. Where the expression is truthy,
502
+ a nofication should be emitted according to the reporting config.
503
+
504
+ This is not intended to be used directly, but is used in the implementation of
505
+ the sync filters.
506
+
507
+ """
508
+ messages: Messages = []
509
+ entity = entities[config.entity_name]
510
+
511
+ matched = entity.filter(config.expression)
512
+ if config.excluded_columns:
513
+ matched = matched.select(StarExpression(exclude=config.excluded_columns))
514
+
515
+ for record in duckdb_rel_to_dictionaries(matched):
516
+ # NOTE: only templates using values directly accessible in record - nothing nested
517
+ # more complex extraction done in reporting module
518
+ messages.append(
519
+ FeedbackMessage(
520
+ entity=config.reporting.reporting_entity_override or config.entity_name,
521
+ original_entity=config.entity_name,
522
+ record=record, # type: ignore
523
+ error_location=config.reporting.legacy_location,
524
+ error_message=template_object(config.reporting.message, record), # type: ignore
525
+ failure_type=config.reporting.legacy_error_type,
526
+ error_type=config.reporting.legacy_error_type,
527
+ error_code=config.reporting.code,
528
+ reporting_field=config.reporting.legacy_reporting_field,
529
+ reporting_field_name=config.reporting.reporting_field_override,
530
+ is_informational=config.reporting.emit in ("warning", "info"),
531
+ category=config.reporting.category,
532
+ )
533
+ )
534
+ return messages
@@ -0,0 +1,47 @@
1
"""Types used in the duckdb implementation.

FIX: the module and trailing alias docstrings previously said "Spark" - a
copy-paste from the Spark types module - despite every type here being a
duckdb one.
"""

# pylint: disable=C0103
from collections.abc import MutableMapping

from duckdb import DuckDBPyRelation
from typing_extensions import Literal

SQLType = Literal[
    "BIGINT",
    "BIT",
    "BLOB",
    "BOOLEAN",
    "DATE",
    "DECIMAL",
    "DOUBLE",
    "HUGEINT",
    "INTEGER",
    "INTERVAL",
    "REAL",
    "SMALLINT",
    "TIME",
    "UBIGINT",
    "UHUGEINT",
    "UINTEGER",
    "USMALLINT",
    "UTINYINT",
    "UUID",
    "VARCHAR",
]
"""SQL types recognised in duckdb"""
# NOTE(review): TINYINT and TIMESTAMP are absent from this Literal although
# their unsigned/related forms are listed - confirm whether that is deliberate.

Source = DuckDBPyRelation
"""The source entity for a join. This will be aliased to the source entity name."""
Target = DuckDBPyRelation
"""The target entity for a join. This will be aliased to the target entity name."""
Joined = DuckDBPyRelation
"""
The joined entity.

This will be able to reference source and target columns by their aliased names.

"""
DuckDBEntities = MutableMapping[EntityName, DuckDBPyRelation]
"""The type of a mapping of entity name to duckdb relation."""
@@ -0,0 +1,41 @@
1
+ """Utility objects for use with duckdb backend"""
2
+
3
+ import itertools
4
+
5
+ from dve.core_engine.backends.base.utilities import _split_multiexpr_string
6
+
7
+
8
def parse_multiple_expressions(expressions) -> list[str]:
    """Break multiple expressions into a list of expressions.

    Dispatches on the input shape: a mapping of expression -> alias, a list of
    (possibly compound) expression strings, or a single compound expression
    string. Any other value yields an empty list.
    """
    if isinstance(expressions, dict):
        return expr_mapping_to_columns(expressions)
    if isinstance(expressions, list):
        return expr_array_to_columns(expressions)
    if isinstance(expressions, str):
        return multiexpr_string_to_columns(expressions)
    return []
17
+
18
+
19
def expr_mapping_to_columns(expressions: dict) -> list[str]:
    """Render an expression -> alias mapping as duckdb "expr as alias" strings."""
    return [f"{expression} as {alias}" for expression, alias in expressions.items()]
25
+
26
+
27
def expr_array_to_columns(expressions: list[str]) -> list[str]:
    """Flatten a list of (possibly compound) expression strings into single
    duckdb column expressions.
    """
    columns: list[str] = []
    for expression in expressions:
        columns.extend(_split_multiexpr_string(expression))
    return columns
34
+
35
+
36
def multiexpr_string_to_columns(expressions: str) -> list[str]:
    """Split a string containing multiple expressions into a list of duckdb
    column expressions.
    """
    return expr_array_to_columns(_split_multiexpr_string(expressions))
@@ -0,0 +1,22 @@
1
"""Implementation of the Apache Spark backend."""

from dve.core_engine.backends.readers import register_reader

from .backend import SparkBackend
from .contract import SparkDataContract
from .readers import SparkCSVReader, SparkJSONReader, SparkXMLReader, SparkXMLStreamReader
from .reference_data import SparkRefDataLoader
from .rules import SparkStepImplementations

# Make the Spark readers discoverable via the shared reader registry.
for _reader in (SparkCSVReader, SparkJSONReader, SparkXMLReader, SparkXMLStreamReader):
    register_reader(_reader)

__all__ = [
    "SparkBackend",
    "SparkDataContract",
    "SparkRefDataLoader",
    "SparkStepImplementations",
]