data-validation-engine 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_validation_engine-0.6.2.dist-info/METADATA +104 -0
- data_validation_engine-0.6.2.dist-info/RECORD +105 -0
- data_validation_engine-0.6.2.dist-info/WHEEL +4 -0
- data_validation_engine-0.6.2.dist-info/licenses/LICENSE +21 -0
- dve/__init__.py +0 -0
- dve/common/__init__.py +0 -0
- dve/common/error_utils.py +189 -0
- dve/core_engine/__init__.py +0 -0
- dve/core_engine/backends/__init__.py +1 -0
- dve/core_engine/backends/base/__init__.py +1 -0
- dve/core_engine/backends/base/auditing.py +618 -0
- dve/core_engine/backends/base/backend.py +240 -0
- dve/core_engine/backends/base/contract.py +454 -0
- dve/core_engine/backends/base/core.py +124 -0
- dve/core_engine/backends/base/reader.py +176 -0
- dve/core_engine/backends/base/reference_data.py +217 -0
- dve/core_engine/backends/base/rules.py +685 -0
- dve/core_engine/backends/base/utilities.py +146 -0
- dve/core_engine/backends/exceptions.py +311 -0
- dve/core_engine/backends/implementations/__init__.py +1 -0
- dve/core_engine/backends/implementations/duckdb/__init__.py +26 -0
- dve/core_engine/backends/implementations/duckdb/auditing.py +234 -0
- dve/core_engine/backends/implementations/duckdb/contract.py +213 -0
- dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +288 -0
- dve/core_engine/backends/implementations/duckdb/readers/__init__.py +13 -0
- dve/core_engine/backends/implementations/duckdb/readers/csv.py +222 -0
- dve/core_engine/backends/implementations/duckdb/readers/json.py +50 -0
- dve/core_engine/backends/implementations/duckdb/readers/xml.py +45 -0
- dve/core_engine/backends/implementations/duckdb/reference_data.py +49 -0
- dve/core_engine/backends/implementations/duckdb/rules.py +534 -0
- dve/core_engine/backends/implementations/duckdb/types.py +47 -0
- dve/core_engine/backends/implementations/duckdb/utilities.py +41 -0
- dve/core_engine/backends/implementations/spark/__init__.py +22 -0
- dve/core_engine/backends/implementations/spark/auditing.py +230 -0
- dve/core_engine/backends/implementations/spark/backend.py +78 -0
- dve/core_engine/backends/implementations/spark/contract.py +241 -0
- dve/core_engine/backends/implementations/spark/readers/__init__.py +15 -0
- dve/core_engine/backends/implementations/spark/readers/csv.py +77 -0
- dve/core_engine/backends/implementations/spark/readers/json.py +66 -0
- dve/core_engine/backends/implementations/spark/readers/xml.py +202 -0
- dve/core_engine/backends/implementations/spark/reference_data.py +42 -0
- dve/core_engine/backends/implementations/spark/rules.py +430 -0
- dve/core_engine/backends/implementations/spark/spark_helpers.py +412 -0
- dve/core_engine/backends/implementations/spark/types.py +21 -0
- dve/core_engine/backends/implementations/spark/utilities.py +144 -0
- dve/core_engine/backends/metadata/__init__.py +47 -0
- dve/core_engine/backends/metadata/contract.py +80 -0
- dve/core_engine/backends/metadata/reporting.py +374 -0
- dve/core_engine/backends/metadata/rules.py +737 -0
- dve/core_engine/backends/readers/__init__.py +41 -0
- dve/core_engine/backends/readers/csv.py +232 -0
- dve/core_engine/backends/readers/utilities.py +21 -0
- dve/core_engine/backends/readers/xml.py +432 -0
- dve/core_engine/backends/readers/xml_linting.py +142 -0
- dve/core_engine/backends/types.py +26 -0
- dve/core_engine/backends/utilities.py +177 -0
- dve/core_engine/configuration/__init__.py +1 -0
- dve/core_engine/configuration/base.py +56 -0
- dve/core_engine/configuration/v1/__init__.py +351 -0
- dve/core_engine/configuration/v1/filters.py +60 -0
- dve/core_engine/configuration/v1/rule_stores/__init__.py +1 -0
- dve/core_engine/configuration/v1/rule_stores/models.py +57 -0
- dve/core_engine/configuration/v1/steps.py +365 -0
- dve/core_engine/constants.py +8 -0
- dve/core_engine/engine.py +265 -0
- dve/core_engine/exceptions.py +29 -0
- dve/core_engine/functions/__init__.py +6 -0
- dve/core_engine/functions/implementations.py +200 -0
- dve/core_engine/loggers.py +57 -0
- dve/core_engine/message.py +512 -0
- dve/core_engine/models.py +196 -0
- dve/core_engine/templating.py +114 -0
- dve/core_engine/type_hints.py +255 -0
- dve/core_engine/validation.py +160 -0
- dve/metadata_parser/__init__.py +2 -0
- dve/metadata_parser/domain_types.py +682 -0
- dve/metadata_parser/exc.py +44 -0
- dve/metadata_parser/function_library.py +64 -0
- dve/metadata_parser/function_wrapper.py +201 -0
- dve/metadata_parser/model_generator.py +119 -0
- dve/metadata_parser/models.py +410 -0
- dve/metadata_parser/utilities.py +54 -0
- dve/parser/__init__.py +1 -0
- dve/parser/exceptions.py +50 -0
- dve/parser/file_handling/__init__.py +31 -0
- dve/parser/file_handling/helpers.py +29 -0
- dve/parser/file_handling/implementations/__init__.py +7 -0
- dve/parser/file_handling/implementations/base.py +97 -0
- dve/parser/file_handling/implementations/dbfs.py +81 -0
- dve/parser/file_handling/implementations/file.py +203 -0
- dve/parser/file_handling/implementations/s3.py +371 -0
- dve/parser/file_handling/log_handler.py +215 -0
- dve/parser/file_handling/service.py +441 -0
- dve/parser/file_handling/utilities.py +53 -0
- dve/parser/type_hints.py +46 -0
- dve/parser/utilities.py +113 -0
- dve/pipeline/__init__.py +0 -0
- dve/pipeline/duckdb_pipeline.py +56 -0
- dve/pipeline/foundry_ddb_pipeline.py +171 -0
- dve/pipeline/pipeline.py +935 -0
- dve/pipeline/spark_pipeline.py +69 -0
- dve/pipeline/utils.py +96 -0
- dve/reporting/__init__.py +1 -0
- dve/reporting/error_report.py +153 -0
- dve/reporting/excel_report.py +319 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Models for components in the rule stores."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Optional, Union
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
from typing_extensions import Annotated, Literal
|
|
7
|
+
|
|
8
|
+
from dve.core_engine.configuration.v1.filters import FilterConfigUnion
|
|
9
|
+
from dve.core_engine.configuration.v1.steps import StepConfigUnion
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class BusinessSpecConfig(BaseModel):
    """A business rule or filter within the config.

    Base class for the rule-store component specs. The `type` field is the
    discriminator used by `BusinessComponentSpecConfigUnion` to pick the
    concrete subclass.
    """

    type: str
    """The type of business rule."""

    description: Optional[str] = None
    """A description of what the rule/filter should do."""
    parameter_descriptions: dict[str, str] = Field(default_factory=dict)
    """Descriptions of parameters used by the rule."""
    parameter_defaults: dict[str, Any] = Field(default_factory=dict)
    """Default parameters to be used by the rule if no param is passed."""
|
|
25
|
+
|
|
26
|
+
class BusinessFilterSpecConfig(BusinessSpecConfig):
    """A business filter within the rule store."""

    # Fixed literal: selects this class in the discriminated union.
    type: Literal["filter"]

    rule_config: FilterConfigUnion
    """The configuration for the filter."""
|
|
34
|
+
|
|
35
|
+
class ComplexRuleConfig(BaseModel):
    """The rule config for a business rule."""

    # NOTE(review): the field names suggest `rules` run before `filters` and
    # `post_filter_rules` after them — confirm the execution order in the engine.
    rules: list[StepConfigUnion] = Field(default_factory=list)
    filters: list[FilterConfigUnion] = Field(default_factory=list)
    post_filter_rules: list[StepConfigUnion] = Field(default_factory=list)
|
|
42
|
+
|
|
43
|
+
class BusinessRuleSpecConfig(BusinessSpecConfig):
    """A business rule within the rule store."""

    # Fixed literal: selects this class in the discriminated union.
    type: Literal["complex_rule"]

    rule_config: ComplexRuleConfig
    """The configuration for the rule."""
    dependencies: list[str] = Field(default_factory=list)
    """The dependencies for the business rule."""
|
|
53
|
+
|
|
54
|
+
# pydantic resolves the concrete model class from the `type` field at parse time.
BusinessComponentSpecConfigUnion = Annotated[
    Union[BusinessFilterSpecConfig, BusinessRuleSpecConfig], Field(discriminator="type")
]
"""A union of the different business component types."""
|
@@ -0,0 +1,365 @@
|
|
|
1
|
+
"""Very basic configuration options for steps.
|
|
2
|
+
|
|
3
|
+
These (mostly) only differ slightly from the metadata steps,
|
|
4
|
+
but there's some repetition here to make it possible to change
|
|
5
|
+
the metadata steps without altering the config classes.
|
|
6
|
+
|
|
7
|
+
N.B. These are quite coarsely copied from the JSON schema.
|
|
8
|
+
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
# pylint: disable=missing-class-docstring
|
|
12
|
+
from abc import ABC, abstractmethod
|
|
13
|
+
from typing import Any, Optional, Union
|
|
14
|
+
|
|
15
|
+
from pydantic import BaseModel, Extra, Field, validator
|
|
16
|
+
from typing_extensions import Annotated, Literal
|
|
17
|
+
|
|
18
|
+
from dve.core_engine.backends.metadata.rules import (
|
|
19
|
+
AbstractStep,
|
|
20
|
+
Aggregation,
|
|
21
|
+
AntiJoin,
|
|
22
|
+
ColumnAddition,
|
|
23
|
+
ColumnRemoval,
|
|
24
|
+
ConfirmJoinHasMatch,
|
|
25
|
+
CopyEntity,
|
|
26
|
+
EntityRemoval,
|
|
27
|
+
HeaderJoin,
|
|
28
|
+
ImmediateFilter,
|
|
29
|
+
InnerJoin,
|
|
30
|
+
LeftJoin,
|
|
31
|
+
OneToOneJoin,
|
|
32
|
+
RenameEntity,
|
|
33
|
+
SelectColumns,
|
|
34
|
+
SemiJoin,
|
|
35
|
+
TableUnion,
|
|
36
|
+
)
|
|
37
|
+
from dve.core_engine.type_hints import MultipleExpressions
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class ConfigStep(BaseModel, ABC):
    """The parent for the config steps.

    Subclasses narrow `operation` to a `Literal` value, which acts as the
    discriminator in `StepConfigUnion`.
    """

    class Config:  # pylint: disable=too-few-public-methods
        """Config class for dynamically generated pydantic models"""

        # Reject unknown keys so typos in step configs fail loudly.
        extra = Extra.forbid

    name: Optional[str] = None
    """The 'name' of the rule. This is mapped to an ID in the entity."""
    operation: str
    """The operation implemented by the step."""

    @abstractmethod
    def to_step(self) -> AbstractStep:
        """Convert the config step definition to a 'real' metadata step."""
|
|
57
|
+
|
|
58
|
+
class AddConfig(ConfigStep):
    """Config step that adds a single derived column to an entity."""

    operation: Literal["add"]

    entity: str
    new_entity_name: Optional[str] = None
    column_name: str
    expression: str

    def to_step(self) -> AbstractStep:
        """Build the `ColumnAddition` metadata step from this config."""
        step_kwargs = {
            "id": self.name,
            "entity_name": self.entity,
            "new_entity_name": self.new_entity_name,
            "column_name": self.column_name,
            "expression": self.expression,
        }
        return ColumnAddition(**step_kwargs)
|
|
78
|
+
|
|
79
|
+
class RemoveConfig(ConfigStep):
    """Config step that drops a single column from an entity."""

    operation: Literal["remove"]

    entity: str
    new_entity_name: Optional[str] = None
    column_name: str

    def to_step(self) -> AbstractStep:
        """Build the `ColumnRemoval` metadata step from this config."""
        step_kwargs = {
            "id": self.name,
            "entity_name": self.entity,
            "new_entity_name": self.new_entity_name,
            "column_name": self.column_name,
        }
        return ColumnRemoval(**step_kwargs)
|
|
97
|
+
|
|
98
|
+
class GroupByConfig(ConfigStep):
    """Configuration step for performing a GROUP BY operation."""

    operation: Literal["group_by"]

    entity: str
    new_entity_name: Optional[str] = None
    group_by: MultipleExpressions
    pivot_column: Optional[str] = None
    pivot_values: Optional[list[str]] = None
    agg_columns: MultipleExpressions

    @validator("pivot_values")
    @classmethod
    def _ensure_no_values_if_not_column(
        cls, value: Optional[list[str]], values: dict[str, Any]
    ) -> Optional[list[str]]:
        """Reject `pivot_values` unless `pivot_column` was also provided.

        Uses `values.get(...)` rather than `values[...]`: in pydantic v1,
        fields that failed their own validation are absent from `values`,
        and indexing would raise an opaque `KeyError` instead of a clean
        validation error.
        """
        if value and not values.get("pivot_column"):
            raise ValueError("Cannot provide 'pivot_values' if no 'pivot_column'")
        return value

    def to_step(self) -> AbstractStep:
        """Takes a config object and returns a step object"""
        return Aggregation(
            id=self.name,
            entity_name=self.entity,
            new_entity_name=self.new_entity_name,
            group_by=self.group_by,
            pivot_column=self.pivot_column,
            pivot_values=self.pivot_values,
            agg_columns=self.agg_columns,
        )
|
|
129
|
+
|
|
130
|
+
class SelectConfig(ConfigStep):
    """Config step that projects (optionally distinct) columns from an entity."""

    operation: Literal["select"]

    entity: str
    new_entity_name: Optional[str] = None
    columns: MultipleExpressions
    distinct: bool = False

    def to_step(self) -> AbstractStep:
        """Build the `SelectColumns` metadata step from this config."""
        step_kwargs = {
            "id": self.name,
            "entity_name": self.entity,
            "new_entity_name": self.new_entity_name,
            "columns": self.columns,
            "distinct": self.distinct,
        }
        return SelectColumns(**step_kwargs)
|
|
150
|
+
|
|
151
|
+
class RenameEntityConfig(ConfigStep):
    """Config step that renames an entity."""

    operation: Literal["rename_entity"]

    entity: str
    new_entity_name: str

    def to_step(self) -> AbstractStep:
        """Build the `RenameEntity` metadata step from this config."""
        current_name = self.entity
        renamed_to = self.new_entity_name
        return RenameEntity(id=self.name, entity_name=current_name, new_entity_name=renamed_to)
|
|
167
|
+
|
|
168
|
+
class NonNotifyingFilterConfig(ConfigStep):
    """Config step that filters rows out without raising errors.

    Mainly used on derived entities.
    """

    operation: Literal["filter_without_notifying"]

    entity: str
    new_entity_name: Optional[str] = None
    filter_rule: str

    def to_step(self) -> AbstractStep:
        """Build the `ImmediateFilter` metadata step from this config."""
        filter_kwargs = {
            "id": self.name,
            "entity_name": self.entity,
            "new_entity_name": self.new_entity_name,
            # The config-side `filter_rule` maps onto the step's `expression`.
            "expression": self.filter_rule,
        }
        return ImmediateFilter(**filter_kwargs)
|
|
189
|
+
|
|
190
|
+
class HasMatchConfig(ConfigStep):
    """Config step that records whether each row has a match in another entity."""

    operation: Literal["has_match"]

    entity: str
    new_entity_name: Optional[str] = None
    target: str
    join_condition: str
    column_name: str

    def to_step(self) -> AbstractStep:
        """Build the `ConfirmJoinHasMatch` metadata step from this config."""
        match_kwargs = {
            "id": self.name,
            "entity_name": self.entity,
            "target_name": self.target,
            "new_entity_name": self.new_entity_name,
            "join_condition": self.join_condition,
            "column_name": self.column_name,
        }
        return ConfirmJoinHasMatch(**match_kwargs)
|
|
212
|
+
|
|
213
|
+
class SemiOrAntiJoinConfig(ConfigStep):
    """Config step that performs a SEMI or ANTI JOIN.

    More performant than a left or right join for checking membership in
    another entity.
    """

    operation: Literal["semi_join", "anti_join"]

    entity: str
    new_entity_name: Optional[str] = None
    target: str
    join_condition: str

    def to_step(self) -> AbstractStep:
        """Build the `SemiJoin` or `AntiJoin` metadata step from this config."""
        # The operation literal decides which concrete join step we emit.
        if self.operation == "anti_join":
            step_cls = AntiJoin
        else:
            step_cls = SemiJoin
        return step_cls(
            id=self.name,
            entity_name=self.entity,
            target_name=self.target,
            new_entity_name=self.new_entity_name,
            join_condition=self.join_condition,
        )
|
|
237
|
+
|
|
238
|
+
class LeftOrInnerJoinConfig(SemiOrAntiJoinConfig):
    """Config step that performs a LEFT or INNER JOIN."""

    operation: Literal["left_join", "inner_join"]  # type: ignore

    new_columns: MultipleExpressions

    def to_step(self) -> AbstractStep:
        """Build the `LeftJoin` or `InnerJoin` metadata step from this config."""
        # `operation` is validated against the Literal, so defaulting to
        # InnerJoin for anything that is not "left_join" is safe.
        step_cls = {"left_join": LeftJoin}.get(self.operation, InnerJoin)
        return step_cls(
            id=self.name,
            entity_name=self.entity,
            target_name=self.target,
            new_entity_name=self.new_entity_name,
            join_condition=self.join_condition,
            new_columns=self.new_columns,
        )
|
|
257
|
+
|
|
258
|
+
class OneToOneJoinConfig(LeftOrInnerJoinConfig):
    """Config step that joins one entity to another one-to-one."""

    operation: Literal["join", "one_to_one_join"]  # type: ignore

    perform_integrity_check: bool = True

    def to_step(self) -> AbstractStep:
        """Build the `OneToOneJoin` metadata step from this config."""
        join_kwargs = {
            "id": self.name,
            "entity_name": self.entity,
            "target_name": self.target,
            "new_entity_name": self.new_entity_name,
            "join_condition": self.join_condition,
            "new_columns": self.new_columns,
            "perform_integrity_check": self.perform_integrity_check,
        }
        return OneToOneJoin(**join_kwargs)
|
|
277
|
+
|
|
278
|
+
class JoinHeaderConfig(ConfigStep):
    """Config for joining a header onto another entity"""

    operation: Literal["join_header"]

    entity: str
    new_entity_name: Optional[str] = None
    target: str
    header_column_name: str = "_Header"
    perform_integrity_check: bool = True
    # NOTE(review): `perform_integrity_check` is declared but never forwarded to
    # `HeaderJoin` in `to_step()` below (unlike `OneToOneJoinConfig`, which does
    # pass it through) — confirm whether it should be passed or removed.

    def to_step(self) -> AbstractStep:
        """Takes a config object and returns a step object"""
        return HeaderJoin(
            id=self.name,
            entity_name=self.entity,
            target_name=self.target,
            new_entity_name=self.new_entity_name,
            header_column_name=self.header_column_name,
        )
|
|
299
|
+
|
|
300
|
+
class UnionConfig(ConfigStep):
    """Config step that unions two entities together."""

    operation: Literal["union"]

    entity: str
    new_entity_name: Optional[str] = None
    target: str

    def to_step(self) -> AbstractStep:
        """Build the `TableUnion` metadata step from this config."""
        union_kwargs = {
            "id": self.name,
            "entity_name": self.entity,
            "target_name": self.target,
            "new_entity_name": self.new_entity_name,
        }
        return TableUnion(**union_kwargs)
|
|
318
|
+
|
|
319
|
+
class CopyEntityConfig(ConfigStep):
    """Config step that copies an entity under a new name."""

    operation: Literal["copy_entity"]

    entity: str
    new_entity_name: str

    def to_step(self) -> AbstractStep:
        """Build the `CopyEntity` metadata step from this config."""
        return CopyEntity(
            id=self.name,
            entity_name=self.entity,
            new_entity_name=self.new_entity_name,
        )
|
|
333
|
+
|
|
334
|
+
class RemoveEntityConfig(ConfigStep):
    """Config step that removes one or more entities."""

    operation: Literal["remove_entity", "remove_entities"]

    # A single entity name, or a list of names to remove in one step.
    entity: Union[str, list[str]]

    def to_step(self) -> AbstractStep:
        """Build the `EntityRemoval` metadata step from this config."""
        removal_kwargs = {"id": self.name, "entity_name": self.entity}
        return EntityRemoval(**removal_kwargs)
|
|
345
|
+
|
|
346
|
+
# pydantic resolves the concrete step class from the `operation` field at parse
# time; every member narrows `operation` to a distinct Literal.
StepConfigUnion = Annotated[
    Union[
        AddConfig,
        CopyEntityConfig,
        GroupByConfig,
        HasMatchConfig,
        JoinHeaderConfig,
        LeftOrInnerJoinConfig,
        NonNotifyingFilterConfig,
        OneToOneJoinConfig,
        RemoveConfig,
        RemoveEntityConfig,
        RenameEntityConfig,
        SelectConfig,
        SemiOrAntiJoinConfig,
        UnionConfig,
    ],
    Field(discriminator="operation"),
]
"""Pydantic configuration classes for steps."""
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Constant values used in mutiple places."""
|
|
2
|
+
|
|
3
|
+
ROWID_COLUMN_NAME: str = "__rowid__"
|
|
4
|
+
"""The name of the column containing the row ID for each entity."""
|
|
5
|
+
|
|
6
|
+
CONTRACT_ERROR_VALUE_FIELD_NAME: str = "__error_value"
|
|
7
|
+
"""The name of the field that can be used to extract the field value that caused
|
|
8
|
+
a pydantic validation error"""
|
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
"""The core engine for the data validation engine."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from types import TracebackType
|
|
7
|
+
from typing import Any, Optional, Union
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, Field, PrivateAttr, validate_arguments, validator
|
|
10
|
+
from pydantic.types import FilePath
|
|
11
|
+
from pyspark.sql import SparkSession
|
|
12
|
+
|
|
13
|
+
from dve.core_engine.backends.base.backend import BaseBackend
|
|
14
|
+
from dve.core_engine.backends.implementations.spark.backend import SparkBackend
|
|
15
|
+
from dve.core_engine.backends.implementations.spark.types import SparkEntities
|
|
16
|
+
from dve.core_engine.configuration.base import BaseEngineConfig
|
|
17
|
+
from dve.core_engine.configuration.v1 import V1EngineConfig
|
|
18
|
+
from dve.core_engine.constants import ROWID_COLUMN_NAME
|
|
19
|
+
from dve.core_engine.loggers import get_child_logger, get_logger
|
|
20
|
+
from dve.core_engine.models import EngineRunValidation, SubmissionInfo
|
|
21
|
+
from dve.core_engine.type_hints import EntityName, JSONstring
|
|
22
|
+
from dve.parser.file_handling import TemporaryPrefix, get_resource_exists, joinuri, resolve_location
|
|
23
|
+
from dve.parser.type_hints import URI, Location
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class CoreEngine(BaseModel):
    """The core engine implementation for the data validation engine.

    Intended to be used as a context manager: entering the context creates
    the temporary cache prefix, exiting it tears the cache down. The final
    outputs are written under `output_prefix_uri`, so the entities returned
    by `run_pipeline` remain valid after the context exits.
    """

    class Config:  # pylint: disable=too-few-public-methods
        """`pydantic` configuration options."""

        arbitrary_types_allowed = True
        validate_assignment = True

    backend_config: BaseEngineConfig
    """The backend configuration for the given run."""
    dataset_config_uri: URI
    """The dischema location for the current run"""
    output_prefix_uri: URI = Field(default_factory=lambda: Path("outputs").resolve().as_posix())
    """The prefix for the parquet outputs."""
    main_log: logging.Logger = Field(default_factory=lambda: get_logger("CoreEngine"))
    """The `logging.Logger instance for the data ingest process."""
    cache_prefix_uri: Optional[URI] = None
    """
    An optional cache prefix URI. If not provided, a local temporary directory will
    be used instead (this will not play nicely in Databricks).

    """
    _cache_dir: Optional[TemporaryPrefix] = PrivateAttr(default=None)
    """
    The `TemporaryPrefix` indicating the cache dir.

    Data will be chunked to parquet in this directory after being read,
    and written here before filters are applied.

    """
    # `None` default is deliberate: the `_ensure_backend` validator (always=True)
    # replaces it with a SparkBackend when no backend is supplied.
    backend: BaseBackend = None  # type: ignore
    """The backend to use to process the files."""
    debug: bool = False
    """Indication of if this run is in debug mode."""

    # pre=True: runs before type coercion, so callers may pass a Path or URI.
    @validator("cache_prefix_uri", "output_prefix_uri", allow_reuse=True, pre=True)
    # pylint: disable=E0213
    def _validate_prefix_uri(cls, location: Optional[Location]) -> Optional[URI]:
        """Ensure we support the cache prefix scheme."""
        if location is None:
            return None
        return resolve_location(location)

    def __init__(self, *args, **kwargs):
        # pylint: disable=W0235
        super().__init__(*args, **kwargs)

    @validator("backend", always=True)
    @classmethod
    def _ensure_backend(cls, value: Optional[BaseBackend], values: dict[str, Any]) -> BaseBackend:
        """Ensure a default backend is created if a backend is not specified."""
        if value is not None:
            return value

        # Fields validate in declaration order, so `main_log` and
        # `dataset_config_uri` are already present in `values` here.
        main_logger = values.get("main_log")
        if main_logger is None:
            return SparkBackend(dataset_config_uri=values.get("dataset_config_uri"))
        return SparkBackend(
            dataset_config_uri=values.get("dataset_config_uri"),
            logger=get_child_logger(
                ".".join((SparkBackend.__module__, SparkBackend.__name__)), main_logger
            ),
        )

    @classmethod
    @validate_arguments(config={"arbitrary_types_allowed": True})
    def build(
        cls,
        dataset_config_path: Union[FilePath, URI],
        output_prefix: Location = Path("./outputs"),
        cache_prefix: Optional[Location] = None,
        parent_logger: Optional[logging.Logger] = None,
        debug: Optional[bool] = False,
        **kwargs,
    ):
        """Build an engine from serialised definitions.

        Args:
            - `dataset_config_path`: a URI or path indicating the location of the
              dataset configuration.
            - `output_prefix`: the prefix for parquet outputs (a URI or a local path).
            - `cache_prefix`: the prefix for caching (a URI or a local path).
            - `parent_logger`: an optional parent logger for the engine.
            - `debug`: whether to run in debug mode (default: False).

        """
        if parent_logger:
            main_log = get_child_logger(cls.__name__, parent_logger)
        else:
            main_log = get_logger(cls.__name__)
        main_log.info("Initialising...")
        main_log.info(f"Debug mode: {debug}")

        # Normalise Path inputs to POSIX-style URIs; string URIs pass through.
        if isinstance(dataset_config_path, Path):
            dataset_config_uri = dataset_config_path.resolve().as_posix()
        else:
            dataset_config_uri = dataset_config_path
        if isinstance(output_prefix, Path):
            output_prefix_uri = output_prefix.resolve().as_posix()
        else:
            output_prefix_uri = output_prefix

        backend_config = V1EngineConfig.load(dataset_config_uri)

        # `cache_prefix` may still be a Path/Location here; the pre=True
        # `_validate_prefix_uri` validator resolves it during construction.
        self = cls(
            dataset_config_uri=dataset_config_uri,
            output_prefix_uri=output_prefix_uri,
            main_log=main_log,
            cache_prefix_uri=cache_prefix,
            backend_config=backend_config,
            debug=debug,
            **kwargs,
        )
        self.main_log.info(f"Output path: {self.output_prefix_uri!r}")
        return self

    @classmethod
    def build_from_model(cls, model_str: JSONstring):
        """Build an engine from a serialised JSON pydantic model of definitions.

        Args:
            - `model_str`: a JSON string that validates against
              `EngineRunValidation`; its fields are forwarded to `build`.

        """
        main_log = get_logger("CoreEngine")
        main_log.info("Initalise from model...")
        return cls.build(**EngineRunValidation(**json.loads(model_str)).dict())

    def __enter__(self) -> "CoreEngine":
        """Enter the pipeline context, creating the temporary cache prefix."""
        self.main_log.info("Entering pipeline context.")
        # Guard against nested/re-entrant use: one cache prefix per context.
        if self._cache_dir is not None:
            raise ValueError("Pipeline already within context")

        self._cache_dir = TemporaryPrefix(self.cache_prefix_uri)
        self._cache_dir.__enter__()
        self.main_log.info(f"Pipeline will cache to {self.cache_prefix!r}")
        return self

    def __exit__(
        self,
        exc_type: Optional[type[Exception]],
        exc_value: Optional[Exception],
        traceback: Optional[TracebackType],
    ) -> None:
        """Exit the pipeline context, tearing down the cache prefix."""
        self.main_log.info(f"Exiting pipeline context, clearing {self.cache_prefix!r}")
        # Clear the private attr first so `cache_prefix` raises after exit,
        # then delegate cleanup to the TemporaryPrefix itself.
        cache_dir = self._cache_dir
        self._cache_dir = None

        if cache_dir is not None:
            cache_dir.__exit__(exc_type, exc_value, traceback)

        self.main_log.info("Cleared cache.")

    @property
    def cache_prefix(self) -> URI:
        """The cache directory for the pipeline run.

        Raises:
            ValueError: if accessed outside the context-manager scope.
        """
        if self._cache_dir is None:
            raise ValueError(
                "`cache_prefix` is undefined when the pipeline is not being used as a "
                + "context manager"
            )
        return self._cache_dir.prefix

    def _write_entity_outputs(self, entities: SparkEntities) -> SparkEntities:
        """Write the final entities to the output prefix as Parquet.

        This will result in a directory of files for each entity, containing
        parquet files for each partition in the entity.

        """
        output_entities = {}

        self.main_log.info(f"Writing entities to the output location: {self.output_prefix_uri}")
        for entity_name, entity in entities.items():
            # The internal row-id column is an implementation detail; drop it
            # before the entity is persisted.
            entity = entity.drop(ROWID_COLUMN_NAME)

            self.main_log.info(f"Entity: {entity_name} {type(entity)}")

            output_uri = joinuri(self.output_prefix_uri, entity_name)
            if get_resource_exists(output_uri):
                self.main_log.info(f"{output_uri} already exists - will be overwritten")

            self.main_log.info(f"+ Writing parquet output to {output_uri!r}")
            entity.write.mode("overwrite").parquet(output_uri)
            # Re-read from the written output so the returned dataframes
            # reference the persistent location rather than the cache.
            spark_session = SparkSession.builder.getOrCreate()
            output_entities[entity_name] = spark_session.read.format("parquet").load(
                output_uri, schema=entity.schema
            )

        return output_entities

    def _write_outputs(self, entities: SparkEntities) -> SparkEntities:
        """Write the outputs from the pipeline, returning the written entities
        and messages.

        """
        entities = self._write_entity_outputs(entities)

        return entities

    def _show_available_entities(self, entities: SparkEntities, *, verbose: bool = False) -> None:
        """Print current entities."""
        self.main_log.info("Displaying available dataframes in this run:")

        for entity_name, entity in entities.items():
            # FIXME: Currently a print statement because log messages
            # can arrive out of sequence with the df.show()
            if self.debug:
                # `count()` forces an action on the dataframe, so only do it
                # in debug mode.
                print(f"+ Entity dataframe: {entity_name} has {entity.count()} rows")
            else:
                print(f"+ Entity dataframe: {entity_name}")

            if verbose:
                # Cap the number of rows displayed to reduce probs with max log size on dbr
                entity.show(n=10, truncate=False)

    def run_pipeline(
        self,
        entity_locations: dict[EntityName, URI],
        # pylint: disable=unused-argument
        submission_info: Optional[SubmissionInfo] = None,
    ) -> tuple[SparkEntities, URI]:
        """Run the pipeline, reading in the entities and applying validation
        and transformation rules, and then write the outputs.

        The returned entities will reference the output locations, so
        references should be valid after the pipeline context exits.

        """
        entities, errors_uri = self.backend.process_legacy(
            self.output_prefix_uri,
            entity_locations,
            self.backend_config.get_contract_metadata(),
            self.backend_config.get_rule_metadata(),
            submission_info,
        )
        return self._write_outputs(entities), errors_uri