data-validation-engine 0.6.2 (data_validation_engine-0.6.2-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. data_validation_engine-0.6.2.dist-info/METADATA +104 -0
  2. data_validation_engine-0.6.2.dist-info/RECORD +105 -0
  3. data_validation_engine-0.6.2.dist-info/WHEEL +4 -0
  4. data_validation_engine-0.6.2.dist-info/licenses/LICENSE +21 -0
  5. dve/__init__.py +0 -0
  6. dve/common/__init__.py +0 -0
  7. dve/common/error_utils.py +189 -0
  8. dve/core_engine/__init__.py +0 -0
  9. dve/core_engine/backends/__init__.py +1 -0
  10. dve/core_engine/backends/base/__init__.py +1 -0
  11. dve/core_engine/backends/base/auditing.py +618 -0
  12. dve/core_engine/backends/base/backend.py +240 -0
  13. dve/core_engine/backends/base/contract.py +454 -0
  14. dve/core_engine/backends/base/core.py +124 -0
  15. dve/core_engine/backends/base/reader.py +176 -0
  16. dve/core_engine/backends/base/reference_data.py +217 -0
  17. dve/core_engine/backends/base/rules.py +685 -0
  18. dve/core_engine/backends/base/utilities.py +146 -0
  19. dve/core_engine/backends/exceptions.py +311 -0
  20. dve/core_engine/backends/implementations/__init__.py +1 -0
  21. dve/core_engine/backends/implementations/duckdb/__init__.py +26 -0
  22. dve/core_engine/backends/implementations/duckdb/auditing.py +234 -0
  23. dve/core_engine/backends/implementations/duckdb/contract.py +213 -0
  24. dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +288 -0
  25. dve/core_engine/backends/implementations/duckdb/readers/__init__.py +13 -0
  26. dve/core_engine/backends/implementations/duckdb/readers/csv.py +222 -0
  27. dve/core_engine/backends/implementations/duckdb/readers/json.py +50 -0
  28. dve/core_engine/backends/implementations/duckdb/readers/xml.py +45 -0
  29. dve/core_engine/backends/implementations/duckdb/reference_data.py +49 -0
  30. dve/core_engine/backends/implementations/duckdb/rules.py +534 -0
  31. dve/core_engine/backends/implementations/duckdb/types.py +47 -0
  32. dve/core_engine/backends/implementations/duckdb/utilities.py +41 -0
  33. dve/core_engine/backends/implementations/spark/__init__.py +22 -0
  34. dve/core_engine/backends/implementations/spark/auditing.py +230 -0
  35. dve/core_engine/backends/implementations/spark/backend.py +78 -0
  36. dve/core_engine/backends/implementations/spark/contract.py +241 -0
  37. dve/core_engine/backends/implementations/spark/readers/__init__.py +15 -0
  38. dve/core_engine/backends/implementations/spark/readers/csv.py +77 -0
  39. dve/core_engine/backends/implementations/spark/readers/json.py +66 -0
  40. dve/core_engine/backends/implementations/spark/readers/xml.py +202 -0
  41. dve/core_engine/backends/implementations/spark/reference_data.py +42 -0
  42. dve/core_engine/backends/implementations/spark/rules.py +430 -0
  43. dve/core_engine/backends/implementations/spark/spark_helpers.py +412 -0
  44. dve/core_engine/backends/implementations/spark/types.py +21 -0
  45. dve/core_engine/backends/implementations/spark/utilities.py +144 -0
  46. dve/core_engine/backends/metadata/__init__.py +47 -0
  47. dve/core_engine/backends/metadata/contract.py +80 -0
  48. dve/core_engine/backends/metadata/reporting.py +374 -0
  49. dve/core_engine/backends/metadata/rules.py +737 -0
  50. dve/core_engine/backends/readers/__init__.py +41 -0
  51. dve/core_engine/backends/readers/csv.py +232 -0
  52. dve/core_engine/backends/readers/utilities.py +21 -0
  53. dve/core_engine/backends/readers/xml.py +432 -0
  54. dve/core_engine/backends/readers/xml_linting.py +142 -0
  55. dve/core_engine/backends/types.py +26 -0
  56. dve/core_engine/backends/utilities.py +177 -0
  57. dve/core_engine/configuration/__init__.py +1 -0
  58. dve/core_engine/configuration/base.py +56 -0
  59. dve/core_engine/configuration/v1/__init__.py +351 -0
  60. dve/core_engine/configuration/v1/filters.py +60 -0
  61. dve/core_engine/configuration/v1/rule_stores/__init__.py +1 -0
  62. dve/core_engine/configuration/v1/rule_stores/models.py +57 -0
  63. dve/core_engine/configuration/v1/steps.py +365 -0
  64. dve/core_engine/constants.py +8 -0
  65. dve/core_engine/engine.py +265 -0
  66. dve/core_engine/exceptions.py +29 -0
  67. dve/core_engine/functions/__init__.py +6 -0
  68. dve/core_engine/functions/implementations.py +200 -0
  69. dve/core_engine/loggers.py +57 -0
  70. dve/core_engine/message.py +512 -0
  71. dve/core_engine/models.py +196 -0
  72. dve/core_engine/templating.py +114 -0
  73. dve/core_engine/type_hints.py +255 -0
  74. dve/core_engine/validation.py +160 -0
  75. dve/metadata_parser/__init__.py +2 -0
  76. dve/metadata_parser/domain_types.py +682 -0
  77. dve/metadata_parser/exc.py +44 -0
  78. dve/metadata_parser/function_library.py +64 -0
  79. dve/metadata_parser/function_wrapper.py +201 -0
  80. dve/metadata_parser/model_generator.py +119 -0
  81. dve/metadata_parser/models.py +410 -0
  82. dve/metadata_parser/utilities.py +54 -0
  83. dve/parser/__init__.py +1 -0
  84. dve/parser/exceptions.py +50 -0
  85. dve/parser/file_handling/__init__.py +31 -0
  86. dve/parser/file_handling/helpers.py +29 -0
  87. dve/parser/file_handling/implementations/__init__.py +7 -0
  88. dve/parser/file_handling/implementations/base.py +97 -0
  89. dve/parser/file_handling/implementations/dbfs.py +81 -0
  90. dve/parser/file_handling/implementations/file.py +203 -0
  91. dve/parser/file_handling/implementations/s3.py +371 -0
  92. dve/parser/file_handling/log_handler.py +215 -0
  93. dve/parser/file_handling/service.py +441 -0
  94. dve/parser/file_handling/utilities.py +53 -0
  95. dve/parser/type_hints.py +46 -0
  96. dve/parser/utilities.py +113 -0
  97. dve/pipeline/__init__.py +0 -0
  98. dve/pipeline/duckdb_pipeline.py +56 -0
  99. dve/pipeline/foundry_ddb_pipeline.py +171 -0
  100. dve/pipeline/pipeline.py +935 -0
  101. dve/pipeline/spark_pipeline.py +69 -0
  102. dve/pipeline/utils.py +96 -0
  103. dve/reporting/__init__.py +1 -0
  104. dve/reporting/error_report.py +153 -0
  105. dve/reporting/excel_report.py +319 -0
dve/core_engine/configuration/v1/rule_stores/models.py
@@ -0,0 +1,57 @@
+ """Models for components in the rule stores."""
+
+ from typing import Any, Optional, Union
+
+ from pydantic import BaseModel, Field
+ from typing_extensions import Annotated, Literal
+
+ from dve.core_engine.configuration.v1.filters import FilterConfigUnion
+ from dve.core_engine.configuration.v1.steps import StepConfigUnion
+
+
+ class BusinessSpecConfig(BaseModel):
+     """A business rule or filter within the config."""
+
+     type: str
+     """The type of business rule."""
+
+     description: Optional[str] = None
+     """A description of what the rule/filter should do."""
+     parameter_descriptions: dict[str, str] = Field(default_factory=dict)
+     """Descriptions of parameters used by the rule."""
+     parameter_defaults: dict[str, Any] = Field(default_factory=dict)
+     """Default parameters to be used by the rule if no param is passed."""
+
+
+ class BusinessFilterSpecConfig(BusinessSpecConfig):
+     """A business filter within the rule store."""
+
+     type: Literal["filter"]
+
+     rule_config: FilterConfigUnion
+     """The configuration for the filter."""
+
+
+ class ComplexRuleConfig(BaseModel):
+     """The rule config for a business rule."""
+
+     rules: list[StepConfigUnion] = Field(default_factory=list)
+     filters: list[FilterConfigUnion] = Field(default_factory=list)
+     post_filter_rules: list[StepConfigUnion] = Field(default_factory=list)
+
+
+ class BusinessRuleSpecConfig(BusinessSpecConfig):
+     """A business rule within the rule store."""
+
+     type: Literal["complex_rule"]
+
+     rule_config: ComplexRuleConfig
+     """The configuration for the rule."""
+     dependencies: list[str] = Field(default_factory=list)
+     """The dependencies for the business rule."""
+
+
+ BusinessComponentSpecConfigUnion = Annotated[
+     Union[BusinessFilterSpecConfig, BusinessRuleSpecConfig], Field(discriminator="type")
+ ]
+ """A union of the different business component types."""
dve/core_engine/configuration/v1/steps.py
@@ -0,0 +1,365 @@
+ """Very basic configuration options for steps.
+
+ These (mostly) only differ slightly from the metadata steps,
+ but there's some repetition here to make it possible to change
+ the metadata steps without altering the config classes.
+
+ N.B. These are quite coarsely copied from the JSON schema.
+
+ """
+
+ # pylint: disable=missing-class-docstring
+ from abc import ABC, abstractmethod
+ from typing import Any, Optional, Union
+
+ from pydantic import BaseModel, Extra, Field, validator
+ from typing_extensions import Annotated, Literal
+
+ from dve.core_engine.backends.metadata.rules import (
+     AbstractStep,
+     Aggregation,
+     AntiJoin,
+     ColumnAddition,
+     ColumnRemoval,
+     ConfirmJoinHasMatch,
+     CopyEntity,
+     EntityRemoval,
+     HeaderJoin,
+     ImmediateFilter,
+     InnerJoin,
+     LeftJoin,
+     OneToOneJoin,
+     RenameEntity,
+     SelectColumns,
+     SemiJoin,
+     TableUnion,
+ )
+ from dve.core_engine.type_hints import MultipleExpressions
+
+
+ class ConfigStep(BaseModel, ABC):
+     """The parent for the config steps."""
+
+     class Config:  # pylint: disable=too-few-public-methods
+         """Config class for dynamically generated pydantic models."""
+
+         extra = Extra.forbid
+
+     name: Optional[str] = None
+     """The 'name' of the rule. This is mapped to an ID in the entity."""
+     operation: str
+     """The operation implemented by the step."""
+
+     @abstractmethod
+     def to_step(self) -> AbstractStep:
+         """Convert the config step definition to a 'real' metadata step."""
+
+
+ class AddConfig(ConfigStep):
+     """Configuration step for adding a new column."""
+
+     operation: Literal["add"]
+
+     entity: str
+     new_entity_name: Optional[str] = None
+     column_name: str
+     expression: str
+
+     def to_step(self) -> AbstractStep:
+         """Takes a config object and returns a step object."""
+         return ColumnAddition(
+             id=self.name,
+             entity_name=self.entity,
+             new_entity_name=self.new_entity_name,
+             column_name=self.column_name,
+             expression=self.expression,
+         )
+
+
+ class RemoveConfig(ConfigStep):
+     """Configuration step for removing a column."""
+
+     operation: Literal["remove"]
+
+     entity: str
+     new_entity_name: Optional[str] = None
+     column_name: str
+
+     def to_step(self) -> AbstractStep:
+         """Takes a config object and returns a step object."""
+         return ColumnRemoval(
+             id=self.name,
+             entity_name=self.entity,
+             new_entity_name=self.new_entity_name,
+             column_name=self.column_name,
+         )
+
+
+ class GroupByConfig(ConfigStep):
+     """Configuration step for performing a GROUP BY operation."""
+
+     operation: Literal["group_by"]
+
+     entity: str
+     new_entity_name: Optional[str] = None
+     group_by: MultipleExpressions
+     pivot_column: Optional[str] = None
+     pivot_values: Optional[list[str]] = None
+     agg_columns: MultipleExpressions
+
+     @validator("pivot_values")
+     @classmethod
+     def _ensure_no_values_if_not_column(cls, value: Optional[list[str]], values: dict[str, Any]):
+         if value and not values["pivot_column"]:
+             raise ValueError("Cannot provide 'pivot_values' if no 'pivot_column'")
+         return value
+
+     def to_step(self) -> AbstractStep:
+         """Takes a config object and returns a step object."""
+         return Aggregation(
+             id=self.name,
+             entity_name=self.entity,
+             new_entity_name=self.new_entity_name,
+             group_by=self.group_by,
+             pivot_column=self.pivot_column,
+             pivot_values=self.pivot_values,
+             agg_columns=self.agg_columns,
+         )
+
+
+ class SelectConfig(ConfigStep):
+     """Configuration step for performing a SELECT operation."""
+
+     operation: Literal["select"]
+
+     entity: str
+     new_entity_name: Optional[str] = None
+     columns: MultipleExpressions
+     distinct: bool = False
+
+     def to_step(self) -> AbstractStep:
+         """Takes a config object and returns a step object."""
+         return SelectColumns(
+             id=self.name,
+             entity_name=self.entity,
+             new_entity_name=self.new_entity_name,
+             columns=self.columns,
+             distinct=self.distinct,
+         )
+
+
+ class RenameEntityConfig(ConfigStep):
+     """Configuration step for renaming an entity."""
+
+     operation: Literal["rename_entity"]
+
+     entity: str
+     new_entity_name: str
+
+     def to_step(self) -> AbstractStep:
+         """Takes a config object and returns a step object."""
+         return RenameEntity(
+             id=self.name,
+             entity_name=self.entity,
+             new_entity_name=self.new_entity_name,
+         )
+
+
+ class NonNotifyingFilterConfig(ConfigStep):
+     """Configuration step for filtering out values without creating errors.
+
+     Mainly used on derived entities.
+     """
+
+     operation: Literal["filter_without_notifying"]
+
+     entity: str
+     new_entity_name: Optional[str] = None
+     filter_rule: str
+
+     def to_step(self) -> AbstractStep:
+         """Takes a config object and returns a step object."""
+         return ImmediateFilter(
+             id=self.name,
+             entity_name=self.entity,
+             new_entity_name=self.new_entity_name,
+             expression=self.filter_rule,
+         )
+
+
+ class HasMatchConfig(ConfigStep):
+     """Configuration step for checking if a value has a match in another entity."""
+
+     operation: Literal["has_match"]
+
+     entity: str
+     new_entity_name: Optional[str] = None
+     target: str
+     join_condition: str
+     column_name: str
+
+     def to_step(self) -> AbstractStep:
+         """Takes a config object and returns a step object."""
+         return ConfirmJoinHasMatch(
+             id=self.name,
+             entity_name=self.entity,
+             target_name=self.target,
+             new_entity_name=self.new_entity_name,
+             join_condition=self.join_condition,
+             column_name=self.column_name,
+         )
+
+
+ class SemiOrAntiJoinConfig(ConfigStep):
+     """Configuration step for performing a SEMI or ANTI JOIN.
+
+     More performant than a left or right join for checking membership in another entity.
+     """
+
+     operation: Literal["semi_join", "anti_join"]
+
+     entity: str
+     new_entity_name: Optional[str] = None
+     target: str
+     join_condition: str
+
+     def to_step(self) -> AbstractStep:
+         """Takes a config object and returns a step object."""
+         type_ = AntiJoin if self.operation == "anti_join" else SemiJoin
+         return type_(
+             id=self.name,
+             entity_name=self.entity,
+             target_name=self.target,
+             new_entity_name=self.new_entity_name,
+             join_condition=self.join_condition,
+         )
+
+
+ class LeftOrInnerJoinConfig(SemiOrAntiJoinConfig):
+     """Configuration step for performing a LEFT or INNER JOIN."""
+
+     operation: Literal["left_join", "inner_join"]  # type: ignore
+
+     new_columns: MultipleExpressions
+
+     def to_step(self) -> AbstractStep:
+         """Takes a config object and returns a step object."""
+         type_ = LeftJoin if self.operation == "left_join" else InnerJoin
+         return type_(
+             id=self.name,
+             entity_name=self.entity,
+             target_name=self.target,
+             new_entity_name=self.new_entity_name,
+             join_condition=self.join_condition,
+             new_columns=self.new_columns,
+         )
+
+
+ class OneToOneJoinConfig(LeftOrInnerJoinConfig):
+     """Config for performing a one-to-one join between two entities."""
+
+     operation: Literal["join", "one_to_one_join"]  # type: ignore
+
+     perform_integrity_check: bool = True
+
+     def to_step(self) -> AbstractStep:
+         """Takes a config object and returns a step object."""
+         return OneToOneJoin(
+             id=self.name,
+             entity_name=self.entity,
+             target_name=self.target,
+             new_entity_name=self.new_entity_name,
+             join_condition=self.join_condition,
+             new_columns=self.new_columns,
+             perform_integrity_check=self.perform_integrity_check,
+         )
+
+
+ class JoinHeaderConfig(ConfigStep):
+     """Config for joining a header onto another entity."""
+
+     operation: Literal["join_header"]
+
+     entity: str
+     new_entity_name: Optional[str] = None
+     target: str
+     header_column_name: str = "_Header"
+     perform_integrity_check: bool = True
+
+     def to_step(self) -> AbstractStep:
+         """Takes a config object and returns a step object."""
+         return HeaderJoin(
+             id=self.name,
+             entity_name=self.entity,
+             target_name=self.target,
+             new_entity_name=self.new_entity_name,
+             header_column_name=self.header_column_name,
+         )
+
+
+ class UnionConfig(ConfigStep):
+     """Config for unioning two entities."""
+
+     operation: Literal["union"]
+
+     entity: str
+     new_entity_name: Optional[str] = None
+     target: str
+
+     def to_step(self) -> AbstractStep:
+         """Takes a config object and returns a step object."""
+         return TableUnion(
+             id=self.name,
+             entity_name=self.entity,
+             target_name=self.target,
+             new_entity_name=self.new_entity_name,
+         )
+
+
+ class CopyEntityConfig(ConfigStep):
+     """Config for copying entities."""
+
+     operation: Literal["copy_entity"]
+
+     entity: str
+     new_entity_name: str
+
+     def to_step(self) -> AbstractStep:
+         """Takes a config object and returns a step object."""
+         return CopyEntity(
+             id=self.name, entity_name=self.entity, new_entity_name=self.new_entity_name
+         )
+
+
+ class RemoveEntityConfig(ConfigStep):
+     """Config for removing entities."""
+
+     operation: Literal["remove_entity", "remove_entities"]
+
+     entity: Union[str, list[str]]
+
+     def to_step(self) -> AbstractStep:
+         """Takes a config object and returns a step object."""
+         return EntityRemoval(id=self.name, entity_name=self.entity)
+
+
+ StepConfigUnion = Annotated[
+     Union[
+         AddConfig,
+         CopyEntityConfig,
+         GroupByConfig,
+         HasMatchConfig,
+         JoinHeaderConfig,
+         LeftOrInnerJoinConfig,
+         NonNotifyingFilterConfig,
+         OneToOneJoinConfig,
+         RemoveConfig,
+         RemoveEntityConfig,
+         RenameEntityConfig,
+         SelectConfig,
+         SemiOrAntiJoinConfig,
+         UnionConfig,
+     ],
+     Field(discriminator="operation"),
+ ]
+ """Pydantic configuration classes for steps."""
dve/core_engine/constants.py
@@ -0,0 +1,8 @@
+ """Constant values used in multiple places."""
+
+ ROWID_COLUMN_NAME: str = "__rowid__"
+ """The name of the column containing the row ID for each entity."""
+
+ CONTRACT_ERROR_VALUE_FIELD_NAME: str = "__error_value"
+ """The name of the field that can be used to extract the field value that caused
+ a pydantic validation error."""
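As a loose illustration of how such a sentinel field could be used (the error payload shape here is hypothetical; the real extraction lives in the contract and error-reporting modules):

# Hypothetical illustration only: an error record that carries the
# offending value under CONTRACT_ERROR_VALUE_FIELD_NAME.
from dve.core_engine.constants import CONTRACT_ERROR_VALUE_FIELD_NAME

error_record = {
    "loc": ("amount",),
    "msg": "value is not a valid integer",
    CONTRACT_ERROR_VALUE_FIELD_NAME: "twelve",
}
bad_value = error_record.get(CONTRACT_ERROR_VALUE_FIELD_NAME)  # -> "twelve"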
dve/core_engine/engine.py
@@ -0,0 +1,265 @@
+ """The core engine for the data validation engine."""
+
+ import json
+ import logging
+ from pathlib import Path
+ from types import TracebackType
+ from typing import Any, Optional, Union
+
+ from pydantic import BaseModel, Field, PrivateAttr, validate_arguments, validator
+ from pydantic.types import FilePath
+ from pyspark.sql import SparkSession
+
+ from dve.core_engine.backends.base.backend import BaseBackend
+ from dve.core_engine.backends.implementations.spark.backend import SparkBackend
+ from dve.core_engine.backends.implementations.spark.types import SparkEntities
+ from dve.core_engine.configuration.base import BaseEngineConfig
+ from dve.core_engine.configuration.v1 import V1EngineConfig
+ from dve.core_engine.constants import ROWID_COLUMN_NAME
+ from dve.core_engine.loggers import get_child_logger, get_logger
+ from dve.core_engine.models import EngineRunValidation, SubmissionInfo
+ from dve.core_engine.type_hints import EntityName, JSONstring
+ from dve.parser.file_handling import TemporaryPrefix, get_resource_exists, joinuri, resolve_location
+ from dve.parser.type_hints import URI, Location
+
+
+ class CoreEngine(BaseModel):
+     """The core engine implementation for the data validation engine."""
+
+     class Config:  # pylint: disable=too-few-public-methods
+         """`pydantic` configuration options."""
+
+         arbitrary_types_allowed = True
+         validate_assignment = True
+
+     backend_config: BaseEngineConfig
+     """The backend configuration for the given run."""
+     dataset_config_uri: URI
+     """The dataset configuration location for the current run."""
+     output_prefix_uri: URI = Field(default_factory=lambda: Path("outputs").resolve().as_posix())
+     """The prefix for the parquet outputs."""
+     main_log: logging.Logger = Field(default_factory=lambda: get_logger("CoreEngine"))
+     """The `logging.Logger` instance for the data ingest process."""
+     cache_prefix_uri: Optional[URI] = None
+     """
+     An optional cache prefix URI. If not provided, a local temporary directory will
+     be used instead (this will not play nicely in Databricks).
+
+     """
+     _cache_dir: Optional[TemporaryPrefix] = PrivateAttr(default=None)
+     """
+     The `TemporaryPrefix` indicating the cache dir.
+
+     Data will be chunked to parquet in this directory after being read,
+     and written here before filters are applied.
+
+     """
+     backend: BaseBackend = None  # type: ignore
+     """The backend to use to process the files."""
+     debug: bool = False
+     """Indicates whether this run is in debug mode."""
+
+     @validator("cache_prefix_uri", "output_prefix_uri", allow_reuse=True, pre=True)
+     # pylint: disable=E0213
+     def _validate_prefix_uri(cls, location: Optional[Location]) -> Optional[URI]:
+         """Ensure we support the cache prefix scheme."""
+         if location is None:
+             return None
+         return resolve_location(location)
+
+     def __init__(self, *args, **kwargs):
+         # pylint: disable=W0235
+         super().__init__(*args, **kwargs)
+
+     @validator("backend", always=True)
+     @classmethod
+     def _ensure_backend(cls, value: Optional[BaseBackend], values: dict[str, Any]) -> BaseBackend:
+         """Ensure a default backend is created if a backend is not specified."""
+         if value is not None:
+             return value
+
+         main_logger = values.get("main_log")
+         if main_logger is None:
+             return SparkBackend(dataset_config_uri=values.get("dataset_config_uri"))
+         return SparkBackend(
+             dataset_config_uri=values.get("dataset_config_uri"),
+             logger=get_child_logger(
+                 ".".join((SparkBackend.__module__, SparkBackend.__name__)), main_logger
+             ),
+         )
+
+     @classmethod
+     @validate_arguments(config={"arbitrary_types_allowed": True})
+     def build(
+         cls,
+         dataset_config_path: Union[FilePath, URI],
+         output_prefix: Location = Path("./outputs"),
+         cache_prefix: Optional[Location] = None,
+         parent_logger: Optional[logging.Logger] = None,
+         debug: Optional[bool] = False,
+         **kwargs,
+     ):
+         """Build an engine from serialised definitions.
+
+         Args:
+             - `dataset_config_path`: a URI or path indicating the location of the
+               dataset configuration.
+             - `output_prefix`: the prefix for parquet outputs (a URI or a local path).
+             - `cache_prefix`: the prefix for caching (a URI or a local path).
+             - `parent_logger`: an optional parent logger for the engine.
+             - `debug`: whether to run in debug mode (default: False).
+
+         """
+         if parent_logger:
+             main_log = get_child_logger(cls.__name__, parent_logger)
+         else:
+             main_log = get_logger(cls.__name__)
+         main_log.info("Initialising...")
+         main_log.info(f"Debug mode: {debug}")
+
+         if isinstance(dataset_config_path, Path):
+             dataset_config_uri = dataset_config_path.resolve().as_posix()
+         else:
+             dataset_config_uri = dataset_config_path
+         if isinstance(output_prefix, Path):
+             output_prefix_uri = output_prefix.resolve().as_posix()
+         else:
+             output_prefix_uri = output_prefix
+
+         backend_config = V1EngineConfig.load(dataset_config_uri)
+
+         self = cls(
+             dataset_config_uri=dataset_config_uri,
+             output_prefix_uri=output_prefix_uri,
+             main_log=main_log,
+             cache_prefix_uri=cache_prefix,
+             backend_config=backend_config,
+             debug=debug,
+             **kwargs,
+         )
+         self.main_log.info(f"Output path: {self.output_prefix_uri!r}")
+         return self
+
+     @classmethod
+     def build_from_model(cls, model_str: JSONstring):
+         """Build an engine from a serialised JSON pydantic model of definitions.
+
+         Args:
+             - `model_str`: a JSON string containing a serialised
+               `EngineRunValidation` pydantic model of the arguments
+               to pass to `build`.
+
+         """
+         main_log = get_logger("CoreEngine")
+         main_log.info("Initialising from model...")
+         return cls.build(**EngineRunValidation(**json.loads(model_str)).dict())
+
+     def __enter__(self) -> "CoreEngine":
+         self.main_log.info("Entering pipeline context.")
+         if self._cache_dir is not None:
+             raise ValueError("Pipeline already within context")
+
+         self._cache_dir = TemporaryPrefix(self.cache_prefix_uri)
+         self._cache_dir.__enter__()
+         self.main_log.info(f"Pipeline will cache to {self.cache_prefix!r}")
+         return self
+
+     def __exit__(
+         self,
+         exc_type: Optional[type[Exception]],
+         exc_value: Optional[Exception],
+         traceback: Optional[TracebackType],
+     ) -> None:
+         self.main_log.info(f"Exiting pipeline context, clearing {self.cache_prefix!r}")
+         cache_dir = self._cache_dir
+         self._cache_dir = None
+
+         if cache_dir is not None:
+             cache_dir.__exit__(exc_type, exc_value, traceback)
+
+         self.main_log.info("Cleared cache.")
+
+     @property
+     def cache_prefix(self) -> URI:
+         """The cache directory for the pipeline run."""
+         if self._cache_dir is None:
+             raise ValueError(
+                 "`cache_prefix` is undefined when the pipeline is not being used as a "
+                 + "context manager"
+             )
+         return self._cache_dir.prefix
+
+     def _write_entity_outputs(self, entities: SparkEntities) -> SparkEntities:
+         """Write the final entities to the output prefix as Parquet.
+
+         This will result in a directory of files for each entity, containing
+         parquet files for each partition in the entity.
+
+         """
+         output_entities = {}
+
+         self.main_log.info(f"Writing entities to the output location: {self.output_prefix_uri}")
+         for entity_name, entity in entities.items():
+             entity = entity.drop(ROWID_COLUMN_NAME)
+
+             self.main_log.info(f"Entity: {entity_name} {type(entity)}")
+
+             output_uri = joinuri(self.output_prefix_uri, entity_name)
+             if get_resource_exists(output_uri):
+                 self.main_log.info(f"{output_uri} already exists - will be overwritten")
+
+             self.main_log.info(f"+ Writing parquet output to {output_uri!r}")
+             entity.write.mode("overwrite").parquet(output_uri)
+             spark_session = SparkSession.builder.getOrCreate()
+             output_entities[entity_name] = spark_session.read.format("parquet").load(
+                 output_uri, schema=entity.schema
+             )
+
+         return output_entities
+
+     def _write_outputs(self, entities: SparkEntities) -> SparkEntities:
+         """Write the outputs from the pipeline, returning the
+         written entities.
+
+         """
+         entities = self._write_entity_outputs(entities)
+
+         return entities
+
+     def _show_available_entities(self, entities: SparkEntities, *, verbose: bool = False) -> None:
+         """Print current entities."""
+         self.main_log.info("Displaying available dataframes in this run:")
+
+         for entity_name, entity in entities.items():
+             # FIXME: Currently a print statement because log messages
+             # can arrive out of sequence with the df.show()
+             if self.debug:
+                 print(f"+ Entity dataframe: {entity_name} has {entity.count()} rows")
+             else:
+                 print(f"+ Entity dataframe: {entity_name}")
+
+             if verbose:
+                 # Cap the number of rows displayed to reduce problems with max log size on DBR
+                 entity.show(n=10, truncate=False)
+
+     def run_pipeline(
+         self,
+         entity_locations: dict[EntityName, URI],
+         # pylint: disable=unused-argument
+         submission_info: Optional[SubmissionInfo] = None,
+     ) -> tuple[SparkEntities, URI]:
+         """Run the pipeline, reading in the entities and applying validation
+         and transformation rules, and then write the outputs.
+
+         The returned entities will reference the output locations, so
+         references should be valid after the pipeline context exits.
+
+         """
+         entities, errors_uri = self.backend.process_legacy(
+             self.output_prefix_uri,
+             entity_locations,
+             self.backend_config.get_contract_metadata(),
+             self.backend_config.get_rule_metadata(),
+             submission_info,
+         )
+         return self._write_outputs(entities), errors_uri
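Putting it together, a typical driver would build the engine, enter its context (which provisions the temporary cache prefix), and run the pipeline. A minimal sketch; the config path, output prefix, and entity locations are placeholders:

# Hypothetical driver; paths and entity names are placeholders.
engine = CoreEngine.build(
    dataset_config_path="configs/dataset.json",
    output_prefix="outputs",
)
with engine:  # sets up the TemporaryPrefix cache
    entities, errors_uri = engine.run_pipeline(
        entity_locations={"orders": "file:///data/landing/orders.csv"},
    )
# The returned entities point at the written parquet outputs, so they
# stay readable after the cache is cleared on context exit.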