data-validation-engine 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_validation_engine-0.6.2.dist-info/METADATA +104 -0
- data_validation_engine-0.6.2.dist-info/RECORD +105 -0
- data_validation_engine-0.6.2.dist-info/WHEEL +4 -0
- data_validation_engine-0.6.2.dist-info/licenses/LICENSE +21 -0
- dve/__init__.py +0 -0
- dve/common/__init__.py +0 -0
- dve/common/error_utils.py +189 -0
- dve/core_engine/__init__.py +0 -0
- dve/core_engine/backends/__init__.py +1 -0
- dve/core_engine/backends/base/__init__.py +1 -0
- dve/core_engine/backends/base/auditing.py +618 -0
- dve/core_engine/backends/base/backend.py +240 -0
- dve/core_engine/backends/base/contract.py +454 -0
- dve/core_engine/backends/base/core.py +124 -0
- dve/core_engine/backends/base/reader.py +176 -0
- dve/core_engine/backends/base/reference_data.py +217 -0
- dve/core_engine/backends/base/rules.py +685 -0
- dve/core_engine/backends/base/utilities.py +146 -0
- dve/core_engine/backends/exceptions.py +311 -0
- dve/core_engine/backends/implementations/__init__.py +1 -0
- dve/core_engine/backends/implementations/duckdb/__init__.py +26 -0
- dve/core_engine/backends/implementations/duckdb/auditing.py +234 -0
- dve/core_engine/backends/implementations/duckdb/contract.py +213 -0
- dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +288 -0
- dve/core_engine/backends/implementations/duckdb/readers/__init__.py +13 -0
- dve/core_engine/backends/implementations/duckdb/readers/csv.py +222 -0
- dve/core_engine/backends/implementations/duckdb/readers/json.py +50 -0
- dve/core_engine/backends/implementations/duckdb/readers/xml.py +45 -0
- dve/core_engine/backends/implementations/duckdb/reference_data.py +49 -0
- dve/core_engine/backends/implementations/duckdb/rules.py +534 -0
- dve/core_engine/backends/implementations/duckdb/types.py +47 -0
- dve/core_engine/backends/implementations/duckdb/utilities.py +41 -0
- dve/core_engine/backends/implementations/spark/__init__.py +22 -0
- dve/core_engine/backends/implementations/spark/auditing.py +230 -0
- dve/core_engine/backends/implementations/spark/backend.py +78 -0
- dve/core_engine/backends/implementations/spark/contract.py +241 -0
- dve/core_engine/backends/implementations/spark/readers/__init__.py +15 -0
- dve/core_engine/backends/implementations/spark/readers/csv.py +77 -0
- dve/core_engine/backends/implementations/spark/readers/json.py +66 -0
- dve/core_engine/backends/implementations/spark/readers/xml.py +202 -0
- dve/core_engine/backends/implementations/spark/reference_data.py +42 -0
- dve/core_engine/backends/implementations/spark/rules.py +430 -0
- dve/core_engine/backends/implementations/spark/spark_helpers.py +412 -0
- dve/core_engine/backends/implementations/spark/types.py +21 -0
- dve/core_engine/backends/implementations/spark/utilities.py +144 -0
- dve/core_engine/backends/metadata/__init__.py +47 -0
- dve/core_engine/backends/metadata/contract.py +80 -0
- dve/core_engine/backends/metadata/reporting.py +374 -0
- dve/core_engine/backends/metadata/rules.py +737 -0
- dve/core_engine/backends/readers/__init__.py +41 -0
- dve/core_engine/backends/readers/csv.py +232 -0
- dve/core_engine/backends/readers/utilities.py +21 -0
- dve/core_engine/backends/readers/xml.py +432 -0
- dve/core_engine/backends/readers/xml_linting.py +142 -0
- dve/core_engine/backends/types.py +26 -0
- dve/core_engine/backends/utilities.py +177 -0
- dve/core_engine/configuration/__init__.py +1 -0
- dve/core_engine/configuration/base.py +56 -0
- dve/core_engine/configuration/v1/__init__.py +351 -0
- dve/core_engine/configuration/v1/filters.py +60 -0
- dve/core_engine/configuration/v1/rule_stores/__init__.py +1 -0
- dve/core_engine/configuration/v1/rule_stores/models.py +57 -0
- dve/core_engine/configuration/v1/steps.py +365 -0
- dve/core_engine/constants.py +8 -0
- dve/core_engine/engine.py +265 -0
- dve/core_engine/exceptions.py +29 -0
- dve/core_engine/functions/__init__.py +6 -0
- dve/core_engine/functions/implementations.py +200 -0
- dve/core_engine/loggers.py +57 -0
- dve/core_engine/message.py +512 -0
- dve/core_engine/models.py +196 -0
- dve/core_engine/templating.py +114 -0
- dve/core_engine/type_hints.py +255 -0
- dve/core_engine/validation.py +160 -0
- dve/metadata_parser/__init__.py +2 -0
- dve/metadata_parser/domain_types.py +682 -0
- dve/metadata_parser/exc.py +44 -0
- dve/metadata_parser/function_library.py +64 -0
- dve/metadata_parser/function_wrapper.py +201 -0
- dve/metadata_parser/model_generator.py +119 -0
- dve/metadata_parser/models.py +410 -0
- dve/metadata_parser/utilities.py +54 -0
- dve/parser/__init__.py +1 -0
- dve/parser/exceptions.py +50 -0
- dve/parser/file_handling/__init__.py +31 -0
- dve/parser/file_handling/helpers.py +29 -0
- dve/parser/file_handling/implementations/__init__.py +7 -0
- dve/parser/file_handling/implementations/base.py +97 -0
- dve/parser/file_handling/implementations/dbfs.py +81 -0
- dve/parser/file_handling/implementations/file.py +203 -0
- dve/parser/file_handling/implementations/s3.py +371 -0
- dve/parser/file_handling/log_handler.py +215 -0
- dve/parser/file_handling/service.py +441 -0
- dve/parser/file_handling/utilities.py +53 -0
- dve/parser/type_hints.py +46 -0
- dve/parser/utilities.py +113 -0
- dve/pipeline/__init__.py +0 -0
- dve/pipeline/duckdb_pipeline.py +56 -0
- dve/pipeline/foundry_ddb_pipeline.py +171 -0
- dve/pipeline/pipeline.py +935 -0
- dve/pipeline/spark_pipeline.py +69 -0
- dve/pipeline/utils.py +96 -0
- dve/reporting/__init__.py +1 -0
- dve/reporting/error_report.py +153 -0
- dve/reporting/excel_report.py +319 -0
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
"""A complete backend implementation."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import warnings
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from collections.abc import Mapping, MutableMapping
|
|
7
|
+
from typing import Any, ClassVar, Generic, Optional
|
|
8
|
+
|
|
9
|
+
from pyspark.sql import DataFrame, SparkSession
|
|
10
|
+
|
|
11
|
+
from dve.core_engine.backends.base.contract import BaseDataContract
|
|
12
|
+
from dve.core_engine.backends.base.core import EntityManager, get_entity_type
|
|
13
|
+
from dve.core_engine.backends.base.reference_data import BaseRefDataLoader, ReferenceConfigUnion
|
|
14
|
+
from dve.core_engine.backends.base.rules import BaseStepImplementations
|
|
15
|
+
from dve.core_engine.backends.metadata.contract import DataContractMetadata
|
|
16
|
+
from dve.core_engine.backends.metadata.rules import RuleMetadata
|
|
17
|
+
from dve.core_engine.backends.types import Entities, EntityType, StageSuccessful
|
|
18
|
+
from dve.core_engine.loggers import get_logger
|
|
19
|
+
from dve.core_engine.models import SubmissionInfo
|
|
20
|
+
from dve.core_engine.type_hints import URI, EntityLocations, EntityName, EntityParquetLocations
|
|
21
|
+
from dve.parser.file_handling.service import get_parent, joinuri
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class BaseBackend(Generic[EntityType], ABC):
    """A complete implementation of a backend.

    A backend ties together a data contract, a set of step implementations
    and (optionally) a reference data loader type. All configured components
    must declare the same `__entity_type__` as the backend itself.

    """

    __entity_type__: ClassVar[type[EntityType]]  # type: ignore
    """
    The entity type used within the backend.

    This will be populated from the generic annotation at class creation time.

    """

    def __init_subclass__(cls, *_, **__) -> None:
        """Populate `__entity_type__` on each subclass from its generic subscript."""
        # Set entity type from parent class subscript.
        if cls is not BaseBackend:
            cls.__entity_type__ = get_entity_type(cls, "BaseBackend")

    def __init__(  # pylint: disable=unused-argument
        self,
        contract: BaseDataContract[EntityType],
        steps: BaseStepImplementations[EntityType],
        reference_data_loader_type: Optional[type[BaseRefDataLoader[EntityType]]],
        logger: Optional[logging.Logger] = None,
        **kwargs: Any,
    ) -> None:
        """Create the backend from its components.

        Args:
            contract: The data contract implementation.
            steps: The step implementations.
            reference_data_loader_type: The loader type for reference data, or
                `None` if the backend should not load reference data.
            logger: An optional logger. If omitted, one is created from the
                class name.
            **kwargs: Accepted and ignored.

        Raises:
            TypeError: If a configured component's `__entity_type__` differs
                from the backend's.

        """
        components: list[tuple[str, Any]] = [
            ("Contract", contract),
            ("Step implementation", steps),
        ]
        # `None` is a documented value for the loader type ("do not load any
        # reference data"), so there is no entity type to compare in that case.
        if reference_data_loader_type is not None:
            components.append(("Reference data loader", reference_data_loader_type))

        for component_name, component in components:
            component_entity_type = getattr(component, "__entity_type__", None)
            if component_entity_type != self.__entity_type__:
                raise TypeError(
                    f"{component_name} entity type ({component_entity_type}) does not match "
                    f"the type expected by this backend ({self.__entity_type__})"
                )

        self.contract = contract
        """The data contract implementation used by the backend."""
        self.step_implementations = steps
        """The step implementations used by the backend."""
        self.reference_data_loader_type = reference_data_loader_type
        """
        The loader type to use for the reference data. If `None`, do not
        load any reference data and error if it is provided.

        """
        self.logger = logger or get_logger(type(self).__name__)
        """The `logging.Logger` instance for the backend."""

    def load_reference_data(
        self,
        reference_entity_config: dict[EntityName, ReferenceConfigUnion],
        submission_info: Optional[SubmissionInfo],
    ) -> Mapping[EntityName, EntityType]:
        """Load the reference data as specified in the reference entity config.

        If submission info is provided it is converted to an entity and made
        available under the name `dve_submission_info`.

        Raises:
            ValueError: If reference data is configured but the backend has no
                reference data loader type.

        """
        sub_info_entity: Optional[EntityType] = None
        if submission_info:
            sub_info_entity = self.convert_submission_info(submission_info)

        if self.reference_data_loader_type is None:
            if reference_entity_config:
                raise ValueError(
                    "Reference data has been specified but no reference data loader is "
                    "configured for this backend"
                )

            # No loader: the only 'reference' entity is the submission info.
            reference_data_dict = {}
            if sub_info_entity is not None:
                reference_data_dict["dve_submission_info"] = sub_info_entity
            return reference_data_dict

        reference_data_loader = self.reference_data_loader_type(reference_entity_config)
        if sub_info_entity is not None:
            reference_data_loader.entity_cache["dve_submission_info"] = sub_info_entity

        return reference_data_loader

    @abstractmethod
    def convert_submission_info(self, submission_info: SubmissionInfo) -> EntityType:
        """Convert the submission info to an entity."""

    @abstractmethod
    def write_entities_to_parquet(
        self, entities: Entities, cache_prefix: URI
    ) -> EntityParquetLocations:
        """Write entities out to parquet, returning the locations."""
        raise NotImplementedError()

    def convert_entities_to_spark(
        self, entities: Entities, cache_prefix: URI, _emit_deprecation_warning: bool = True
    ) -> dict[EntityName, DataFrame]:
        """Convert entities to Spark DataFrames.

        The entities are written to parquet under `cache_prefix` and read back
        with the active Spark session.

        Entities may be omitted if they are blank, because Spark cannot create an
        entity from an empty parquet file. Any entity that fails to read back is
        logged and left out of the result.

        """
        if _emit_deprecation_warning:
            self.logger.warning("DEPRECATED: Converting entities to Spark is deprecated")
            warnings.warn(
                "Converting entities to Spark is deprecated, and may be removed if the core engine "
                "changes the internal representation",
                category=DeprecationWarning,
                stacklevel=2,  # Attribute the warning to the caller.
            )

        parquet_locations = self.write_entities_to_parquet(entities, cache_prefix)
        spark_session = SparkSession.builder.getOrCreate()

        spark_entities = {}
        for entity_name, parquet_location in parquet_locations.items():
            try:
                spark_entities[entity_name] = spark_session.read.parquet(parquet_location)
            except Exception as err:  # pylint: disable=broad-except
                # Best effort: log the failure and omit the entity.
                self.logger.warning(
                    f"Failed to read entity {entity_name!r} back from parquet location "
                    + repr(parquet_location)
                )
                self.logger.exception(err)
        return spark_entities

    def apply(
        self,
        working_dir: URI,
        entity_locations: EntityLocations,
        contract_metadata: DataContractMetadata,
        rule_metadata: RuleMetadata,
        submission_info: Optional[SubmissionInfo] = None,
    ) -> tuple[Entities, URI, StageSuccessful]:
        """Apply the data contract and the rules, returning the entities and all
        generated messages.

        Returns:
            A tuple of the entities, the parent URI of the relevant errors
            location (processing errors if the contract failed, data contract
            feedback errors otherwise) and whether the contract stage succeeded.

        """
        reference_data = self.load_reference_data(
            rule_metadata.reference_data_config, submission_info
        )
        entities, dc_feedback_errors_uri, successful, processing_errors_uri = self.contract.apply(
            working_dir, entity_locations, contract_metadata
        )
        if not successful:
            return entities, get_parent(processing_errors_uri), successful

        # Row IDs are added for the duration of rule application only.
        for entity_name, entity in entities.items():
            entities[entity_name] = self.step_implementations.add_row_id(entity)

        # TODO: Handle entity manager creation errors.
        entity_manager = EntityManager(entities, reference_data)
        # TODO: Add stage success to 'apply_rules'
        # TODO: In case of large errors in business rules, write messages to jsonl file
        # TODO: and return uri to errors
        _ = self.step_implementations.apply_rules(working_dir, entity_manager, rule_metadata)

        for entity_name, entity in entity_manager.entities.items():
            entity_manager.entities[entity_name] = self.step_implementations.drop_row_id(entity)

        return entity_manager.entities, get_parent(dc_feedback_errors_uri), True

    def process(
        self,
        working_dir: URI,
        entity_locations: EntityLocations,
        contract_metadata: DataContractMetadata,
        rule_metadata: RuleMetadata,
        submission_info: Optional[SubmissionInfo] = None,
    ) -> tuple[MutableMapping[EntityName, URI], URI]:
        """Apply the data contract and the rules, write the entities out to parquet
        and return the entity locations and an errors URI.

        If the stage is unsuccessful no entities are written and the returned
        locations are empty.

        """
        entities, feedback_errors_uri, successful = self.apply(
            working_dir, entity_locations, contract_metadata, rule_metadata, submission_info
        )
        if successful:
            parquet_locations = self.write_entities_to_parquet(
                entities, joinuri(working_dir, "outputs")
            )
        else:
            parquet_locations = {}
        # NOTE(review): `apply` already returns `get_parent(...)` of the errors
        # URI, so this takes the parent a second time, whereas `process_legacy`
        # returns the URI from `apply` unchanged. Confirm which is intended.
        return parquet_locations, get_parent(feedback_errors_uri)

    def process_legacy(
        self,
        working_dir: URI,
        entity_locations: EntityLocations,
        contract_metadata: DataContractMetadata,
        rule_metadata: RuleMetadata,
        submission_info: Optional[SubmissionInfo] = None,
    ) -> tuple[MutableMapping[EntityName, DataFrame], URI]:
        """Apply the data contract and the rules, create Spark `DataFrame`s from the
        entities and return the Spark entities and the errors URI from `apply`.

        Entities may be omitted if they are blank, because Spark cannot create an
        entity from an empty parquet file.

        """
        self.logger.warning("DEPRECATED: Processing entities to Spark is deprecated")
        warnings.warn(
            "Converting entities to Spark is deprecated, and may be removed if the core engine "
            "changes the internal representation",
            category=DeprecationWarning,
            stacklevel=2,  # Attribute the warning to the caller.
        )

        entities, errors_uri, successful = self.apply(
            working_dir, entity_locations, contract_metadata, rule_metadata, submission_info
        )

        if not successful:
            return {}, errors_uri

        # Entities are already Spark DataFrames: nothing to convert.
        if self.__entity_type__ == DataFrame:
            return entities, errors_uri  # type: ignore

        return (
            self.convert_entities_to_spark(
                entities, joinuri(working_dir, "outputs"), _emit_deprecation_warning=False
            ),
            errors_uri,
        )
|
|
@@ -0,0 +1,454 @@
|
|
|
1
|
+
"""Base implementation of the data contract."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
from collections.abc import Iterable, Iterator
|
|
6
|
+
from inspect import ismethod
|
|
7
|
+
from typing import Any, ClassVar, Generic, Optional, TypeVar
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel
|
|
10
|
+
from typing_extensions import Protocol
|
|
11
|
+
|
|
12
|
+
from dve.common.error_utils import (
|
|
13
|
+
dump_processing_errors,
|
|
14
|
+
get_feedback_errors_uri,
|
|
15
|
+
get_processing_errors_uri,
|
|
16
|
+
)
|
|
17
|
+
from dve.core_engine.backends.base.core import get_entity_type
|
|
18
|
+
from dve.core_engine.backends.base.reader import BaseFileReader
|
|
19
|
+
from dve.core_engine.backends.exceptions import ReaderLacksEntityTypeSupport, render_error
|
|
20
|
+
from dve.core_engine.backends.metadata.contract import DataContractMetadata
|
|
21
|
+
from dve.core_engine.backends.readers import get_reader
|
|
22
|
+
from dve.core_engine.backends.types import Entities, EntityType, StageSuccessful
|
|
23
|
+
from dve.core_engine.backends.utilities import dedup_messages, stringify_model
|
|
24
|
+
from dve.core_engine.exceptions import CriticalProcessingError
|
|
25
|
+
from dve.core_engine.loggers import get_logger
|
|
26
|
+
from dve.core_engine.message import FeedbackMessage
|
|
27
|
+
from dve.core_engine.type_hints import (
|
|
28
|
+
URI,
|
|
29
|
+
ArbitraryFunction,
|
|
30
|
+
DVEStageName,
|
|
31
|
+
EntityLocations,
|
|
32
|
+
EntityName,
|
|
33
|
+
JSONDict,
|
|
34
|
+
Messages,
|
|
35
|
+
WrapDecorator,
|
|
36
|
+
)
|
|
37
|
+
from dve.parser.file_handling import get_file_suffix, get_resource_exists
|
|
38
|
+
from dve.parser.type_hints import Extension
|
|
39
|
+
|
|
40
|
+
T = TypeVar("T")

#: Configuration options for file extensions.
ExtensionConfig = dict[Extension, "ReaderConfig"]

#: The name of the reader override function's reader override attribute.
_READER_OVERRIDE_ATTR_NAME = "_implements_reader_for"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class ReaderConfig(BaseModel):
    """The reader to use, and the parameters to configure it with."""

    reader: str
    """The name of the reader to be used."""
    parameters: JSONDict
    """The parameters the reader should use."""
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class _UnboundReaderOverride(Protocol[T]):  # pylint: disable=too-few-public-methods
    """The call signature a reader-specific override must provide.

    The override is stored unbound, so the data contract instance is passed
    explicitly as the first argument.

    """

    @staticmethod
    def __call__(  # pylint: disable=bad-staticmethod-argument
        self: "BaseDataContract[T]",  # This is the protocol for an _unbound_ method.
        reader: BaseFileReader,
        resource: URI,
        entity_name: EntityName,
        schema: type[BaseModel],
    ) -> T:
        """Read `resource` with `reader` and return the entity."""
        ...
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def reader_override(reader_type: type[BaseFileReader]) -> WrapDecorator:
    """Build a decorator which marks a data contract method as the override
    implementation for `reader_type`.

    """

    def reader_impl_decorator(override_func: ArbitraryFunction) -> ArbitraryFunction:
        """Tag `override_func` with the reader type it overrides and return it
        otherwise unchanged.

        """
        setattr(override_func, _READER_OVERRIDE_ATTR_NAME, reader_type)
        return override_func

    return reader_impl_decorator
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class BaseDataContract(Generic[EntityType], ABC):
|
|
87
|
+
"""The base implementation of a data contract."""
|
|
88
|
+
|
|
89
|
+
__entity_type__: ClassVar[type[EntityType]] # type: ignore
|
|
90
|
+
"""
|
|
91
|
+
The entity type that should be requested from a reader without a
|
|
92
|
+
specific implementation.
|
|
93
|
+
|
|
94
|
+
This will be populated from the generic annotation at class creation time.
|
|
95
|
+
|
|
96
|
+
"""
|
|
97
|
+
__reader_overrides__: ClassVar[dict[type[BaseFileReader], _UnboundReaderOverride[EntityType]]] = {} # type: ignore # pylint: disable=line-too-long
|
|
98
|
+
"""
|
|
99
|
+
A dictionary mapping implemented reader types to override functions which provide
|
|
100
|
+
a 'local' implementation of the reader. These can provide a more optimised version
|
|
101
|
+
of a specific reader for the implemented backend.
|
|
102
|
+
|
|
103
|
+
This is set and populated in `__init_subclass__` by identifying methods
|
|
104
|
+
decorated with the '@reader_override' decorator, and is used in `read_entity_type`.
|
|
105
|
+
|
|
106
|
+
"""
|
|
107
|
+
__stage_name__: DVEStageName = "data_contract"
|
|
108
|
+
"""
|
|
109
|
+
The name of the data contract DVE stage for use in auditing and logging
|
|
110
|
+
"""
|
|
111
|
+
|
|
112
|
+
def __init_subclass__(cls, *_, **__) -> None:
    """Set up `__entity_type__` and a fresh `__reader_overrides__` mapping on
    each new subclass.

    Overrides are discovered by scanning the class for callables carrying the
    marker attribute set by the '@reader_override' decorator.

    """
    # The entity type comes from the subscript on the parent class.
    if cls is not BaseDataContract:
        cls.__entity_type__ = get_entity_type(cls, "BaseDataContract")

    # Each subclass gets its own mapping rather than sharing the parent's.
    overrides = cls.__reader_overrides__ = {}

    for attr_name in dir(cls):
        candidate = getattr(cls, attr_name, None)
        if not callable(candidate):
            continue

        marked_reader_type = getattr(candidate, _READER_OVERRIDE_ATTR_NAME, None)
        # Only markers naming a real reader class are honoured.
        is_reader_class = isinstance(marked_reader_type, type) and issubclass(
            marked_reader_type, BaseFileReader
        )
        if is_reader_class:
            overrides[marked_reader_type] = candidate  # type: ignore
|
|
137
|
+
|
|
138
|
+
def __init__(  # pylint: disable=unused-argument
    self,
    logger: Optional[logging.Logger] = None,
    **kwargs: Any,
):
    """Store the supplied logger, or create one named after the class.

    Extra keyword arguments are accepted and ignored.

    """
    if not logger:
        logger = get_logger(type(self).__name__)
    self.logger = logger
    """The `logging.Logger` instance for the data contract config."""
|
|
145
|
+
|
|
146
|
+
@abstractmethod
def create_entity_from_py_iterator(
    self, entity_name: EntityName, records: Iterator[dict[str, Any]], schema: type[BaseModel]
) -> EntityType:
    """Build an entity from an iterator of Python record dictionaries.

    This is the fallback used where no entity type specific reader
    implementations are available.

    """
|
|
154
|
+
|
|
155
|
+
def read_entity_from_py_iterator(
    self,
    reader: BaseFileReader,
    resource: URI,
    entity_name: EntityName,
    schema: type[BaseModel],
) -> EntityType:
    """Read records through the reader's 'read_to_py_iterator' implementation
    and turn them into an entity with `create_entity_from_py_iterator`.

    This is the last-resort path, used when there is neither a
    reader-specific override nor direct reader support for the contract's
    entity type.

    """
    records = reader.read_to_py_iterator(resource, entity_name, schema)
    return self.create_entity_from_py_iterator(entity_name, records, schema)
|
|
173
|
+
|
|
174
|
+
def read_entity(
    self,
    reader: BaseFileReader,
    resource: URI,
    entity_name: EntityName,
    schema: type[BaseModel],
) -> EntityType:
    """Read an entity using the provided reader instance.

    Three strategies are tried in order:

    1. a contract-specific override registered for the reader's exact type;
    2. the reader's own `read_to_entity_type` for the contract's entity type;
    3. the pure Python iterator fallback.

    NOTE: In the reader, simple types will either be returned as strings (if present)
    or `None`. Format validation, casting, and parsing should be done when the
    contract is applied.

    NOTE 2: The schema is stringified before it is handed to any of the
    strategies above.

    """
    string_schema = stringify_model(schema)

    override = self.__reader_overrides__.get(type(reader))
    if override is not None:
        self.logger.debug(f"Using contract-specific override for {type(reader).__name__}...")
        # Overrides are stored unbound, so pass the contract explicitly.
        return override(self, reader, resource, entity_name, string_schema)

    try:
        self.logger.debug("Attempting to read directly to contract entity type...")
        return reader.read_to_entity_type(
            self.__entity_type__, resource, entity_name, string_schema
        )
    except ReaderLacksEntityTypeSupport:
        # The reader cannot produce this entity type: use the fallback below.
        pass

    self.logger.debug("Reading via Python iterator...")
    return self.read_entity_from_py_iterator(reader, resource, entity_name, string_schema)
|
|
213
|
+
|
|
214
|
+
def _create_critical_error(
    self, entity_name: EntityName, error_message: str
) -> FeedbackMessage:
    """Build a whole-file 'integrity' feedback message for `entity_name`."""
    message_fields = {
        "record": None,
        "entity": entity_name,
        "failure_type": "integrity",
        "error_message": error_message,
        "error_location": "Whole file",
        "category": "Bad file",
    }
    return FeedbackMessage(**message_fields)
|
|
226
|
+
|
|
227
|
+
def _ensure_all_entities_provided(
    self, entity_names: Iterable[str], contract_metadata: DataContractMetadata
) -> Messages:
    """Ensure all entities are provided, with no extras.

    Args:
        entity_names: The names of the entities that were provided.
        contract_metadata: The contract metadata; the keys of its `schemas`
            are the expected entity names.

    Returns:
        One critical error per expected-but-missing entity, followed by one
        per provided-but-unrecognised entity, each group in sorted name order.

    """
    provided_entities = set(entity_names)
    expected_entities = set(contract_metadata.schemas)

    # Missing: in the contract but not supplied.
    missing_entities = sorted(expected_entities - provided_entities)
    # Extra: supplied but not in the contract.
    extra_entities = sorted(provided_entities - expected_entities)

    messages: Messages = []

    for entity_name in missing_entities:
        self.logger.error(f"No location specified for {entity_name!r}")
        messages.append(self._create_critical_error(entity_name, "Entity was not provided"))

    for entity_name in extra_entities:
        self.logger.error(f"Unrecognised entity provided ({entity_name!r})")
        messages.append(
            self._create_critical_error(entity_name, "Unrecognised entity name provided")
        )

    return messages
|
|
250
|
+
|
|
251
|
+
def _ensure_entity_locations_appropriate(
    self, entity_locations: EntityLocations, contract_metadata: DataContractMetadata
) -> Messages:
    """Check that every supplied location for a contract entity exists.

    Entities with no supplied location are skipped. A critical error is
    produced when the location does not exist, or when the existence check
    itself raises.

    """
    problems: Messages = []
    for entity_name in contract_metadata.schemas:
        try:
            entity_location = entity_locations[entity_name]
        except KeyError:
            continue

        described = f"{entity_name!r} (location: {entity_location!r})"
        try:
            if not get_resource_exists(entity_location):
                self.logger.error(f"Resource does not exist for {described}")
                problems.append(
                    self._create_critical_error(
                        entity_name, "The provided location does not exist"
                    )
                )
        except Exception as err:  # pylint: disable=broad-except
            self.logger.error(f"Error checking location exists for {described}")
            self.logger.exception(err)
            reason = f"Unable to ensure entity location exists ({type(err).__name__}: {err})"
            problems.append(self._create_critical_error(entity_name, reason))

    return problems
|
|
285
|
+
|
|
286
|
+
def _ensure_entity_locations_have_read_support(
    self, entity_locations: EntityLocations, contract_metadata: DataContractMetadata
) -> Messages:
    """Ensure that provided entity locations have supported readers.

    For each contract entity with a supplied location, the file extension is
    taken from the location and looked up in the entity's reader metadata.
    The extension is lower-cased first, matching the lookup performed in
    `read_raw_entities`.

    Returns:
        A critical error for each location that has no file extension, or
        whose extension has no configured reader.

    """
    messages: Messages = []

    for entity_name in contract_metadata.schemas:
        try:
            entity_location = entity_locations[entity_name]
        except KeyError:
            # No location supplied for this entity; nothing to check here.
            continue

        suffix = (get_file_suffix(entity_location) or "").lower()
        if not suffix:
            self.logger.error(
                f"{entity_name!r} (location: {entity_location!r}) missing file extension"
            )
            messages.append(self._create_critical_error(entity_name, "Missing file extension"))
            # Without an extension there is no reader to look up, and a second
            # error for the same file would only be noise.
            continue

        extension = f".{suffix}"
        if extension not in contract_metadata.reader_metadata[entity_name]:
            self.logger.error(
                f"{entity_name!r} (location: {entity_location!r}) does not have configured "
                f"reader for {extension} files"
            )
            error_message = f"Does not implement support for {extension!r} types"
            # The message must be recorded, otherwise the unsupported file
            # goes unreported.
            messages.append(self._create_critical_error(entity_name, error_message))

    return messages
|
|
316
|
+
|
|
317
|
+
def read_raw_entities(
    self, entity_locations: EntityLocations, contract_metadata: DataContractMetadata
) -> tuple[Entities, Messages, StageSuccessful]:
    """Load every entity from its location with the reader configured for its extension.

    The entities returned are raw: the data contract has not been applied yet.

    Three pre-flight checks run first. If any of them produces a critical
    message, nothing is read and ``({}, messages, False)`` is returned.
    Otherwise each location is read in turn; a failure while reading one entity
    is rendered into messages and marks the stage unsuccessful, but does not
    stop the remaining entities from being read.

    """
    preflight_checks = (
        self._ensure_all_entities_provided,
        self._ensure_entity_locations_appropriate,
        self._ensure_entity_locations_have_read_support,
    )
    messages: Messages = []
    for check in preflight_checks:
        messages.extend(check(entity_locations, contract_metadata))

    for message in messages:
        if message.is_critical:
            return {}, messages, False

    entities: Entities = {}
    successful = True
    for entity_name, resource in entity_locations.items():
        readers_by_extension = contract_metadata.reader_metadata[entity_name]
        suffix = get_file_suffix(resource) or ""
        # Already checked that extension supported.
        reader_config = readers_by_extension["." + suffix.lower()]
        reader = get_reader(reader_config.reader)(**reader_config.parameters)

        self.logger.info(f"Reading entity {entity_name!r} using {reader_config.reader!r}")
        try:
            entities[entity_name] = self.read_entity(
                reader,
                resource,
                entity_name,
                contract_metadata.schemas[entity_name],  # type: ignore
            )
        except Exception as err:  # pylint: disable=broad-except
            successful = False
            messages.extend(
                render_error(
                    err,
                    f"data contract (reading entity {entity_name!r} from {resource!r})",
                    self.logger,
                    entity_name=entity_name,
                    error_location="Whole file",
                    error_category="Bad file",
                )
            )

    return entities, dedup_messages(messages), successful
|
|
371
|
+
|
|
372
|
+
@abstractmethod
def apply_data_contract(
    self,
    working_dir: URI,
    entities: Entities,
    entity_locations: EntityLocations,
    contract_metadata: DataContractMetadata,
    key_fields: Optional[dict[str, list[str]]] = None,
) -> tuple[Entities, URI, StageSuccessful]:
    """Validate the raw entities against the data contract.

    Implementations are expected to add record-level identifiers at this
    point.

    Returns a tuple of the resulting entities, a URI and a flag saying whether
    the stage succeeded.
    NOTE(review): `apply` binds the returned URI to `feedback_errors_uri`, so
    it is presumably where feedback messages were written - confirm against
    the concrete backends.

    """
    raise NotImplementedError
|
|
388
|
+
|
|
389
|
+
def apply(
    self,
    working_dir: URI,
    entity_locations: EntityLocations,
    contract_metadata: DataContractMetadata,
    key_fields: Optional[dict[str, list[str]]] = None,
) -> tuple[Entities, URI, StageSuccessful, URI]:
    """Read the entities from their locations and apply the data contract to them.

    Returns ``(entities, feedback_errors_uri, successful, processing_errors_uri)``.

    If reading the raw entities fails, a critical processing error is dumped
    and an empty entity mapping is returned. If applying the contract raises,
    the error is rendered and dumped, the stage is marked unsuccessful, and the
    entities as read are returned.

    """
    stage = self.__stage_name__
    feedback_errors_uri = get_feedback_errors_uri(working_dir, stage)
    processing_errors_uri = get_processing_errors_uri(working_dir)

    entities, messages, successful = self.read_raw_entities(entity_locations, contract_metadata)
    if not successful:
        read_failure = CriticalProcessingError(
            "Issue occurred while reading raw entities",
            [msg.error_message for msg in messages],
        )
        dump_processing_errors(working_dir, stage, [read_failure])
        return {}, feedback_errors_uri, successful, processing_errors_uri

    try:
        entities, feedback_errors_uri, successful = self.apply_data_contract(
            working_dir, entities, entity_locations, contract_metadata, key_fields
        )
    except Exception as err:  # pylint: disable=broad-except
        successful = False
        rendered = render_error(err, stage, self.logger)
        contract_failure = CriticalProcessingError(
            f"Issue occurred while applying {stage}",
            [msg.error_message for msg in rendered],
        )
        dump_processing_errors(working_dir, stage, [contract_failure])

    if contract_metadata.cache_originals:
        # Iterate over a snapshot of the names because the mapping grows in the loop.
        for name in tuple(entities):
            entities[f"Original{name}"] = entities[name]

    return entities, feedback_errors_uri, successful, processing_errors_uri
|
|
443
|
+
|
|
444
|
+
def read_parquet(self, path: URI, **kwargs) -> EntityType:
    """Read an entity from the stringified parquet output of the file transformation phase.

    This base implementation is a placeholder and always raises
    ``NotImplementedError``.

    """
    raise NotImplementedError
|
|
449
|
+
|
|
450
|
+
def write_parquet(self, entity: EntityType, target_location: URI, **kwargs) -> URI:
    """Write a type cast entity to parquet once the data contract has been applied.

    This base implementation is a placeholder and always raises
    ``NotImplementedError``.

    """
    raise NotImplementedError
|