data-validation-engine 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. data_validation_engine-0.6.2.dist-info/METADATA +104 -0
  2. data_validation_engine-0.6.2.dist-info/RECORD +105 -0
  3. data_validation_engine-0.6.2.dist-info/WHEEL +4 -0
  4. data_validation_engine-0.6.2.dist-info/licenses/LICENSE +21 -0
  5. dve/__init__.py +0 -0
  6. dve/common/__init__.py +0 -0
  7. dve/common/error_utils.py +189 -0
  8. dve/core_engine/__init__.py +0 -0
  9. dve/core_engine/backends/__init__.py +1 -0
  10. dve/core_engine/backends/base/__init__.py +1 -0
  11. dve/core_engine/backends/base/auditing.py +618 -0
  12. dve/core_engine/backends/base/backend.py +240 -0
  13. dve/core_engine/backends/base/contract.py +454 -0
  14. dve/core_engine/backends/base/core.py +124 -0
  15. dve/core_engine/backends/base/reader.py +176 -0
  16. dve/core_engine/backends/base/reference_data.py +217 -0
  17. dve/core_engine/backends/base/rules.py +685 -0
  18. dve/core_engine/backends/base/utilities.py +146 -0
  19. dve/core_engine/backends/exceptions.py +311 -0
  20. dve/core_engine/backends/implementations/__init__.py +1 -0
  21. dve/core_engine/backends/implementations/duckdb/__init__.py +26 -0
  22. dve/core_engine/backends/implementations/duckdb/auditing.py +234 -0
  23. dve/core_engine/backends/implementations/duckdb/contract.py +213 -0
  24. dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +288 -0
  25. dve/core_engine/backends/implementations/duckdb/readers/__init__.py +13 -0
  26. dve/core_engine/backends/implementations/duckdb/readers/csv.py +222 -0
  27. dve/core_engine/backends/implementations/duckdb/readers/json.py +50 -0
  28. dve/core_engine/backends/implementations/duckdb/readers/xml.py +45 -0
  29. dve/core_engine/backends/implementations/duckdb/reference_data.py +49 -0
  30. dve/core_engine/backends/implementations/duckdb/rules.py +534 -0
  31. dve/core_engine/backends/implementations/duckdb/types.py +47 -0
  32. dve/core_engine/backends/implementations/duckdb/utilities.py +41 -0
  33. dve/core_engine/backends/implementations/spark/__init__.py +22 -0
  34. dve/core_engine/backends/implementations/spark/auditing.py +230 -0
  35. dve/core_engine/backends/implementations/spark/backend.py +78 -0
  36. dve/core_engine/backends/implementations/spark/contract.py +241 -0
  37. dve/core_engine/backends/implementations/spark/readers/__init__.py +15 -0
  38. dve/core_engine/backends/implementations/spark/readers/csv.py +77 -0
  39. dve/core_engine/backends/implementations/spark/readers/json.py +66 -0
  40. dve/core_engine/backends/implementations/spark/readers/xml.py +202 -0
  41. dve/core_engine/backends/implementations/spark/reference_data.py +42 -0
  42. dve/core_engine/backends/implementations/spark/rules.py +430 -0
  43. dve/core_engine/backends/implementations/spark/spark_helpers.py +412 -0
  44. dve/core_engine/backends/implementations/spark/types.py +21 -0
  45. dve/core_engine/backends/implementations/spark/utilities.py +144 -0
  46. dve/core_engine/backends/metadata/__init__.py +47 -0
  47. dve/core_engine/backends/metadata/contract.py +80 -0
  48. dve/core_engine/backends/metadata/reporting.py +374 -0
  49. dve/core_engine/backends/metadata/rules.py +737 -0
  50. dve/core_engine/backends/readers/__init__.py +41 -0
  51. dve/core_engine/backends/readers/csv.py +232 -0
  52. dve/core_engine/backends/readers/utilities.py +21 -0
  53. dve/core_engine/backends/readers/xml.py +432 -0
  54. dve/core_engine/backends/readers/xml_linting.py +142 -0
  55. dve/core_engine/backends/types.py +26 -0
  56. dve/core_engine/backends/utilities.py +177 -0
  57. dve/core_engine/configuration/__init__.py +1 -0
  58. dve/core_engine/configuration/base.py +56 -0
  59. dve/core_engine/configuration/v1/__init__.py +351 -0
  60. dve/core_engine/configuration/v1/filters.py +60 -0
  61. dve/core_engine/configuration/v1/rule_stores/__init__.py +1 -0
  62. dve/core_engine/configuration/v1/rule_stores/models.py +57 -0
  63. dve/core_engine/configuration/v1/steps.py +365 -0
  64. dve/core_engine/constants.py +8 -0
  65. dve/core_engine/engine.py +265 -0
  66. dve/core_engine/exceptions.py +29 -0
  67. dve/core_engine/functions/__init__.py +6 -0
  68. dve/core_engine/functions/implementations.py +200 -0
  69. dve/core_engine/loggers.py +57 -0
  70. dve/core_engine/message.py +512 -0
  71. dve/core_engine/models.py +196 -0
  72. dve/core_engine/templating.py +114 -0
  73. dve/core_engine/type_hints.py +255 -0
  74. dve/core_engine/validation.py +160 -0
  75. dve/metadata_parser/__init__.py +2 -0
  76. dve/metadata_parser/domain_types.py +682 -0
  77. dve/metadata_parser/exc.py +44 -0
  78. dve/metadata_parser/function_library.py +64 -0
  79. dve/metadata_parser/function_wrapper.py +201 -0
  80. dve/metadata_parser/model_generator.py +119 -0
  81. dve/metadata_parser/models.py +410 -0
  82. dve/metadata_parser/utilities.py +54 -0
  83. dve/parser/__init__.py +1 -0
  84. dve/parser/exceptions.py +50 -0
  85. dve/parser/file_handling/__init__.py +31 -0
  86. dve/parser/file_handling/helpers.py +29 -0
  87. dve/parser/file_handling/implementations/__init__.py +7 -0
  88. dve/parser/file_handling/implementations/base.py +97 -0
  89. dve/parser/file_handling/implementations/dbfs.py +81 -0
  90. dve/parser/file_handling/implementations/file.py +203 -0
  91. dve/parser/file_handling/implementations/s3.py +371 -0
  92. dve/parser/file_handling/log_handler.py +215 -0
  93. dve/parser/file_handling/service.py +441 -0
  94. dve/parser/file_handling/utilities.py +53 -0
  95. dve/parser/type_hints.py +46 -0
  96. dve/parser/utilities.py +113 -0
  97. dve/pipeline/__init__.py +0 -0
  98. dve/pipeline/duckdb_pipeline.py +56 -0
  99. dve/pipeline/foundry_ddb_pipeline.py +171 -0
  100. dve/pipeline/pipeline.py +935 -0
  101. dve/pipeline/spark_pipeline.py +69 -0
  102. dve/pipeline/utils.py +96 -0
  103. dve/reporting/__init__.py +1 -0
  104. dve/reporting/error_report.py +153 -0
  105. dve/reporting/excel_report.py +319 -0
@@ -0,0 +1,240 @@
1
+ """A complete backend implementation."""
2
+
3
+ import logging
4
+ import warnings
5
+ from abc import ABC, abstractmethod
6
+ from collections.abc import Mapping, MutableMapping
7
+ from typing import Any, ClassVar, Generic, Optional
8
+
9
+ from pyspark.sql import DataFrame, SparkSession
10
+
11
+ from dve.core_engine.backends.base.contract import BaseDataContract
12
+ from dve.core_engine.backends.base.core import EntityManager, get_entity_type
13
+ from dve.core_engine.backends.base.reference_data import BaseRefDataLoader, ReferenceConfigUnion
14
+ from dve.core_engine.backends.base.rules import BaseStepImplementations
15
+ from dve.core_engine.backends.metadata.contract import DataContractMetadata
16
+ from dve.core_engine.backends.metadata.rules import RuleMetadata
17
+ from dve.core_engine.backends.types import Entities, EntityType, StageSuccessful
18
+ from dve.core_engine.loggers import get_logger
19
+ from dve.core_engine.models import SubmissionInfo
20
+ from dve.core_engine.type_hints import URI, EntityLocations, EntityName, EntityParquetLocations
21
+ from dve.parser.file_handling.service import get_parent, joinuri
22
+
23
+
24
+ class BaseBackend(Generic[EntityType], ABC):
25
+ """A complete implementation of a backend."""
26
+
27
+ __entity_type__: ClassVar[type[EntityType]] # type: ignore
28
+ """
29
+ The entity type used within the backend.
30
+
31
+ This will be populated from the generic annotation at class creation time.
32
+
33
+ """
34
+
35
def __init_subclass__(cls, *_, **__) -> None:
    """Populate `__entity_type__` on every concrete backend subclass.

    The entity type is resolved by `get_entity_type` from the subclass's
    `BaseBackend[...]` subscript.

    """
    if cls is BaseBackend:
        return
    cls.__entity_type__ = get_entity_type(cls, "BaseBackend")
39
+
40
def __init__(  # pylint: disable=unused-argument
    self,
    contract: BaseDataContract[EntityType],
    steps: BaseStepImplementations[EntityType],
    reference_data_loader_type: Optional[type[BaseRefDataLoader[EntityType]]],
    logger: Optional[logging.Logger] = None,
    **kwargs: Any,
) -> None:
    """Wire the backend's components together, checking their entity types agree.

    Args:
        contract: the data contract implementation to use.
        steps: the step implementations to use.
        reference_data_loader_type: the loader class for reference data, or
            `None` when this backend does not load reference data.
        logger: an optional logger; one named after the concrete class is
            created when omitted.
        **kwargs: accepted and ignored.

    Raises:
        TypeError: if a supplied component's `__entity_type__` differs from
            the backend's own `__entity_type__`.

    """
    components = [
        ("Contract", contract),
        ("Step implementation", steps),
    ]
    # `None` is a documented, valid value for the loader type (see
    # `load_reference_data`), so only check the loader when one is supplied.
    # Checking `None` would always fail, because it has no `__entity_type__`.
    if reference_data_loader_type is not None:
        components.append(("Reference data loader", reference_data_loader_type))

    for component_name, component in components:
        component_entity_type = getattr(component, "__entity_type__", None)
        if component_entity_type != self.__entity_type__:
            raise TypeError(
                f"{component_name} entity type ({component_entity_type}) does not match "
                + f"the type expected by this backend ({self.__entity_type__})"
            )

    self.contract = contract
    """The data contract implementation used by the backend."""
    self.step_implementations = steps
    """The step implementations used by the backend."""
    self.reference_data_loader_type = reference_data_loader_type
    """
    The loader type to use for the reference data. If `None`, do not
    load any reference data and error if it is provided.

    """
    self.logger = logger or get_logger(type(self).__name__)
    """The `logging.Logger` instance for the backend."""
72
+
73
def load_reference_data(
    self,
    reference_entity_config: dict[EntityName, ReferenceConfigUnion],
    submission_info: Optional[SubmissionInfo],
) -> Mapping[EntityName, EntityType]:
    """Build the reference data mapping described by `reference_entity_config`.

    When `submission_info` is truthy it is converted with
    `convert_submission_info` and exposed under the `"dve_submission_info"` key.

    Raises:
        ValueError: if reference data is configured but the backend has no
            reference data loader type.

    """
    submission_entity: Optional[EntityType] = (
        self.convert_submission_info(submission_info) if submission_info else None
    )

    loader_type = self.reference_data_loader_type
    if loader_type is not None:
        # The loader itself is returned; the submission info is injected
        # through its entity cache.
        loader = loader_type(reference_entity_config)
        if submission_entity is not None:
            loader.entity_cache["dve_submission_info"] = submission_entity
        return loader

    # No loader configured: any configured reference data is an error.
    if reference_entity_config:
        raise ValueError(
            "Reference data has been specified but no reference data loader is "
            "configured for this backend"
        )

    if submission_entity is None:
        return {}
    return {"dve_submission_info": submission_entity}
100
+
101
@abstractmethod
def convert_submission_info(self, submission_info: SubmissionInfo) -> EntityType:
    """Convert the submission info to an entity.

    `load_reference_data` calls this and exposes the result as reference
    data under the `"dve_submission_info"` key.

    """
104
+
105
@abstractmethod
def write_entities_to_parquet(
    self, entities: Entities, cache_prefix: URI
) -> EntityParquetLocations:
    """Write entities out to parquet, returning the locations.

    The returned mapping is iterated as `(entity_name, parquet_location)`
    pairs by `convert_entities_to_spark`. `process` and `process_legacy` pass
    `joinuri(working_dir, "outputs")` as `cache_prefix`.

    """
    raise NotImplementedError()
111
+
112
def convert_entities_to_spark(
    self, entities: Entities, cache_prefix: URI, _emit_deprecation_warning: bool = True
) -> dict[EntityName, DataFrame]:
    """Round-trip entities through parquet to obtain Spark DataFrames.

    The entities are written with `write_entities_to_parquet` and each
    location is read back with the active Spark session. An entity whose
    parquet cannot be read back is logged and left out of the result (for
    example blank entities, because Spark cannot create an entity from an
    empty parquet file).

    """
    if _emit_deprecation_warning:
        self.logger.warning("DEPRECATED: Converting entities to Spark is deprecated")
        deprecation_text = (
            "Converting entities to Spark is deprecated, and may be removed if the core engine "
            "changes the internal representation"
        )
        warnings.warn(deprecation_text, category=DeprecationWarning)

    locations = self.write_entities_to_parquet(entities, cache_prefix)
    session = SparkSession.builder.getOrCreate()

    converted: dict[EntityName, DataFrame] = {}
    for name, location in locations.items():
        try:
            frame = session.read.parquet(location)
        except Exception as err:  # pylint: disable=broad-except
            # Best effort: report the failure and omit the entity.
            self.logger.warning(
                f"Failed to read entity {name!r} back from parquet location {location!r}"
            )
            self.logger.exception(err)
        else:
            converted[name] = frame
    return converted
143
+
144
def apply(
    self,
    working_dir: URI,
    entity_locations: EntityLocations,
    contract_metadata: DataContractMetadata,
    rule_metadata: RuleMetadata,
    submission_info: Optional[SubmissionInfo] = None,
) -> tuple[Entities, URI, StageSuccessful]:
    """Run the data contract and then the rules over the supplied entities.

    Returns:
        A tuple of the entities, the parent of an errors URI, and whether the
        data contract stage succeeded. On contract failure the URI is the
        parent of the processing errors URI and the rules are not applied;
        otherwise it is the parent of the data contract feedback errors URI.

    """
    reference_data = self.load_reference_data(
        rule_metadata.reference_data_config, submission_info
    )
    contract_result = self.contract.apply(working_dir, entity_locations, contract_metadata)
    entities, dc_feedback_errors_uri, successful, processing_errors_uri = contract_result
    if not successful:
        return entities, get_parent(processing_errors_uri), successful

    steps = self.step_implementations
    # Row IDs are attached for the duration of rule application only.
    for name in list(entities):
        entities[name] = steps.add_row_id(entities[name])

    # TODO: Handle entity manager creation errors.
    entity_manager = EntityManager(entities, reference_data)
    # TODO: Add stage success to 'apply_rules'
    # TODO: In case of large errors in business rules, write messages to jsonl file
    # TODO: and return uri to errors
    steps.apply_rules(working_dir, entity_manager, rule_metadata)

    for name in list(entity_manager.entities):
        entity_manager.entities[name] = steps.drop_row_id(entity_manager.entities[name])

    return entity_manager.entities, get_parent(dc_feedback_errors_uri), True
179
+
180
def process(
    self,
    working_dir: URI,
    entity_locations: EntityLocations,
    contract_metadata: DataContractMetadata,
    rule_metadata: RuleMetadata,
    submission_info: Optional[SubmissionInfo] = None,
) -> tuple[MutableMapping[EntityName, URI], URI]:
    """Run `apply` and persist the resulting entities as parquet.

    Returns:
        The parquet locations of the written entities (empty when `apply`
        reports failure) and the parent of the errors URI returned by `apply`.

    """
    entities, feedback_errors_uri, successful = self.apply(
        working_dir, entity_locations, contract_metadata, rule_metadata, submission_info
    )
    parquet_locations = (
        self.write_entities_to_parquet(entities, joinuri(working_dir, "outputs"))
        if successful
        else {}
    )
    # NOTE(review): `apply` already returns `get_parent(...)` of an errors URI,
    # so this takes the parent a second time, whereas `process_legacy` returns
    # the URI from `apply` unchanged. Confirm which of the two is intended.
    return parquet_locations, get_parent(feedback_errors_uri)
202
+
203
def process_legacy(
    self,
    working_dir: URI,
    entity_locations: EntityLocations,
    contract_metadata: DataContractMetadata,
    rule_metadata: RuleMetadata,
    submission_info: Optional[SubmissionInfo] = None,
) -> tuple[MutableMapping[EntityName, DataFrame], URI]:
    """Deprecated: run `apply` and hand the entities back as Spark `DataFrame`s.

    If the backend's entity type is already `DataFrame` the entities are
    returned as they are; otherwise they are converted with
    `convert_entities_to_spark`. When `apply` reports failure an empty mapping
    is returned.

    Entities may be omitted if they are blank, because Spark cannot create an
    entity from an empty parquet file.

    """
    self.logger.warning("DEPRECATED: Processing entities to Spark is deprecated")
    warnings.warn(
        "Converting entities to Spark is deprecated, and may be removed if the core engine "
        "changes the internal representation",
        category=DeprecationWarning,
    )

    entities, errors_uri, successful = self.apply(
        working_dir, entity_locations, contract_metadata, rule_metadata, submission_info
    )

    if not successful:
        spark_entities = {}
    elif self.__entity_type__ == DataFrame:
        spark_entities = entities  # type: ignore
    else:
        # The deprecation warning was already emitted above.
        spark_entities = self.convert_entities_to_spark(
            entities, joinuri(working_dir, "outputs"), _emit_deprecation_warning=False
        )
    return spark_entities, errors_uri
@@ -0,0 +1,454 @@
1
+ """Base implementation of the data contract."""
2
+
3
+ import logging
4
+ from abc import ABC, abstractmethod
5
+ from collections.abc import Iterable, Iterator
6
+ from inspect import ismethod
7
+ from typing import Any, ClassVar, Generic, Optional, TypeVar
8
+
9
+ from pydantic import BaseModel
10
+ from typing_extensions import Protocol
11
+
12
+ from dve.common.error_utils import (
13
+ dump_processing_errors,
14
+ get_feedback_errors_uri,
15
+ get_processing_errors_uri,
16
+ )
17
+ from dve.core_engine.backends.base.core import get_entity_type
18
+ from dve.core_engine.backends.base.reader import BaseFileReader
19
+ from dve.core_engine.backends.exceptions import ReaderLacksEntityTypeSupport, render_error
20
+ from dve.core_engine.backends.metadata.contract import DataContractMetadata
21
+ from dve.core_engine.backends.readers import get_reader
22
+ from dve.core_engine.backends.types import Entities, EntityType, StageSuccessful
23
+ from dve.core_engine.backends.utilities import dedup_messages, stringify_model
24
+ from dve.core_engine.exceptions import CriticalProcessingError
25
+ from dve.core_engine.loggers import get_logger
26
+ from dve.core_engine.message import FeedbackMessage
27
+ from dve.core_engine.type_hints import (
28
+ URI,
29
+ ArbitraryFunction,
30
+ DVEStageName,
31
+ EntityLocations,
32
+ EntityName,
33
+ JSONDict,
34
+ Messages,
35
+ WrapDecorator,
36
+ )
37
+ from dve.parser.file_handling import get_file_suffix, get_resource_exists
38
+ from dve.parser.type_hints import Extension
39
+
40
+ T = TypeVar("T")
41
+ ExtensionConfig = dict[Extension, "ReaderConfig"]
42
+ """Configuration options for file extensions."""
43
+ _READER_OVERRIDE_ATTR_NAME = "_implements_reader_for"
44
+ """The name of the reader override function's reader override attribute."""
45
+
46
+
47
class ReaderConfig(BaseModel):
    """Configuration options for a given reader.

    Used as the value type of `ExtensionConfig`, i.e. one reader
    configuration per file extension.

    """

    reader: str
    """The name of the reader to be used."""
    parameters: JSONDict
    """The parameters the reader should use."""
54
+
55
+
56
class _UnboundReaderOverride(Protocol[T]):  # pylint: disable=too-few-public-methods
    """The protocol required to implement an override for a specific file reader.

    Overrides are stored unbound in `BaseDataContract.__reader_overrides__`
    and are invoked by `read_entity` with the contract instance passed
    explicitly as the first argument.

    """

    @staticmethod
    def __call__(  # pylint: disable=bad-staticmethod-argument
        self: "BaseDataContract[T]",  # This is the protocol for an _unbound_ method.
        reader: BaseFileReader,
        resource: URI,
        entity_name: EntityName,
        schema: type[BaseModel],
    ) -> T: ...
67
+
68
+
69
def reader_override(reader_type: type[BaseFileReader]) -> WrapDecorator:
    """Build a decorator marking a contract method as the override for `reader_type`.

    The decorated function is returned unchanged apart from gaining the
    `_READER_OVERRIDE_ATTR_NAME` attribute, which
    `BaseDataContract.__init_subclass__` looks for when collecting overrides.

    """

    def tag_with_reader_type(func: ArbitraryFunction) -> ArbitraryFunction:
        """Attach the overridden reader type to `func` and hand `func` back."""
        setattr(func, _READER_OVERRIDE_ATTR_NAME, reader_type)
        return func

    return tag_with_reader_type
84
+
85
+
86
+ class BaseDataContract(Generic[EntityType], ABC):
87
+ """The base implementation of a data contract."""
88
+
89
+ __entity_type__: ClassVar[type[EntityType]] # type: ignore
90
+ """
91
+ The entity type that should be requested from a reader without a
92
+ specific implementation.
93
+
94
+ This will be populated from the generic annotation at class creation time.
95
+
96
+ """
97
+ __reader_overrides__: ClassVar[dict[type[BaseFileReader], _UnboundReaderOverride[EntityType]]] = {} # type: ignore # pylint: disable=line-too-long
98
+ """
99
+ A dictionary mapping implemented reader types to override functions which provide
100
+ a 'local' implementation of the reader. These can provide a more optimised version
101
+ of a specific reader for the implemented backend.
102
+
103
+ This is set and populated in `__init_subclass__` by identifying methods
104
+ decorated with the '@reader_override' decorator, and is used in `read_entity_type`.
105
+
106
+ """
107
+ __stage_name__: DVEStageName = "data_contract"
108
+ """
109
+ The name of the data contract DVE stage for use in auditing and logging
110
+ """
111
+
112
def __init_subclass__(cls, *_, **__) -> None:
    """Initialise the per-subclass `__entity_type__` and `__reader_overrides__`.

    The entity type comes from the `BaseDataContract[...]` subscript. The
    overrides are every callable attribute of the class that was tagged by
    `@reader_override` with a `BaseFileReader` subclass.

    """
    # Set entity type from parent class subscript.
    if cls is not BaseDataContract:
        cls.__entity_type__ = get_entity_type(cls, "BaseDataContract")

    # Each subclass gets its own registry rather than sharing the parent's.
    overrides = {}
    cls.__reader_overrides__ = overrides

    for attribute_name in dir(cls):
        candidate = getattr(cls, attribute_name, None)
        # Bound methods are callable too, so `callable` covers both cases.
        if not callable(candidate):
            continue

        reader_type = getattr(candidate, _READER_OVERRIDE_ATTR_NAME, None)
        if isinstance(reader_type, type) and issubclass(reader_type, BaseFileReader):
            overrides[reader_type] = candidate  # type: ignore
137
+
138
def __init__(  # pylint: disable=unused-argument
    self,
    logger: Optional[logging.Logger] = None,
    **kwargs: Any,
):
    """Store the logger, creating one named after the concrete class if needed.

    Extra keyword arguments are accepted and ignored.

    """
    if not logger:
        logger = get_logger(type(self).__name__)
    self.logger = logger
    """The `logging.Logger` instance for the data contract config."""
145
+
146
@abstractmethod
def create_entity_from_py_iterator(
    self, entity_name: EntityName, records: Iterator[dict[str, Any]], schema: type[BaseModel]
) -> EntityType:
    """A fallback function to be used where no entity type specific
    reader implementations are available.

    `read_entity_from_py_iterator` calls this with the iterator returned by
    the reader's `read_to_py_iterator`.

    """
154
+
155
def read_entity_from_py_iterator(
    self,
    reader: BaseFileReader,
    resource: URI,
    entity_name: EntityName,
    schema: type[BaseModel],
) -> EntityType:
    """Read records through the reader's Python iterator and build an entity.

    This is the last-resort path used by `read_entity` when there is neither
    a reader-specific override nor direct reader support for the contract's
    entity type.

    """
    records = reader.read_to_py_iterator(resource, entity_name, schema)
    entity = self.create_entity_from_py_iterator(entity_name, records, schema)
    return entity
173
+
174
def read_entity(
    self,
    reader: BaseFileReader,
    resource: URI,
    entity_name: EntityName,
    schema: type[BaseModel],
) -> EntityType:
    """Read an entity using the provided reader instance.

    Three strategies are tried in order:

    1. a contract-specific override registered for the reader's exact type;
    2. the reader's own `read_to_entity_type` for the contract's entity type;
    3. the pure Python iterator fallback.

    NOTE: In the reader, simple types will either be returned as strings (if present)
    or `None`. Format validation, casting, and parsing should be done when the
    contract is applied.

    NOTE 2: The schema is stringified before it is passed to any of the
    strategies above.

    """
    schema = stringify_model(schema)

    override = self.__reader_overrides__.get(type(reader))
    if override is not None:
        self.logger.debug(f"Using contract-specific override for {type(reader).__name__}...")
        return override(self, reader, resource, entity_name, schema)

    try:
        # No override: see whether the reader can produce the contract's
        # entity type directly.
        self.logger.debug("Attempting to read directly to contract entity type...")
        return reader.read_to_entity_type(self.__entity_type__, resource, entity_name, schema)
    except ReaderLacksEntityTypeSupport:
        pass

    # Finally, fall back to using the pure Python reader and creating an entity.
    self.logger.debug("Reading via Python iterator...")
    return self.read_entity_from_py_iterator(reader, resource, entity_name, schema)
213
+
214
def _create_critical_error(
    self, entity_name: EntityName, error_message: str
) -> FeedbackMessage:
    """Build a whole-file `integrity` feedback message for `entity_name`."""
    details = {
        "entity": entity_name,
        "record": None,
        "failure_type": "integrity",
        "error_location": "Whole file",
        "error_message": error_message,
        "category": "Bad file",
    }
    return FeedbackMessage(**details)
226
+
227
def _ensure_all_entities_provided(
    self, entity_names: Iterable[str], contract_metadata: DataContractMetadata
) -> Messages:
    """Ensure all entities are provided, with no extras.

    Args:
        entity_names: the names of the entities that were supplied.
        contract_metadata: metadata whose `schemas` keys are the expected
            entity names.

    Returns:
        One critical error per expected entity that was not supplied
        ("Entity was not provided"), followed by one per supplied entity the
        contract does not define ("Unrecognised entity name provided"). Each
        group is sorted by entity name.

    """
    provided_entities = set(entity_names)
    expected_entities = set(contract_metadata.schemas.keys())

    # Missing: the contract expects it but it was not supplied.
    # Extra: it was supplied but the contract does not know it.
    missing_entities = sorted(expected_entities - provided_entities)
    extra_entities = sorted(provided_entities - expected_entities)

    messages: Messages = []

    for entity_name in missing_entities:
        self.logger.error(f"No location specified for {entity_name!r}")
        messages.append(self._create_critical_error(entity_name, "Entity was not provided"))

    for entity_name in extra_entities:
        self.logger.error(f"Unrecognised entity provided ({entity_name!r})")
        messages.append(
            self._create_critical_error(entity_name, "Unrecognised entity name provided")
        )

    return messages
250
+
251
def _ensure_entity_locations_appropriate(
    self, entity_locations: EntityLocations, contract_metadata: DataContractMetadata
) -> Messages:
    """Check that every supplied location for a contract entity exists.

    Entities without a supplied location are skipped here. A critical error
    is produced when the resource does not exist, and also when the
    existence check itself raises.

    """
    problems: Messages = []
    for entity_name in contract_metadata.schemas:
        try:
            entity_location = entity_locations[entity_name]
        except KeyError:
            continue

        try:
            if get_resource_exists(entity_location):
                continue
            self.logger.error(
                f"Resource does not exist for {entity_name!r} (location: {entity_location!r})"
            )
            problem = self._create_critical_error(
                entity_name, "The provided location does not exist"
            )
        except Exception as err:  # pylint: disable=broad-except
            self.logger.error(
                f"Error checking location exists for {entity_name!r} "
                f"(location: {entity_location!r})"
            )
            self.logger.exception(err)
            problem = self._create_critical_error(
                entity_name,
                f"Unable to ensure entity location exists ({type(err).__name__}: {err})",
            )
        problems.append(problem)

    return problems
285
+
286
def _ensure_entity_locations_have_read_support(
    self, entity_locations: EntityLocations, contract_metadata: DataContractMetadata
) -> Messages:
    """Ensure that provided entity locations have supported readers.

    Entities without a supplied location are skipped here.

    Returns:
        One critical error per supplied contract entity whose location has
        no file extension, or whose extension has no configured reader in
        `contract_metadata.reader_metadata`.

    """
    messages: Messages = []

    for entity_name in contract_metadata.schemas:
        try:
            entity_location = entity_locations[entity_name]
        except KeyError:
            continue

        # Lower-case the suffix so this check looks up the same key that
        # `read_raw_entities` uses when it selects the reader.
        suffix = (get_file_suffix(entity_location) or "").lower()
        if not suffix:
            self.logger.error(
                f"{entity_name!r} (location: {entity_location!r}) missing file extension"
            )
            messages.append(self._create_critical_error(entity_name, "Missing file extension"))
            # Without an extension there is no reader to look up, so do not
            # go on to report a second error for the bare "." extension.
            continue

        extension = f".{suffix}"
        if extension not in contract_metadata.reader_metadata[entity_name]:
            self.logger.error(
                f"{entity_name!r} (location: {entity_location!r}) does not have configured "
                + f"reader for {extension} files"
            )
            error_message = f"Does not implement support for {extension!r} types"
            # Previously this message was built but never added to the result.
            messages.append(self._create_critical_error(entity_name, error_message))

    return messages
316
+
317
def read_raw_entities(
    self, entity_locations: EntityLocations, contract_metadata: DataContractMetadata
) -> tuple[Entities, Messages, StageSuccessful]:
    """Read the raw entities from the entity locations using the configured readers.

    The data contract has not yet been applied to these entities. The three
    pre-flight checks run first; if any of their messages is critical,
    nothing is read and an empty mapping is returned with `False`.

    Returns:
        The entities that could be read, the de-duplicated messages, and
        whether every entity was read without raising.

    """
    messages: Messages = [
        *self._ensure_all_entities_provided(entity_locations, contract_metadata),
        *self._ensure_entity_locations_appropriate(entity_locations, contract_metadata),
        *self._ensure_entity_locations_have_read_support(entity_locations, contract_metadata),
    ]
    if any(message.is_critical for message in messages):
        return {}, messages, False

    entities: Entities = {}
    all_read = True
    for entity_name, resource in entity_locations.items():
        readers_by_extension = contract_metadata.reader_metadata[entity_name]
        # Already checked that extension supported.
        suffix = get_file_suffix(resource) or ""
        reader_config = readers_by_extension["." + suffix.lower()]
        reader = get_reader(reader_config.reader)(**reader_config.parameters)

        self.logger.info(f"Reading entity {entity_name!r} using {reader_config.reader!r}")
        try:
            entities[entity_name] = self.read_entity(
                reader,
                resource,
                entity_name,
                contract_metadata.schemas[entity_name],  # type: ignore
            )
        except Exception as err:  # pylint: disable=broad-except
            # Keep going so that every unreadable entity is reported.
            all_read = False
            messages.extend(
                render_error(
                    err,
                    f"data contract (reading entity {entity_name!r} from {resource!r})",
                    self.logger,
                    entity_name=entity_name,
                    error_location="Whole file",
                    error_category="Bad file",
                )
            )

    return entities, dedup_messages(messages), all_read
371
+
372
@abstractmethod
def apply_data_contract(
    self,
    working_dir: URI,
    entities: Entities,
    entity_locations: EntityLocations,
    contract_metadata: DataContractMetadata,
    key_fields: Optional[dict[str, list[str]]] = None,
) -> tuple[Entities, URI, StageSuccessful]:
    """Apply the data contract to the raw entities.

    Record-level identifiers should be added at this point.

    Returns:
        The validated entities, a URI that `apply` uses as the feedback
        errors URI, and whether the stage was successful.

    """
    raise NotImplementedError()
388
+
389
def apply(
    self,
    working_dir: URI,
    entity_locations: EntityLocations,
    contract_metadata: DataContractMetadata,
    key_fields: Optional[dict[str, list[str]]] = None,
) -> tuple[Entities, URI, StageSuccessful, URI]:
    """Read the entities from the provided locations and apply the data contract.

    Returns:
        The entities, the feedback errors URI, whether the stage succeeded,
        and the processing errors URI. When reading the raw entities fails,
        the entities are an empty mapping. When applying the contract raises,
        the error is dumped as a processing error and the raw entities are
        returned with `False`.

    """
    stage = self.__stage_name__
    feedback_errors_uri = get_feedback_errors_uri(working_dir, stage)
    processing_errors_uri = get_processing_errors_uri(working_dir)

    entities, messages, successful = self.read_raw_entities(entity_locations, contract_metadata)
    if not successful:
        read_failure = CriticalProcessingError(
            "Issue occurred while reading raw entities",
            [msg.error_message for msg in messages],
        )
        dump_processing_errors(working_dir, stage, [read_failure])
        return {}, feedback_errors_uri, successful, processing_errors_uri

    try:
        entities, feedback_errors_uri, successful = self.apply_data_contract(
            working_dir, entities, entity_locations, contract_metadata, key_fields
        )
    except Exception as err:  # pylint: disable=broad-except
        successful = False
        rendered = render_error(err, stage, self.logger)
        contract_failure = CriticalProcessingError(
            f"Issue occurred while applying {stage}",
            [msg.error_message for msg in rendered],
        )
        dump_processing_errors(working_dir, stage, [contract_failure])

    if contract_metadata.cache_originals:
        # Iterate over a snapshot of the keys because entries are added
        # while looping.
        for entity_name in tuple(entities):
            entities[f"Original{entity_name}"] = entities[entity_name]

    return entities, feedback_errors_uri, successful, processing_errors_uri
443
+
444
def read_parquet(self, path: URI, **kwargs) -> EntityType:
    """Method to read parquet files from stringified parquet output
    from file transformation phase.

    Not implemented in the base class; always raises `NotImplementedError`.
    """
    raise NotImplementedError()
449
+
450
def write_parquet(self, entity: EntityType, target_location: URI, **kwargs) -> URI:
    """Method to write parquet files from type cast entities
    following data contract application.

    Not implemented in the base class; always raises `NotImplementedError`.
    """
    raise NotImplementedError()