data-validation-engine 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_validation_engine-0.6.2.dist-info/METADATA +104 -0
- data_validation_engine-0.6.2.dist-info/RECORD +105 -0
- data_validation_engine-0.6.2.dist-info/WHEEL +4 -0
- data_validation_engine-0.6.2.dist-info/licenses/LICENSE +21 -0
- dve/__init__.py +0 -0
- dve/common/__init__.py +0 -0
- dve/common/error_utils.py +189 -0
- dve/core_engine/__init__.py +0 -0
- dve/core_engine/backends/__init__.py +1 -0
- dve/core_engine/backends/base/__init__.py +1 -0
- dve/core_engine/backends/base/auditing.py +618 -0
- dve/core_engine/backends/base/backend.py +240 -0
- dve/core_engine/backends/base/contract.py +454 -0
- dve/core_engine/backends/base/core.py +124 -0
- dve/core_engine/backends/base/reader.py +176 -0
- dve/core_engine/backends/base/reference_data.py +217 -0
- dve/core_engine/backends/base/rules.py +685 -0
- dve/core_engine/backends/base/utilities.py +146 -0
- dve/core_engine/backends/exceptions.py +311 -0
- dve/core_engine/backends/implementations/__init__.py +1 -0
- dve/core_engine/backends/implementations/duckdb/__init__.py +26 -0
- dve/core_engine/backends/implementations/duckdb/auditing.py +234 -0
- dve/core_engine/backends/implementations/duckdb/contract.py +213 -0
- dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +288 -0
- dve/core_engine/backends/implementations/duckdb/readers/__init__.py +13 -0
- dve/core_engine/backends/implementations/duckdb/readers/csv.py +222 -0
- dve/core_engine/backends/implementations/duckdb/readers/json.py +50 -0
- dve/core_engine/backends/implementations/duckdb/readers/xml.py +45 -0
- dve/core_engine/backends/implementations/duckdb/reference_data.py +49 -0
- dve/core_engine/backends/implementations/duckdb/rules.py +534 -0
- dve/core_engine/backends/implementations/duckdb/types.py +47 -0
- dve/core_engine/backends/implementations/duckdb/utilities.py +41 -0
- dve/core_engine/backends/implementations/spark/__init__.py +22 -0
- dve/core_engine/backends/implementations/spark/auditing.py +230 -0
- dve/core_engine/backends/implementations/spark/backend.py +78 -0
- dve/core_engine/backends/implementations/spark/contract.py +241 -0
- dve/core_engine/backends/implementations/spark/readers/__init__.py +15 -0
- dve/core_engine/backends/implementations/spark/readers/csv.py +77 -0
- dve/core_engine/backends/implementations/spark/readers/json.py +66 -0
- dve/core_engine/backends/implementations/spark/readers/xml.py +202 -0
- dve/core_engine/backends/implementations/spark/reference_data.py +42 -0
- dve/core_engine/backends/implementations/spark/rules.py +430 -0
- dve/core_engine/backends/implementations/spark/spark_helpers.py +412 -0
- dve/core_engine/backends/implementations/spark/types.py +21 -0
- dve/core_engine/backends/implementations/spark/utilities.py +144 -0
- dve/core_engine/backends/metadata/__init__.py +47 -0
- dve/core_engine/backends/metadata/contract.py +80 -0
- dve/core_engine/backends/metadata/reporting.py +374 -0
- dve/core_engine/backends/metadata/rules.py +737 -0
- dve/core_engine/backends/readers/__init__.py +41 -0
- dve/core_engine/backends/readers/csv.py +232 -0
- dve/core_engine/backends/readers/utilities.py +21 -0
- dve/core_engine/backends/readers/xml.py +432 -0
- dve/core_engine/backends/readers/xml_linting.py +142 -0
- dve/core_engine/backends/types.py +26 -0
- dve/core_engine/backends/utilities.py +177 -0
- dve/core_engine/configuration/__init__.py +1 -0
- dve/core_engine/configuration/base.py +56 -0
- dve/core_engine/configuration/v1/__init__.py +351 -0
- dve/core_engine/configuration/v1/filters.py +60 -0
- dve/core_engine/configuration/v1/rule_stores/__init__.py +1 -0
- dve/core_engine/configuration/v1/rule_stores/models.py +57 -0
- dve/core_engine/configuration/v1/steps.py +365 -0
- dve/core_engine/constants.py +8 -0
- dve/core_engine/engine.py +265 -0
- dve/core_engine/exceptions.py +29 -0
- dve/core_engine/functions/__init__.py +6 -0
- dve/core_engine/functions/implementations.py +200 -0
- dve/core_engine/loggers.py +57 -0
- dve/core_engine/message.py +512 -0
- dve/core_engine/models.py +196 -0
- dve/core_engine/templating.py +114 -0
- dve/core_engine/type_hints.py +255 -0
- dve/core_engine/validation.py +160 -0
- dve/metadata_parser/__init__.py +2 -0
- dve/metadata_parser/domain_types.py +682 -0
- dve/metadata_parser/exc.py +44 -0
- dve/metadata_parser/function_library.py +64 -0
- dve/metadata_parser/function_wrapper.py +201 -0
- dve/metadata_parser/model_generator.py +119 -0
- dve/metadata_parser/models.py +410 -0
- dve/metadata_parser/utilities.py +54 -0
- dve/parser/__init__.py +1 -0
- dve/parser/exceptions.py +50 -0
- dve/parser/file_handling/__init__.py +31 -0
- dve/parser/file_handling/helpers.py +29 -0
- dve/parser/file_handling/implementations/__init__.py +7 -0
- dve/parser/file_handling/implementations/base.py +97 -0
- dve/parser/file_handling/implementations/dbfs.py +81 -0
- dve/parser/file_handling/implementations/file.py +203 -0
- dve/parser/file_handling/implementations/s3.py +371 -0
- dve/parser/file_handling/log_handler.py +215 -0
- dve/parser/file_handling/service.py +441 -0
- dve/parser/file_handling/utilities.py +53 -0
- dve/parser/type_hints.py +46 -0
- dve/parser/utilities.py +113 -0
- dve/pipeline/__init__.py +0 -0
- dve/pipeline/duckdb_pipeline.py +56 -0
- dve/pipeline/foundry_ddb_pipeline.py +171 -0
- dve/pipeline/pipeline.py +935 -0
- dve/pipeline/spark_pipeline.py +69 -0
- dve/pipeline/utils.py +96 -0
- dve/reporting/__init__.py +1 -0
- dve/reporting/error_report.py +153 -0
- dve/reporting/excel_report.py +319 -0
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""Core functionality for the backend bases."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Callable, Iterator, Mapping, MutableMapping
|
|
4
|
+
from typing import Any, Generic, Optional
|
|
5
|
+
|
|
6
|
+
from typing_extensions import get_args, get_origin
|
|
7
|
+
|
|
8
|
+
from dve.core_engine.backends.exceptions import ConstraintError, MissingEntity, MissingRefDataEntity
|
|
9
|
+
from dve.core_engine.backends.types import EntityType
|
|
10
|
+
from dve.core_engine.type_hints import EntityName
|
|
11
|
+
|
|
12
|
+
get_original_bases: Callable[[type], tuple[Any, ...]]
try:
    # pylint: disable=ungrouped-imports
    from typing_extensions import get_original_bases  # type: ignore
except ImportError:

    def get_original_bases(__cls: type) -> tuple[Any, ...]:
        """A basic version of 'get_original_bases' in case it's not in typing extensions.

        Returns the bases exactly as they were written in the class statement
        (i.e. including any generic subscripts) when the class itself recorded
        them in `__orig_bases__`, and the plain `__bases__` otherwise.

        Raises `TypeError` if the argument is not a class.

        """
        try:
            # Look in the class's *own* namespace only: a plain attribute lookup
            # would also find `__orig_bases__` inherited from a parent class and
            # report the parent's bases as if they belonged to this class.
            return __cls.__dict__.get("__orig_bases__", __cls.__bases__)
        except AttributeError:
            raise TypeError(
                f"Expected an instance of type, not {type(__cls).__name__!r}"
            ) from None
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def get_entity_type(child: type, annotated_type_name: str) -> type[EntityType]:
    """Extract the single type argument `child` supplied to its annotated parent.

    The original bases of `child` are scanned for one whose class name (or,
    for a subscripted generic, whose origin's class name) equals
    `annotated_type_name`. The first such base decides the outcome.

    Raises `TypeError` if the matching parent was not subscripted, was
    subscripted with anything other than exactly one argument, or if no
    matching parent exists at all.

    """
    for candidate in get_original_bases(child):
        if isinstance(candidate, type):
            # A bare (unsubscripted) class: match on its own name.
            is_annotated_parent = candidate.__name__ == annotated_type_name
        else:
            # A subscripted generic: match on the name of the class it wraps.
            generic_origin = get_origin(candidate)
            is_annotated_parent = (
                generic_origin is not None and generic_origin.__name__ == annotated_type_name
            )

        if not is_annotated_parent:
            continue

        type_arguments = get_args(candidate)
        if not type_arguments:
            raise TypeError(f"{child}: Cannot create an untyped `{annotated_type_name}` subclass")
        if len(type_arguments) != 1:
            raise TypeError(f"{child}: `{annotated_type_name}` must have exactly one entity type")
        return type_arguments[0]  # type: ignore

    raise TypeError(f"{child}: No `{annotated_type_name}` parent found")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Readable alias for the boolean flag marking a key as a reference data key.
IsRefdata = bool
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class EntityManager(Generic[EntityType], MutableMapping[EntityName, EntityType]):
    """Mapping facade over the working entities and the read-only reference data.

    The supplied entity mapping is copied into a new dict (the entity objects
    themselves are not copied), so adding, replacing or removing entries here
    does not touch the mapping that was passed in. Reference data entries are
    exposed under keys of the form ``refdata_<name>``; they can be read, but
    any attempt to assign or delete them raises a `ConstraintError`.

    Failed lookups raise `MissingEntity` / `MissingRefDataEntity`, which
    results in nicer error logs.

    """

    def __init__(
        self,
        entities: MutableMapping[EntityName, EntityType],
        reference_data: Optional[Mapping[EntityName, EntityType]] = None,
    ) -> None:
        self.entities = {}
        """A copy of the loaded entities."""
        for name, loaded_entity in entities.items():
            # The prefix is reserved for addressing reference data through this mapping.
            if name.startswith("refdata_"):
                raise ValueError(f"Entity name cannot start with 'refdata_', got {name!r}")
            self.entities[name] = loaded_entity

        if reference_data is None:
            reference_data = {}
        self.reference_data = reference_data
        """The reference data mapping."""

    @staticmethod
    def _get_key_and_whether_refdata(key: str) -> tuple[EntityName, IsRefdata]:
        """Split a lookup key into the bare entity name and a reference data flag."""
        marker = "refdata_"
        if not key.startswith(marker):
            return key, False
        return key[len(marker) :], True

    def __getitem__(self, key: EntityName) -> EntityType:
        name, wants_refdata = self._get_key_and_whether_refdata(key)
        store = self.reference_data if wants_refdata else self.entities

        try:
            return store[name]
        except KeyError as err:
            if wants_refdata:
                raise MissingRefDataEntity(entity_name=name) from err
            raise MissingEntity(entity_name=name) from err

    def __setitem__(self, key: EntityName, value: EntityType) -> None:
        name, wants_refdata = self._get_key_and_whether_refdata(key)
        if not wants_refdata:
            self.entities[name] = value
            return

        raise ConstraintError(
            f"Attempting to mutate reference data entity {name!r}",
            constraint=f"reference data entry {name!r} must not be mutated",
        )

    def __delitem__(self, key: EntityName) -> None:
        name, wants_refdata = self._get_key_and_whether_refdata(key)
        if not wants_refdata:
            del self.entities[name]
            return

        raise ConstraintError(
            f"Attempting to remove reference data entity {name!r}",
            constraint=f"reference data entry {name!r} must not be mutated",
        )

    def __iter__(self) -> Iterator[str]:
        # Working entities first, then reference data under its prefixed keys.
        yield from iter(self.entities.keys())
        for refdata_name in self.reference_data.keys():
            yield "refdata_" + refdata_name

    def __len__(self) -> int:
        entity_count = len(self.entities)
        refdata_count = len(self.reference_data)
        return entity_count + refdata_count
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
"""Abstract implementation of the file parser."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from collections.abc import Iterator
|
|
5
|
+
from inspect import ismethod
|
|
6
|
+
from typing import Any, ClassVar, Optional, TypeVar
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel
|
|
9
|
+
from typing_extensions import Protocol
|
|
10
|
+
|
|
11
|
+
from dve.core_engine.backends.exceptions import MessageBearingError, ReaderLacksEntityTypeSupport
|
|
12
|
+
from dve.core_engine.backends.types import EntityName, EntityType
|
|
13
|
+
from dve.core_engine.message import FeedbackMessage
|
|
14
|
+
from dve.core_engine.type_hints import URI, ArbitraryFunction, WrapDecorator
|
|
15
|
+
from dve.parser.file_handling.service import open_stream
|
|
16
|
+
|
|
17
|
+
# Type variable tying a registered entity type to its read function's return type.
T = TypeVar("T")
# Covariant type variable for the value returned by a read function protocol.
ET_co = TypeVar("ET_co", covariant=True)
# This needs to be defined outside the class since otherwise mypy expects
# BaseFileReader to be generic:
_ReadFunctions = dict[type[T], "_UnboundReadFunction[T]"]
"""A convenience type indicating a mapping from type to reader function."""
_ENTITY_TYPE_ATTR_NAME = "_read_func_entity_type"
"""The name of the read function's entity type annotation attribute."""
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class _UnboundReadFunction(Protocol[ET_co]):  # pylint: disable=too-few-public-methods
    """The protocol required to implement a read function for a new entity type.

    A conforming callable takes the reader instance explicitly as its first
    argument, followed by the resource URI, the entity name and the schema
    model class, and returns the loaded entity.

    """

    @staticmethod
    def __call__(  # pylint: disable=bad-staticmethod-argument
        self: "BaseFileReader",  # This is the protocol for an _unbound_ method.
        resource: URI,
        entity_name: EntityName,
        schema: type[BaseModel],
    ) -> ET_co: ...
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def read_function(entity_type: T) -> WrapDecorator:
    """Build a decorator marking a reader method as the loader for `entity_type`.

    The decorated function is returned unchanged apart from an extra attribute
    recording the entity type, which is how reader classes discover the entity
    types they have dedicated support for.

    """

    def tag_with_entity_type(func: ArbitraryFunction) -> ArbitraryFunction:
        """Record the supported entity type on `func` and hand it straight back."""
        setattr(func, _ENTITY_TYPE_ATTR_NAME, entity_type)
        return func

    return tag_with_entity_type
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class BaseFileReader(ABC):
    """An abstract representation of a reader for some file type.

    Subclasses must implement `read_to_py_iterator`, and may add optimised
    readers for specific entity types by decorating methods with
    `@read_function`; these are gathered into `__read_methods__` when the
    subclass is created and dispatched to by `read_to_entity_type`.

    """

    __read_methods__: ClassVar[_ReadFunctions] = {}
    """
    A dictionary mapping implemented entity types to their read functions.

    This enables readers to implement optimised support for specific entity
    types (rather than relying on the data contract having an optimised implementation,
    or on the 'fallback' via a Python iterator).

    This is set and populated in `__init_subclass__` by identifying methods
    decorated with the '@read_function' decorator, and is used in `read_to_entity_type`.

    """

    def __init_subclass__(cls, *_, **__) -> None:
        """When this class is subclassed, create and populate the `__read_methods__`
        class variable for the subclass.

        """
        # A new dict per subclass, so registrations are never written into a
        # dict shared with the parent class.
        cls.__read_methods__ = {}

        # `dir` also lists inherited attributes, so tagged methods defined on
        # parent readers are registered for the subclass as well.
        for attr_name in dir(cls):
            method = getattr(cls, attr_name, None)
            if not (ismethod(method) or callable(method)):
                continue

            entity_type: Optional[type] = getattr(method, _ENTITY_TYPE_ATTR_NAME, None)
            if entity_type is None:
                continue

            cls.__read_methods__[entity_type] = method  # type: ignore

    @abstractmethod
    def read_to_py_iterator(
        self,
        resource: URI,
        entity_name: EntityName,
        schema: type[BaseModel],
    ) -> Iterator[dict[str, Any]]:
        """Iterate through the contents of the resource, yielding dicts
        representing each record.

        NOTE: Simple types should either be returned as strings (if present) or
        `None`. Format validation, casting, and parsing should be done in the
        data contract.

        """
        raise NotImplementedError

    def read_to_entity_type(
        self,
        entity_type: type[EntityType],
        resource: URI,
        entity_name: EntityName,
        schema: type[BaseModel],
    ) -> EntityType:
        """Read to the specified entity type, if supported.

        Requests for `Iterator[dict[str, Any]]` are served directly by
        `read_to_py_iterator`. Any other entity type is first sense-checked
        with `raise_if_not_sensible_file` and then dispatched to the read
        function registered for that type.

        Raises `ReaderLacksEntityTypeSupport` if no read function is registered
        for `entity_type`.

        NOTE: Simple types should either be returned as strings (if present) or
        `None`. Format validation, casting, and parsing should be done in the
        data contract.

        """
        # The fallback is selected by the requested *type*; comparing the entity
        # name (a string) against the iterator type could never match.
        if entity_type == Iterator[dict[str, Any]]:
            return self.read_to_py_iterator(resource, entity_name, schema)  # type: ignore

        self.raise_if_not_sensible_file(resource, entity_name)

        try:
            reader_func = self.__read_methods__[entity_type]
        except KeyError as err:
            raise ReaderLacksEntityTypeSupport(entity_type=entity_type) from err

        # Registered functions are stored unbound, so pass `self` explicitly.
        return reader_func(self, resource, entity_name, schema)

    def write_parquet(
        self,
        entity: EntityType,
        target_location: URI,
        schema: Optional[type[BaseModel]] = None,
        **kwargs,
    ) -> URI:
        """Write entity to parquet.

        Not implemented in the base class; always raises `NotImplementedError`
        unless overridden.

        NOTE: Simple types should be cast as strings (if present) or None.
        If schema supplied then all simple types will be coerced to strings.

        """
        raise NotImplementedError(f"write_parquet not implemented in {self.__class__}")

    @staticmethod
    def _check_likely_text_file(resource: URI) -> bool:
        """Quick sense check of file to see if it looks like text
        - not 100% foolproof, but hopefully enough to weed out most
        non-text files.

        Only the first 4096 bytes of the resource are inspected.

        """
        with open_stream(resource, "rb") as fle:
            start_chunk = fle.read(4096)
            # check for BOM character - utf-16 can contain NULL bytes
            if start_chunk.startswith((b"\xff\xfe", b"\xfe\xff")):
                return True
            # if null byte in - unlikely text
            if b"\x00" in start_chunk:
                return False
            return True

    def raise_if_not_sensible_file(self, resource: URI, entity_name: str):
        """Sense check that the file is a text file. Raise error if doesn't
        appear to be the case.

        Raises `MessageBearingError` carrying a single 'MalformedFile'
        submission-level `FeedbackMessage` for `entity_name`.

        """
        if not self._check_likely_text_file(resource):
            raise MessageBearingError(
                "The submitted file doesn't appear to be text",
                messages=[
                    FeedbackMessage(
                        entity=entity_name,
                        record=None,
                        failure_type="submission",
                        error_location="Whole File",
                        error_code="MalformedFile",
                        error_message="The resource doesn't seem to be a valid text file",
                    )
                ],
            )
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
"""The base implementation of the reference data loader.."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from collections.abc import Callable, Iterator, Mapping
|
|
5
|
+
from typing import ClassVar, Generic, Optional, Union, get_type_hints
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, Field
|
|
8
|
+
from typing_extensions import Annotated, Literal
|
|
9
|
+
|
|
10
|
+
import dve.parser.file_handling as fh
|
|
11
|
+
from dve.core_engine.backends.base.core import get_entity_type
|
|
12
|
+
from dve.core_engine.backends.exceptions import (
|
|
13
|
+
MissingRefDataEntity,
|
|
14
|
+
RefdataLacksFileExtensionSupport,
|
|
15
|
+
)
|
|
16
|
+
from dve.core_engine.backends.types import EntityType
|
|
17
|
+
from dve.core_engine.type_hints import URI, EntityName
|
|
18
|
+
from dve.parser.file_handling.implementations.file import LocalFilesystemImplementation
|
|
19
|
+
from dve.parser.file_handling.service import _get_implementation
|
|
20
|
+
|
|
21
|
+
# Attribute name used to tag loader methods with the file extension they handle.
_FILE_EXTENSION_NAME: str = "_REFDATA_FILE_EXTENSION"
"""Name of attribute added to methods where they relate
to loading a particular reference file type."""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def mark_refdata_file_extension(file_extension):
    """Build a decorator tagging a loader method with the file extension it reads.

    The decorated function is returned unchanged apart from an extra attribute
    holding `file_extension`.

    """

    def tag_with_extension(func: Callable):
        setattr(func, _FILE_EXTENSION_NAME, file_extension)
        return func

    return tag_with_extension
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class ReferenceTable(BaseModel, frozen=True):
    """Configuration for a reference data object held in a database table."""

    type: Literal["table"]
    """The object type."""
    table_name: str
    """Name of the table where the data persists."""
    database: Optional[str] = None
    """Name of the database where the reference data is located."""

    @property
    def fq_table_name(self):
        """The table name, qualified with the database name when one is set."""
        return f"{self.database}.{self.table_name}" if self.database else self.table_name
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class ReferenceFile(BaseModel, frozen=True):
    """Configuration for a reference data object stored as a file."""

    type: Literal["filename"]
    """The object type."""
    filename: str
    """The path to the reference data relative to the contract."""

    @property
    def file_extension(self) -> str:
        """The suffix of `filename`, as reported by the file handling helpers."""
        suffix = fh.get_file_suffix(self.filename)
        return suffix  # type: ignore
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class ReferenceURI(BaseModel, frozen=True):
    """Configuration for a reference data object addressed by a URI."""

    type: Literal["uri"]
    """The object type."""
    uri: str
    """The absolute URI of the reference data (as Parquet)."""

    @property
    def file_extension(self) -> str:
        """The suffix of `uri`, as reported by the file handling helpers."""
        suffix = fh.get_file_suffix(self.uri)
        return suffix  # type: ignore
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# Plain union of every reference data configuration model defined above.
ReferenceConfig = Union[ReferenceFile, ReferenceTable, ReferenceURI]
"""The config utilised to load the reference data"""

# The same union, annotated so that the `type` field acts as the discriminator.
ReferenceConfigUnion = Annotated[ReferenceConfig, Field(discriminator="type")]
"""Discriminated union to determine refdata config from supplied type"""
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class BaseRefDataLoader(Generic[EntityType], Mapping[EntityName, EntityType], ABC):
    """A reference data mapper which lazy-loads requested entities.

    Keys are the entity names in the supplied reference configuration. An
    entity is loaded the first time it is requested and served from
    `entity_cache` afterwards.

    Subclasses must be declared with exactly one entity type, e.g.
    ``class Loader(BaseRefDataLoader[SomeEntity])``, and must implement
    `load_table`. File readers are registered by decorating public methods
    with `@mark_refdata_file_extension(<extension>)`.

    """

    __entity_type__: ClassVar[type[EntityType]]  # type: ignore
    """
    The entity type used for the reference data.

    This will be populated from the generic annotation at class creation time.

    """
    __step_functions__: ClassVar[dict[type[ReferenceConfig], Callable]] = {}
    """
    A mapping between refdata config types and functions to call to load these configs
    into reference data entities

    """

    __reader_functions__: ClassVar[dict[str, Callable]] = {}
    """
    A mapping between file extensions and functions to load the file uris
    into reference data entities

    """
    # NOTE(review): not read anywhere in this class - presumably the key prefix
    # under which reference data is exposed to rules; confirm against callers.
    prefix: str = "refdata_"

    def __init_subclass__(cls, *_, **__) -> None:
        """When this class is subclassed, create and populate the `__step_functions__`
        and `__reader_functions__` class variables for the subclass.

        Public callables tagged with a file extension become reader functions.
        Other public callables whose only annotations are `config` (a pydantic
        model class) and `return` become step functions keyed by that model.

        """
        # Set entity type from parent class subscript.
        if cls is not BaseRefDataLoader:
            cls.__entity_type__ = get_entity_type(cls, "BaseRefDataLoader")

        # ensure that dicts are specific to each subclass - redefine rather
        # than keep the same reference
        cls.__reader_functions__ = {}
        cls.__step_functions__ = {}

        for method_name in dir(cls):
            if method_name.startswith("_"):
                continue

            method = getattr(cls, method_name, None)
            if method is None or not callable(method):
                continue

            if ext := getattr(method, _FILE_EXTENSION_NAME, None):
                cls.__reader_functions__[ext] = method
                continue

            type_hints = get_type_hints(method)
            if set(type_hints.keys()) != {"config", "return"}:
                continue
            config_type = type_hints["config"]
            try:
                is_config_model = issubclass(config_type, BaseModel)
            except TypeError:
                # `issubclass` rejects hints that are not classes (e.g. an
                # `Optional[...]`); such a method is simply not a step function.
                is_config_model = False
            if not is_config_model:
                continue

            cls.__step_functions__[config_type] = method  # type: ignore

    # pylint: disable=unused-argument
    def __init__(
        self,
        reference_entity_config: dict[EntityName, ReferenceConfig],
        dataset_config_uri: Optional[URI] = None,
        **kwargs,
    ) -> None:
        self.reference_entity_config = reference_entity_config
        """
        Configuration options for the reference data. This is likely to vary
        from backend to backend (e.g. might be locations and file types for
        some backends, and table names for others).

        """
        self.dataset_config_uri = dataset_config_uri
        """The URI that relative reference file paths are resolved against."""
        self.entity_cache: dict[EntityName, EntityType] = {}
        """A cache for already-loaded entities."""

    def _load_from_location(self, target_location: str, file_extension: str) -> EntityType:
        """Load an entity with the reader function registered for `file_extension`."""
        try:
            impl = self.__reader_functions__[file_extension]
        except KeyError as exc:
            raise RefdataLacksFileExtensionSupport(file_extension=file_extension) from exc
        # Called outside the `try` so that a KeyError raised by the reader
        # itself is not misreported as an unsupported file extension.
        return impl(self, target_location)

    @abstractmethod
    def load_table(self, config: ReferenceTable) -> EntityType:
        """Load reference entity from a database table"""
        raise NotImplementedError()

    def load_file(self, config: ReferenceFile) -> EntityType:
        """Load reference entity from a relative file path.

        The path is resolved against `dataset_config_uri`; when that URI is on
        the local filesystem the reader is given a local POSIX path instead of
        a URI.

        Raises `AttributeError` if `dataset_config_uri` was not supplied, and
        `RefdataLacksFileExtensionSupport` if no reader function is registered
        for the file's extension.

        """
        if not self.dataset_config_uri:
            raise AttributeError("dataset_config_uri must be specified if using relative paths")
        target_location = fh.build_relative_uri(self.dataset_config_uri, config.filename)
        if isinstance(_get_implementation(self.dataset_config_uri), LocalFilesystemImplementation):
            target_location = fh.file_uri_to_local_path(target_location).as_posix()
        return self._load_from_location(target_location, config.file_extension)

    def load_uri(self, config: ReferenceURI) -> EntityType:
        """Load reference entity from an absolute URI.

        When the URI is on the local filesystem the reader is given a local
        POSIX path instead of the URI.

        Raises `RefdataLacksFileExtensionSupport` if no reader function is
        registered for the URI's extension.

        """
        if isinstance(_get_implementation(config.uri), LocalFilesystemImplementation):
            target_location = fh.file_uri_to_local_path(config.uri).as_posix()
        else:
            target_location = config.uri
        return self._load_from_location(target_location, config.file_extension)

    def load_entity(self, entity_name: EntityName, config: ReferenceConfig) -> EntityType:
        """Load a reference entity given the reference config, and cache it.

        The step function is selected by the exact type of `config`; a config
        type with no registered step function raises `KeyError`.

        """
        config_type = type(config)
        func = self.__step_functions__[config_type]
        entity = func(self, config)
        self.entity_cache[entity_name] = entity
        return entity

    def __getitem__(self, key: EntityName) -> EntityType:
        try:
            return self.entity_cache[key]
        except KeyError:
            try:
                config = self.reference_entity_config[key]
                return self.load_entity(entity_name=key, config=config)
            except Exception as err:
                # Any failure (unknown key or a loading error) surfaces as a
                # missing reference entity, with the real cause chained on.
                raise MissingRefDataEntity(entity_name=key) from err

    def __iter__(self) -> Iterator[str]:
        return iter(self.reference_entity_config.keys())

    def __len__(self) -> int:
        return len(self.reference_entity_config)
|