data-validation-engine 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. data_validation_engine-0.6.2.dist-info/METADATA +104 -0
  2. data_validation_engine-0.6.2.dist-info/RECORD +105 -0
  3. data_validation_engine-0.6.2.dist-info/WHEEL +4 -0
  4. data_validation_engine-0.6.2.dist-info/licenses/LICENSE +21 -0
  5. dve/__init__.py +0 -0
  6. dve/common/__init__.py +0 -0
  7. dve/common/error_utils.py +189 -0
  8. dve/core_engine/__init__.py +0 -0
  9. dve/core_engine/backends/__init__.py +1 -0
  10. dve/core_engine/backends/base/__init__.py +1 -0
  11. dve/core_engine/backends/base/auditing.py +618 -0
  12. dve/core_engine/backends/base/backend.py +240 -0
  13. dve/core_engine/backends/base/contract.py +454 -0
  14. dve/core_engine/backends/base/core.py +124 -0
  15. dve/core_engine/backends/base/reader.py +176 -0
  16. dve/core_engine/backends/base/reference_data.py +217 -0
  17. dve/core_engine/backends/base/rules.py +685 -0
  18. dve/core_engine/backends/base/utilities.py +146 -0
  19. dve/core_engine/backends/exceptions.py +311 -0
  20. dve/core_engine/backends/implementations/__init__.py +1 -0
  21. dve/core_engine/backends/implementations/duckdb/__init__.py +26 -0
  22. dve/core_engine/backends/implementations/duckdb/auditing.py +234 -0
  23. dve/core_engine/backends/implementations/duckdb/contract.py +213 -0
  24. dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +288 -0
  25. dve/core_engine/backends/implementations/duckdb/readers/__init__.py +13 -0
  26. dve/core_engine/backends/implementations/duckdb/readers/csv.py +222 -0
  27. dve/core_engine/backends/implementations/duckdb/readers/json.py +50 -0
  28. dve/core_engine/backends/implementations/duckdb/readers/xml.py +45 -0
  29. dve/core_engine/backends/implementations/duckdb/reference_data.py +49 -0
  30. dve/core_engine/backends/implementations/duckdb/rules.py +534 -0
  31. dve/core_engine/backends/implementations/duckdb/types.py +47 -0
  32. dve/core_engine/backends/implementations/duckdb/utilities.py +41 -0
  33. dve/core_engine/backends/implementations/spark/__init__.py +22 -0
  34. dve/core_engine/backends/implementations/spark/auditing.py +230 -0
  35. dve/core_engine/backends/implementations/spark/backend.py +78 -0
  36. dve/core_engine/backends/implementations/spark/contract.py +241 -0
  37. dve/core_engine/backends/implementations/spark/readers/__init__.py +15 -0
  38. dve/core_engine/backends/implementations/spark/readers/csv.py +77 -0
  39. dve/core_engine/backends/implementations/spark/readers/json.py +66 -0
  40. dve/core_engine/backends/implementations/spark/readers/xml.py +202 -0
  41. dve/core_engine/backends/implementations/spark/reference_data.py +42 -0
  42. dve/core_engine/backends/implementations/spark/rules.py +430 -0
  43. dve/core_engine/backends/implementations/spark/spark_helpers.py +412 -0
  44. dve/core_engine/backends/implementations/spark/types.py +21 -0
  45. dve/core_engine/backends/implementations/spark/utilities.py +144 -0
  46. dve/core_engine/backends/metadata/__init__.py +47 -0
  47. dve/core_engine/backends/metadata/contract.py +80 -0
  48. dve/core_engine/backends/metadata/reporting.py +374 -0
  49. dve/core_engine/backends/metadata/rules.py +737 -0
  50. dve/core_engine/backends/readers/__init__.py +41 -0
  51. dve/core_engine/backends/readers/csv.py +232 -0
  52. dve/core_engine/backends/readers/utilities.py +21 -0
  53. dve/core_engine/backends/readers/xml.py +432 -0
  54. dve/core_engine/backends/readers/xml_linting.py +142 -0
  55. dve/core_engine/backends/types.py +26 -0
  56. dve/core_engine/backends/utilities.py +177 -0
  57. dve/core_engine/configuration/__init__.py +1 -0
  58. dve/core_engine/configuration/base.py +56 -0
  59. dve/core_engine/configuration/v1/__init__.py +351 -0
  60. dve/core_engine/configuration/v1/filters.py +60 -0
  61. dve/core_engine/configuration/v1/rule_stores/__init__.py +1 -0
  62. dve/core_engine/configuration/v1/rule_stores/models.py +57 -0
  63. dve/core_engine/configuration/v1/steps.py +365 -0
  64. dve/core_engine/constants.py +8 -0
  65. dve/core_engine/engine.py +265 -0
  66. dve/core_engine/exceptions.py +29 -0
  67. dve/core_engine/functions/__init__.py +6 -0
  68. dve/core_engine/functions/implementations.py +200 -0
  69. dve/core_engine/loggers.py +57 -0
  70. dve/core_engine/message.py +512 -0
  71. dve/core_engine/models.py +196 -0
  72. dve/core_engine/templating.py +114 -0
  73. dve/core_engine/type_hints.py +255 -0
  74. dve/core_engine/validation.py +160 -0
  75. dve/metadata_parser/__init__.py +2 -0
  76. dve/metadata_parser/domain_types.py +682 -0
  77. dve/metadata_parser/exc.py +44 -0
  78. dve/metadata_parser/function_library.py +64 -0
  79. dve/metadata_parser/function_wrapper.py +201 -0
  80. dve/metadata_parser/model_generator.py +119 -0
  81. dve/metadata_parser/models.py +410 -0
  82. dve/metadata_parser/utilities.py +54 -0
  83. dve/parser/__init__.py +1 -0
  84. dve/parser/exceptions.py +50 -0
  85. dve/parser/file_handling/__init__.py +31 -0
  86. dve/parser/file_handling/helpers.py +29 -0
  87. dve/parser/file_handling/implementations/__init__.py +7 -0
  88. dve/parser/file_handling/implementations/base.py +97 -0
  89. dve/parser/file_handling/implementations/dbfs.py +81 -0
  90. dve/parser/file_handling/implementations/file.py +203 -0
  91. dve/parser/file_handling/implementations/s3.py +371 -0
  92. dve/parser/file_handling/log_handler.py +215 -0
  93. dve/parser/file_handling/service.py +441 -0
  94. dve/parser/file_handling/utilities.py +53 -0
  95. dve/parser/type_hints.py +46 -0
  96. dve/parser/utilities.py +113 -0
  97. dve/pipeline/__init__.py +0 -0
  98. dve/pipeline/duckdb_pipeline.py +56 -0
  99. dve/pipeline/foundry_ddb_pipeline.py +171 -0
  100. dve/pipeline/pipeline.py +935 -0
  101. dve/pipeline/spark_pipeline.py +69 -0
  102. dve/pipeline/utils.py +96 -0
  103. dve/reporting/__init__.py +1 -0
  104. dve/reporting/error_report.py +153 -0
  105. dve/reporting/excel_report.py +319 -0
@@ -0,0 +1,124 @@
1
+ """Core functionality for the backend bases."""
2
+
3
+ from collections.abc import Callable, Iterator, Mapping, MutableMapping
4
+ from typing import Any, Generic, Optional
5
+
6
+ from typing_extensions import get_args, get_origin
7
+
8
+ from dve.core_engine.backends.exceptions import ConstraintError, MissingEntity, MissingRefDataEntity
9
+ from dve.core_engine.backends.types import EntityType
10
+ from dve.core_engine.type_hints import EntityName
11
+
12
get_original_bases: Callable[[type], tuple[Any, ...]]
try:
    # pylint: disable=ungrouped-imports
    from typing_extensions import get_original_bases  # type: ignore
except ImportError:

    def get_original_bases(__cls: type) -> tuple[Any, ...]:
        """Minimal stand-in for `get_original_bases`, used only when the installed
        `typing_extensions` does not provide it.

        Returns the first of `__orig_bases__`, `__mro_entries__` and `__mro__`
        that can be read from the class, tried in that order.

        """
        for attr_name in ("__orig_bases__", "__mro_entries__"):
            try:
                return getattr(__cls, attr_name)  # type: ignore
            except AttributeError:
                continue
        return __cls.__mro__
30
+
31
+
32
def get_entity_type(child: type, annotated_type_name: str) -> type[EntityType]:
    """Return the entity type with which `child` parametrises a named parent class.

    The original bases of `child` are searched for one whose class name (or, for
    a subscripted generic, whose origin's class name) equals
    `annotated_type_name`. The first matching base decides the outcome.

    Raises:
        TypeError: if the matching base has no type argument, has more than
            one, or if no base matches at all.

    """
    for candidate in get_original_bases(child):
        # Plain classes are matched on their own name; anything else is matched
        # on the name of the class it was subscripted from (if there is one).
        named = candidate if isinstance(candidate, type) else get_origin(candidate)
        if named is None or named.__name__ != annotated_type_name:
            continue

        type_args = get_args(candidate)
        if not type_args:
            raise TypeError(f"{child}: Cannot create an untyped `{annotated_type_name}` subclass")
        if len(type_args) != 1:
            raise TypeError(f"{child}: `{annotated_type_name}` must have exactly one entity type")
        return type_args[0]  # type: ignore

    raise TypeError(f"{child}: No `{annotated_type_name}` parent found")
54
+
55
+
56
# Readability alias for the flag returned alongside a normalised entity name.
IsRefdata = bool


class EntityManager(Generic[EntityType], MutableMapping[EntityName, EntityType]):
    """A mapping over the working entities plus read-only reference data.

    The entities are held in a new dict populated from the mapping passed in,
    so adding or removing keys here leaves that mapping untouched. Reference
    data is addressed through keys carrying a ``refdata_`` prefix: such keys
    can be read, but setting or deleting them raises `ConstraintError`.

    Failed lookups raise `MissingEntity` / `MissingRefDataEntity`, which gives
    nicer error logs than a bare `KeyError`.

    """

    def __init__(
        self,
        entities: MutableMapping[EntityName, EntityType],
        reference_data: Optional[Mapping[EntityName, EntityType]] = None,
    ) -> None:
        # A copy of the loaded entities.
        self.entities = {}
        for entity_name, loaded in entities.items():
            if not entity_name.startswith("refdata_"):
                self.entities[entity_name] = loaded
                continue
            # The prefix is reserved for addressing reference data.
            raise ValueError(f"Entity name cannot start with 'refdata_', got {entity_name!r}")

        # The reference data mapping (empty when none is supplied).
        self.reference_data = {} if reference_data is None else reference_data

    @staticmethod
    def _get_key_and_whether_refdata(key: str) -> tuple[EntityName, IsRefdata]:
        """Split a key into the bare entity name and a reference-data flag."""
        marker = "refdata_"
        is_refdata = key.startswith(marker)
        bare_name = key[len(marker) :] if is_refdata else key
        return bare_name, is_refdata

    def __getitem__(self, key: EntityName) -> EntityType:
        entity_name, is_refdata = self._get_key_and_whether_refdata(key)
        source = self.reference_data if is_refdata else self.entities

        try:
            return source[entity_name]
        except KeyError as err:
            if is_refdata:
                raise MissingRefDataEntity(entity_name=entity_name) from err
            raise MissingEntity(entity_name=entity_name) from err

    def __setitem__(self, key: EntityName, value: EntityType) -> None:
        entity_name, is_refdata = self._get_key_and_whether_refdata(key)
        if not is_refdata:
            self.entities[entity_name] = value
            return

        raise ConstraintError(
            f"Attempting to mutate reference data entity {entity_name!r}",
            constraint=f"reference data entry {entity_name!r} must not be mutated",
        )

    def __delitem__(self, key: EntityName) -> None:
        entity_name, is_refdata = self._get_key_and_whether_refdata(key)
        if not is_refdata:
            del self.entities[entity_name]
            return

        raise ConstraintError(
            f"Attempting to remove reference data entity {entity_name!r}",
            constraint=f"reference data entry {entity_name!r} must not be mutated",
        )

    def __iter__(self) -> Iterator[str]:
        # Working entities first, then reference data under its prefixed names.
        yield from self.entities
        for refdata_name in self.reference_data.keys():
            yield "refdata_" + refdata_name

    def __len__(self) -> int:
        entity_count = len(self.entities)
        refdata_count = len(self.reference_data)
        return entity_count + refdata_count
@@ -0,0 +1,176 @@
1
+ """Abstract implementation of the file parser."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from collections.abc import Iterator
5
+ from inspect import ismethod
6
+ from typing import Any, ClassVar, Optional, TypeVar
7
+
8
+ from pydantic import BaseModel
9
+ from typing_extensions import Protocol
10
+
11
+ from dve.core_engine.backends.exceptions import MessageBearingError, ReaderLacksEntityTypeSupport
12
+ from dve.core_engine.backends.types import EntityName, EntityType
13
+ from dve.core_engine.message import FeedbackMessage
14
+ from dve.core_engine.type_hints import URI, ArbitraryFunction, WrapDecorator
15
+ from dve.parser.file_handling.service import open_stream
16
+
17
T = TypeVar("T")
ET_co = TypeVar("ET_co", covariant=True)

# A convenience type indicating a mapping from type to reader function.
# This needs to be defined outside the class since otherwise mypy expects
# BaseFileReader to be generic.
_ReadFunctions = dict[type[T], "_UnboundReadFunction[T]"]

# The name of the read function's entity type annotation attribute.
_ENTITY_TYPE_ATTR_NAME = "_read_func_entity_type"
25
+
26
+
27
class _UnboundReadFunction(Protocol[ET_co]):  # pylint: disable=too-few-public-methods
    """Structural type of a reader method implementing support for one entity type.

    The callable is described as it is looked up on the class, i.e. before it is
    bound, so the reader instance is passed explicitly as the first argument.

    """

    @staticmethod
    def __call__(  # pylint: disable=bad-staticmethod-argument
        self: "BaseFileReader",  # Explicit: this is the protocol for an _unbound_ method.
        resource: URI,
        entity_name: EntityName,
        schema: type[BaseModel],
    ) -> ET_co:
        """Read `resource` for `entity_name` and return it as the entity type."""
37
+
38
+
39
def read_function(entity_type: T) -> WrapDecorator:
    """Build a decorator which tags a reader method as the implementation for
    `entity_type`.

    The decorated function is handed back as the same object; it only gains an
    attribute (named by `_ENTITY_TYPE_ATTR_NAME`) holding `entity_type`, which
    is how reader classes add support for different entity types.

    """

    def tag_entity_type(func: ArbitraryFunction) -> ArbitraryFunction:
        """Record the supported entity type on `func` and return `func`."""
        setattr(func, _ENTITY_TYPE_ATTR_NAME, entity_type)
        return func

    return tag_entity_type
51
+
52
+
53
class BaseFileReader(ABC):
    """An abstract representation of a reader for some file type.

    Subclasses must implement `read_to_py_iterator`. They may additionally
    provide methods decorated with `@read_function(<entity type>)`; these are
    collected per subclass and dispatched to by `read_to_entity_type`.

    """

    __read_methods__: ClassVar[_ReadFunctions] = {}
    """
    A dictionary mapping implemented entity types to their read functions.

    This enables readers to implement optimised support for specific entity
    types (rather than relying on the data contract having an optimised
    implementation, or on the 'fallback' via a Python iterator).

    This is set and populated in `__init_subclass__` by identifying methods
    decorated with the '@read_function' decorator, and is used in
    `read_to_entity_type`.

    """

    def __init_subclass__(cls, *_, **__) -> None:
        """When this class is subclassed, create and populate the `__read_methods__`
        class variable for the subclass.

        Every callable attribute reported by `dir(cls)` that carries the
        entity-type tag set by `@read_function` is registered under that type.

        """
        # A fresh dict per subclass, so registrations are not written into the
        # dict owned by a parent class.
        cls.__read_methods__ = {}

        for attr_name in dir(cls):
            method = getattr(cls, attr_name, None)
            if not (ismethod(method) or callable(method)):
                continue

            entity_type: Optional[type] = getattr(method, _ENTITY_TYPE_ATTR_NAME, None)
            if entity_type is None:
                continue

            cls.__read_methods__[entity_type] = method  # type: ignore

    @abstractmethod
    def read_to_py_iterator(
        self,
        resource: URI,
        entity_name: EntityName,
        schema: type[BaseModel],
    ) -> Iterator[dict[str, Any]]:
        """Iterate through the contents of the resource, yielding dicts
        representing each record.

        NOTE: Simple types should either be returned as strings (if present) or
        `None`. Format validation, casting, and parsing should be done in the
        data contract.

        """
        raise NotImplementedError

    def read_to_entity_type(
        self,
        entity_type: type[EntityType],
        resource: URI,
        entity_name: EntityName,
        schema: type[BaseModel],
    ) -> EntityType:
        """Read to the specified entity type, if supported.

        When `entity_type` is `Iterator[dict[str, Any]]` the call is delegated
        straight to `read_to_py_iterator`. Otherwise the resource is sense
        checked and the read function registered for `entity_type` is called.

        NOTE: Simple types should either be returned as strings (if present) or
        `None`. Format validation, casting, and parsing should be done in the
        data contract.

        Raises:
            MessageBearingError: if the resource does not look like a text file.
            ReaderLacksEntityTypeSupport: if no read function is registered for
                `entity_type`.

        """
        # The requested *type* selects the fallback. Comparing the entity name
        # (a string) against the iterator type could never match.
        if entity_type == Iterator[dict[str, Any]]:
            return self.read_to_py_iterator(resource, entity_name, schema)  # type: ignore

        self.raise_if_not_sensible_file(resource, entity_name)

        try:
            reader_func = self.__read_methods__[entity_type]
        except KeyError as err:
            raise ReaderLacksEntityTypeSupport(entity_type=entity_type) from err

        # Registered functions are stored unbound, so `self` is passed explicitly.
        return reader_func(self, resource, entity_name, schema)

    def write_parquet(
        self,
        entity: EntityType,
        target_location: URI,
        schema: Optional[type[BaseModel]] = None,
        **kwargs,
    ) -> URI:
        """Write entity to parquet.

        This base implementation always raises; readers that support writing
        are expected to override it.

        NOTE: Simple types should be cast as strings (if present) or None.
        If schema supplied then all simple types will be coerced to strings.

        Raises:
            NotImplementedError: always, in this base implementation.

        """
        raise NotImplementedError(f"write_parquet not implemented in {self.__class__}")

    @staticmethod
    def _check_likely_text_file(resource: URI) -> bool:
        """Quick sense check of file to see if it looks like text
        - not 100% fool proof, but hopefully enough to weed out most
        non-text files.

        Only the first 4096 bytes of the resource are inspected.

        """
        with open_stream(resource, "rb") as fle:
            start_chunk = fle.read(4096)
            # check for BOM character - utf-16 can contain NULL bytes
            if start_chunk.startswith((b"\xff\xfe", b"\xfe\xff")):
                return True
            # if null byte in - unlikely text
            if b"\x00" in start_chunk:
                return False
            return True

    def raise_if_not_sensible_file(self, resource: URI, entity_name: str) -> None:
        """Sense check that the file is a text file. Raise error if doesn't
        appear to be the case.

        Raises:
            MessageBearingError: carrying a single `MalformedFile` submission
                message for `entity_name`.

        """
        if not self._check_likely_text_file(resource):
            raise MessageBearingError(
                "The submitted file doesn't appear to be text",
                messages=[
                    FeedbackMessage(
                        entity=entity_name,
                        record=None,
                        failure_type="submission",
                        error_location="Whole File",
                        error_code="MalformedFile",
                        error_message="The resource doesn't seem to be a valid text file",
                    )
                ],
            )
@@ -0,0 +1,217 @@
1
+ """The base implementation of the reference data loader."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from collections.abc import Callable, Iterator, Mapping
5
+ from typing import ClassVar, Generic, Optional, Union, get_type_hints
6
+
7
+ from pydantic import BaseModel, Field
8
+ from typing_extensions import Annotated, Literal
9
+
10
+ import dve.parser.file_handling as fh
11
+ from dve.core_engine.backends.base.core import get_entity_type
12
+ from dve.core_engine.backends.exceptions import (
13
+ MissingRefDataEntity,
14
+ RefdataLacksFileExtensionSupport,
15
+ )
16
+ from dve.core_engine.backends.types import EntityType
17
+ from dve.core_engine.type_hints import URI, EntityName
18
+ from dve.parser.file_handling.implementations.file import LocalFilesystemImplementation
19
+ from dve.parser.file_handling.service import _get_implementation
20
+
21
# Name of attribute added to methods where they relate to loading a particular
# reference file type.
_FILE_EXTENSION_NAME: str = "_REFDATA_FILE_EXTENSION"


def mark_refdata_file_extension(file_extension):
    """Mark a method as the loader for a particular file extension.

    Returns a decorator which stores `file_extension` on the decorated function
    (under the attribute named by `_FILE_EXTENSION_NAME`) and hands back that
    same function.

    """

    def tag_with_extension(func: Callable):
        setattr(func, _FILE_EXTENSION_NAME, file_extension)
        return func

    return tag_with_extension
34
+
35
+
36
class ReferenceTable(BaseModel, frozen=True):
    """Configuration for a reference data object held in a table."""

    type: Literal["table"]
    """The object type."""
    table_name: str
    """Name of the table where the data persists."""
    database: Optional[str] = None
    """Name of the database where the reference data is located."""

    @property
    def fq_table_name(self):
        """The table name, prefixed with the database name when one is set."""
        return f"{self.database}.{self.table_name}" if self.database else self.table_name
52
+
53
+
54
class ReferenceFile(BaseModel, frozen=True):
    """Configuration for a reference data object stored as a file."""

    type: Literal["filename"]
    """The object type."""
    filename: str
    """The path to the reference data relative to the contract."""

    @property
    def file_extension(self) -> str:
        """The suffix of `filename`, as reported by the file handling helpers."""
        suffix = fh.get_file_suffix(self.filename)
        return suffix  # type: ignore
66
+
67
+
68
class ReferenceURI(BaseModel, frozen=True):
    """Configuration for a reference data object addressed by a URI."""

    type: Literal["uri"]
    """The object type."""
    uri: str
    """The absolute URI of the reference data (as Parquet)."""

    @property
    def file_extension(self) -> str:
        """The suffix of `uri`, as reported by the file handling helpers."""
        suffix = fh.get_file_suffix(self.uri)
        return suffix  # type: ignore
80
+
81
+
82
# The config utilised to load the reference data.
ReferenceConfig = Union[ReferenceFile, ReferenceTable, ReferenceURI]

# Discriminated union to determine refdata config from supplied type.
ReferenceConfigUnion = Annotated[ReferenceConfig, Field(discriminator="type")]
87
+
88
+
89
class BaseRefDataLoader(Generic[EntityType], Mapping[EntityName, EntityType], ABC):
    """A reference data mapper which lazy-loads requested entities.

    The keys are the entity names in `reference_entity_config`. An entity is
    loaded the first time it is requested and served from `entity_cache` on
    subsequent requests.

    """

    __entity_type__: ClassVar[type[EntityType]]  # type: ignore
    """
    The entity type used for the reference data.

    This will be populated from the generic annotation at class creation time.

    """
    __step_functions__: ClassVar[dict[type[ReferenceConfig], Callable]] = {}
    """
    A mapping between refdata config types and functions to call to load these configs
    into reference data entities
    """

    __reader_functions__: ClassVar[dict[str, Callable]] = {}
    """
    A mapping between file extensions and functions to load the file uris
    into reference data entities
    """
    # Not referenced within this class.
    # NOTE(review): presumably the key prefix under which reference data
    # entities are exposed elsewhere - confirm against callers.
    prefix: str = "refdata_"

    def __init_subclass__(cls, *_, **__) -> None:
        """When this class is subclassed, set `__entity_type__` and create and
        populate the `__reader_functions__` and `__step_functions__` class
        variables for the subclass.

        Public callables marked with `mark_refdata_file_extension` become reader
        functions keyed by extension. Public callables whose type hints are
        exactly `config` and `return`, with `config` hinted as a `BaseModel`
        subclass, become step functions keyed by that config type.

        """
        # Set entity type from parent class subscript.
        if cls is not BaseRefDataLoader:
            cls.__entity_type__ = get_entity_type(cls, "BaseRefDataLoader")

        # ensure that dicts are specific to each subclass - redefine rather
        # than keep the same reference
        cls.__reader_functions__ = {}
        cls.__step_functions__ = {}

        for method_name in dir(cls):
            if method_name.startswith("_"):
                continue

            method = getattr(cls, method_name, None)
            if method is None or not callable(method):
                continue

            if ext := getattr(method, _FILE_EXTENSION_NAME, None):
                cls.__reader_functions__[ext] = method
                continue

            type_hints = get_type_hints(method)
            if set(type_hints.keys()) != {"config", "return"}:
                continue
            config_type = type_hints["config"]
            try:
                is_config_model = issubclass(config_type, BaseModel)
            except TypeError:
                # The hint is not a plain class (e.g. `Optional[...]`), so the
                # method cannot be a step function; skip it instead of failing
                # the creation of the subclass.
                is_config_model = False
            if not is_config_model:
                continue

            cls.__step_functions__[config_type] = method  # type: ignore

    # pylint: disable=unused-argument
    def __init__(
        self,
        reference_entity_config: dict[EntityName, ReferenceConfig],
        dataset_config_uri: Optional[URI] = None,
        **kwargs,
    ) -> None:
        self.reference_entity_config = reference_entity_config
        """
        Configuration options for the reference data. This is likely to vary
        from backend to backend (e.g. might be locations and file types for
        some backends, and table names for others).

        """
        self.dataset_config_uri = dataset_config_uri
        """The URI against which relative reference file names are resolved."""
        self.entity_cache: dict[EntityName, EntityType] = {}
        """A cache for already-loaded entities."""

    def _load_by_extension(self, file_extension: str, target_location: str) -> EntityType:
        """Call the reader function registered for `file_extension`.

        Raises:
            RefdataLacksFileExtensionSupport: if no reader function is
                registered for `file_extension`.

        """
        try:
            impl = self.__reader_functions__[file_extension]
        except KeyError as exc:
            raise RefdataLacksFileExtensionSupport(file_extension=file_extension) from exc
        # Called outside the `try` so that a KeyError raised while reading is
        # not misreported as an unsupported file extension.
        return impl(self, target_location)

    @abstractmethod
    def load_table(self, config: ReferenceTable) -> EntityType:
        """Load reference entity from a database table"""
        raise NotImplementedError()

    def load_file(self, config: ReferenceFile) -> EntityType:
        """Load reference entity from a relative file path.

        The file name is resolved against `dataset_config_uri`; when that URI is
        handled by the local filesystem implementation the result is converted
        to a POSIX-style local path.

        Raises:
            AttributeError: if `dataset_config_uri` is not set.
            RefdataLacksFileExtensionSupport: if the file extension has no
                registered reader function.

        """
        if not self.dataset_config_uri:
            raise AttributeError("dataset_config_uri must be specified if using relative paths")
        target_location = fh.build_relative_uri(self.dataset_config_uri, config.filename)
        if isinstance(_get_implementation(self.dataset_config_uri), LocalFilesystemImplementation):
            target_location = fh.file_uri_to_local_path(target_location).as_posix()
        return self._load_by_extension(config.file_extension, target_location)

    def load_uri(self, config: ReferenceURI) -> EntityType:
        """Load reference entity from an absolute URI.

        A URI handled by the local filesystem implementation is converted to a
        POSIX-style local path; any other URI is passed through unchanged.

        Raises:
            RefdataLacksFileExtensionSupport: if the file extension has no
                registered reader function.

        """
        if isinstance(_get_implementation(config.uri), LocalFilesystemImplementation):
            target_location = fh.file_uri_to_local_path(config.uri).as_posix()
        else:
            target_location = config.uri
        return self._load_by_extension(config.file_extension, target_location)

    def load_entity(self, entity_name: EntityName, config: ReferenceConfig) -> EntityType:
        """Load a reference entity given the reference config, and cache it
        under `entity_name`.

        The step function is selected by the exact type of `config`.

        """
        config_type = type(config)
        func = self.__step_functions__[config_type]
        entity = func(self, config)
        self.entity_cache[entity_name] = entity
        return entity

    def __getitem__(self, key: EntityName) -> EntityType:
        """Return the entity for `key`, loading and caching it on first access.

        Raises:
            MissingRefDataEntity: if `key` has no config or loading fails for
                any reason; the original exception is chained as the cause.

        """
        try:
            return self.entity_cache[key]
        except KeyError:
            try:
                config = self.reference_entity_config[key]
                return self.load_entity(entity_name=key, config=config)
            except Exception as err:
                raise MissingRefDataEntity(entity_name=key) from err

    def __iter__(self) -> Iterator[str]:
        """Iterate over the configured entity names, loaded or not."""
        return iter(self.reference_entity_config.keys())

    def __len__(self) -> int:
        """Return the number of configured reference entities."""
        return len(self.reference_entity_config)