data-validation-engine 0.7.4__tar.gz → 0.7.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/PKG-INFO +1 -1
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/pyproject.toml +1 -1
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/base/backend.py +4 -32
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/base/reference_data.py +6 -3
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/exceptions.py +14 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +1 -1
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/reference_data.py +7 -10
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/backend.py +25 -7
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/reference_data.py +5 -9
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/pipeline/duckdb_pipeline.py +13 -3
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/pipeline/pipeline.py +13 -13
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/pipeline/spark_pipeline.py +13 -3
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/LICENSE +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/README.md +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/common/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/common/error_utils.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/base/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/base/auditing.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/base/contract.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/base/core.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/base/reader.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/base/rules.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/base/utilities.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/auditing.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/contract.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/readers/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/readers/csv.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/readers/json.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/readers/xml.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/rules.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/types.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/utilities.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/auditing.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/contract.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/readers/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/readers/csv.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/readers/json.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/readers/xml.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/rules.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/spark_helpers.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/types.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/utilities.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/metadata/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/metadata/contract.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/metadata/reporting.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/metadata/rules.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/readers/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/readers/csv.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/readers/utilities.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/readers/xml.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/readers/xml_linting.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/types.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/utilities.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/configuration/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/configuration/base.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/configuration/v1/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/configuration/v1/filters.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/configuration/v1/rule_stores/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/configuration/v1/rule_stores/models.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/configuration/v1/steps.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/constants.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/engine.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/exceptions.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/functions/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/functions/implementations.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/loggers.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/message.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/models.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/templating.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/type_hints.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/validation.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/metadata_parser/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/metadata_parser/domain_types.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/metadata_parser/exc.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/metadata_parser/function_library.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/metadata_parser/function_wrapper.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/metadata_parser/model_generator.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/metadata_parser/models.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/metadata_parser/utilities.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/parser/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/parser/exceptions.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/parser/file_handling/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/parser/file_handling/helpers.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/parser/file_handling/implementations/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/parser/file_handling/implementations/base.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/parser/file_handling/implementations/dbfs.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/parser/file_handling/implementations/file.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/parser/file_handling/implementations/s3.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/parser/file_handling/log_handler.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/parser/file_handling/service.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/parser/file_handling/utilities.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/parser/type_hints.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/parser/utilities.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/pipeline/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/pipeline/foundry_ddb_pipeline.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/pipeline/utils.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/reporting/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/reporting/error_report.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/reporting/excel_report.py +0 -0
|
@@ -24,7 +24,7 @@ Issues = "https://github.com/NHSDigital/data-validation-engine/issues"
|
|
|
24
24
|
Changelog = "https://github.com/NHSDigital/data-validation-engine/blob/main/CHANGELOG.md"
|
|
25
25
|
|
|
26
26
|
[tool.poetry]
|
|
27
|
-
version = "0.7.4"
|
|
27
|
+
version = "0.7.5"
|
|
28
28
|
packages = [
|
|
29
29
|
{ include = "dve", from = "src" },
|
|
30
30
|
]
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
import logging
|
|
4
4
|
import warnings
|
|
5
5
|
from abc import ABC, abstractmethod
|
|
6
|
-
from collections.abc import
|
|
6
|
+
from collections.abc import MutableMapping
|
|
7
7
|
from typing import Any, ClassVar, Generic, Optional
|
|
8
8
|
|
|
9
9
|
from pyspark.sql import DataFrame, SparkSession
|
|
@@ -41,14 +41,12 @@ class BaseBackend(Generic[EntityType], ABC):
|
|
|
41
41
|
self,
|
|
42
42
|
contract: BaseDataContract[EntityType],
|
|
43
43
|
steps: BaseStepImplementations[EntityType],
|
|
44
|
-
reference_data_loader_type: Optional[type[BaseRefDataLoader[EntityType]]],
|
|
45
44
|
logger: Optional[logging.Logger] = None,
|
|
46
45
|
**kwargs: Any,
|
|
47
46
|
) -> None:
|
|
48
47
|
for component_name, component in (
|
|
49
48
|
("Contract", contract),
|
|
50
49
|
("Step implementation", steps),
|
|
51
|
-
("Reference data loader", reference_data_loader_type),
|
|
52
50
|
):
|
|
53
51
|
component_entity_type = getattr(component, "__entity_type__", None)
|
|
54
52
|
if component_entity_type != self.__entity_type__:
|
|
@@ -61,12 +59,6 @@ class BaseBackend(Generic[EntityType], ABC):
|
|
|
61
59
|
"""The data contract implementation used by the backend."""
|
|
62
60
|
self.step_implementations = steps
|
|
63
61
|
"""The step implementations used by the backend."""
|
|
64
|
-
self.reference_data_loader_type = reference_data_loader_type
|
|
65
|
-
"""
|
|
66
|
-
The loader type to use for the reference data. If `None`, do not
|
|
67
|
-
load any reference data and error if it is provided.
|
|
68
|
-
|
|
69
|
-
"""
|
|
70
62
|
self.logger = logger or get_logger(type(self).__name__)
|
|
71
63
|
"""The `logging.Logger instance for the backend."""
|
|
72
64
|
|
|
@@ -74,29 +66,9 @@ class BaseBackend(Generic[EntityType], ABC):
|
|
|
74
66
|
self,
|
|
75
67
|
reference_entity_config: dict[EntityName, ReferenceConfigUnion],
|
|
76
68
|
submission_info: Optional[SubmissionInfo],
|
|
77
|
-
) ->
|
|
78
|
-
"""
|
|
79
|
-
|
|
80
|
-
if submission_info:
|
|
81
|
-
sub_info_entity = self.convert_submission_info(submission_info)
|
|
82
|
-
|
|
83
|
-
if self.reference_data_loader_type is None:
|
|
84
|
-
if reference_entity_config:
|
|
85
|
-
raise ValueError(
|
|
86
|
-
"Reference data has been specified but no reference data loader is "
|
|
87
|
-
+ "configured for this backend"
|
|
88
|
-
)
|
|
89
|
-
|
|
90
|
-
reference_data_dict = {}
|
|
91
|
-
if sub_info_entity is not None:
|
|
92
|
-
reference_data_dict["dve_submission_info"] = sub_info_entity
|
|
93
|
-
return reference_data_dict
|
|
94
|
-
|
|
95
|
-
reference_data_loader = self.reference_data_loader_type(reference_entity_config)
|
|
96
|
-
if sub_info_entity is not None:
|
|
97
|
-
reference_data_loader.entity_cache["dve_submission_info"] = sub_info_entity
|
|
98
|
-
|
|
99
|
-
return reference_data_loader
|
|
69
|
+
) -> BaseRefDataLoader[EntityType]:
|
|
70
|
+
"""Supply configured reference data loader for use with business rules"""
|
|
71
|
+
raise NotImplementedError()
|
|
100
72
|
|
|
101
73
|
@abstractmethod
|
|
102
74
|
def convert_submission_info(self, submission_info: SubmissionInfo) -> EntityType:
|
|
@@ -11,6 +11,7 @@ import dve.parser.file_handling as fh
|
|
|
11
11
|
from dve.core_engine.backends.base.core import get_entity_type
|
|
12
12
|
from dve.core_engine.backends.exceptions import (
|
|
13
13
|
MissingRefDataEntity,
|
|
14
|
+
NoRefDataConfigSupplied,
|
|
14
15
|
RefdataLacksFileExtensionSupport,
|
|
15
16
|
)
|
|
16
17
|
from dve.core_engine.backends.types import EntityType
|
|
@@ -147,11 +148,11 @@ class BaseRefDataLoader(Generic[EntityType], Mapping[EntityName, EntityType], AB
|
|
|
147
148
|
# pylint: disable=unused-argument
|
|
148
149
|
def __init__(
|
|
149
150
|
self,
|
|
150
|
-
|
|
151
|
-
dataset_config_uri:
|
|
151
|
+
reference_data_config: dict[EntityName, ReferenceConfig],
|
|
152
|
+
dataset_config_uri: URI,
|
|
152
153
|
**kwargs,
|
|
153
154
|
) -> None:
|
|
154
|
-
self.reference_entity_config =
|
|
155
|
+
self.reference_entity_config = reference_data_config
|
|
155
156
|
self.dataset_config_uri = dataset_config_uri
|
|
156
157
|
"""
|
|
157
158
|
Configuration options for the reference data. This is likely to vary
|
|
@@ -207,6 +208,8 @@ class BaseRefDataLoader(Generic[EntityType], Mapping[EntityName, EntityType], AB
|
|
|
207
208
|
try:
|
|
208
209
|
config = self.reference_entity_config[key]
|
|
209
210
|
return self.load_entity(entity_name=key, config=config)
|
|
211
|
+
except TypeError as err:
|
|
212
|
+
raise NoRefDataConfigSupplied() from err
|
|
210
213
|
except Exception as err:
|
|
211
214
|
raise MissingRefDataEntity(entity_name=key) from err
|
|
212
215
|
|
|
@@ -119,6 +119,20 @@ class MissingRefDataEntity(MissingEntity, BackendErrorMixin): # pylint: disable
|
|
|
119
119
|
return f"Missing reference data entity {self.entity_name!r}"
|
|
120
120
|
|
|
121
121
|
|
|
122
|
+
class NoRefDataConfigSupplied(BackendError):
|
|
123
|
+
"""An error raised when trying to load a refdata entity when no refdata
|
|
124
|
+
config has been supplied.
|
|
125
|
+
|
|
126
|
+
"""
|
|
127
|
+
|
|
128
|
+
def __init__(self, *args: object) -> None:
|
|
129
|
+
super().__init__(*args)
|
|
130
|
+
|
|
131
|
+
def get_message_preamble(self) -> EntityName:
|
|
132
|
+
"""Message for logging purposes"""
|
|
133
|
+
return "Refdata loader not supplied with refdata config - unable to load refdata entities"
|
|
134
|
+
|
|
135
|
+
|
|
122
136
|
class ConstraintError(ValueError, BackendErrorMixin):
|
|
123
137
|
"""Raised when a given constraint is violated."""
|
|
124
138
|
|
|
@@ -411,7 +411,7 @@ def get_duckdb_cast_statement_from_annotation(
|
|
|
411
411
|
stmt = rf"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{date_regex}') THEN TRY_CAST(TRIM({quoted_name}) as DATE) ELSE NULL END" # pylint: disable=C0301
|
|
412
412
|
return stmt
|
|
413
413
|
if issubclass(type_, time):
|
|
414
|
-
stmt = rf"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{time_regex}') THEN TRY_CAST(TRIM({quoted_name}) as TIME) ELSE NULL END"
|
|
414
|
+
stmt = rf"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{time_regex}') THEN TRY_CAST(TRIM({quoted_name}) as TIME) ELSE NULL END" # pylint: disable=C0301
|
|
415
415
|
return stmt
|
|
416
416
|
duck_type = get_duckdb_type_from_annotation(type_)
|
|
417
417
|
if duck_type:
|
|
@@ -1,13 +1,11 @@
|
|
|
1
1
|
"""A reference data loader for duckdb."""
|
|
2
2
|
|
|
3
|
-
from typing import Optional
|
|
4
|
-
|
|
5
3
|
from duckdb import DuckDBPyConnection, DuckDBPyRelation
|
|
6
4
|
from pyarrow import ipc # type: ignore
|
|
7
5
|
|
|
8
6
|
from dve.core_engine.backends.base.reference_data import (
|
|
9
7
|
BaseRefDataLoader,
|
|
10
|
-
|
|
8
|
+
ReferenceConfig,
|
|
11
9
|
ReferenceTable,
|
|
12
10
|
mark_refdata_file_extension,
|
|
13
11
|
)
|
|
@@ -19,17 +17,16 @@ from dve.parser.type_hints import URI
|
|
|
19
17
|
class DuckDBRefDataLoader(BaseRefDataLoader[DuckDBPyRelation]):
|
|
20
18
|
"""A reference data loader using already existing DuckDB tables."""
|
|
21
19
|
|
|
22
|
-
connection: DuckDBPyConnection
|
|
23
|
-
"""The DuckDB connection for the backend."""
|
|
24
|
-
dataset_config_uri: Optional[URI] = None
|
|
25
|
-
"""The location of the dischema file"""
|
|
26
|
-
|
|
27
20
|
def __init__(
|
|
28
21
|
self,
|
|
29
|
-
|
|
22
|
+
connection: DuckDBPyConnection,
|
|
23
|
+
reference_data_config: dict[EntityName, ReferenceConfig],
|
|
24
|
+
dataset_config_uri: URI,
|
|
30
25
|
**kwargs,
|
|
31
26
|
) -> None:
|
|
32
|
-
super().__init__(
|
|
27
|
+
super().__init__(reference_data_config, dataset_config_uri, **kwargs)
|
|
28
|
+
|
|
29
|
+
self.connection = connection
|
|
33
30
|
|
|
34
31
|
if not self.connection:
|
|
35
32
|
raise AttributeError("DuckDBConnection must be specified")
|
|
@@ -6,6 +6,7 @@ from typing import Any, Optional
|
|
|
6
6
|
from pyspark.sql import DataFrame, SparkSession
|
|
7
7
|
|
|
8
8
|
from dve.core_engine.backends.base.backend import BaseBackend
|
|
9
|
+
from dve.core_engine.backends.base.reference_data import ReferenceConfigUnion
|
|
9
10
|
from dve.core_engine.backends.implementations.spark.contract import SparkDataContract
|
|
10
11
|
from dve.core_engine.backends.implementations.spark.reference_data import SparkRefDataLoader
|
|
11
12
|
from dve.core_engine.backends.implementations.spark.rules import SparkStepImplementations
|
|
@@ -14,7 +15,7 @@ from dve.core_engine.backends.implementations.spark.types import SparkEntities
|
|
|
14
15
|
from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME
|
|
15
16
|
from dve.core_engine.loggers import get_child_logger, get_logger
|
|
16
17
|
from dve.core_engine.models import SubmissionInfo
|
|
17
|
-
from dve.core_engine.type_hints import URI, EntityParquetLocations
|
|
18
|
+
from dve.core_engine.type_hints import URI, EntityName, EntityParquetLocations
|
|
18
19
|
from dve.parser.file_handling import get_resource_exists, joinuri
|
|
19
20
|
|
|
20
21
|
|
|
@@ -26,7 +27,6 @@ class SparkBackend(BaseBackend[DataFrame]):
|
|
|
26
27
|
dataset_config_uri: Optional[URI] = None,
|
|
27
28
|
contract: Optional[SparkDataContract] = None,
|
|
28
29
|
steps: Optional[SparkStepImplementations] = None,
|
|
29
|
-
reference_data_loader: Optional[type[SparkRefDataLoader]] = None,
|
|
30
30
|
logger: Optional[logging.Logger] = None,
|
|
31
31
|
spark_session: Optional[SparkSession] = None,
|
|
32
32
|
**kwargs: Any,
|
|
@@ -36,6 +36,8 @@ class SparkBackend(BaseBackend[DataFrame]):
|
|
|
36
36
|
|
|
37
37
|
self.spark_session = spark_session or SparkSession.builder.getOrCreate()
|
|
38
38
|
"""The Spark session for the backend."""
|
|
39
|
+
self.dataset_config_uri = dataset_config_uri
|
|
40
|
+
"""The uri of the dischema specifying the DVE config"""
|
|
39
41
|
|
|
40
42
|
if contract is None:
|
|
41
43
|
contract = SparkDataContract(
|
|
@@ -46,11 +48,27 @@ class SparkBackend(BaseBackend[DataFrame]):
|
|
|
46
48
|
steps = SparkStepImplementations.register_udfs(
|
|
47
49
|
logger=get_child_logger("SparkStepImplementations", logger)
|
|
48
50
|
)
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
51
|
+
super().__init__(contract, steps, logger, **kwargs)
|
|
52
|
+
|
|
53
|
+
def load_reference_data(
|
|
54
|
+
self,
|
|
55
|
+
reference_entity_config: dict[EntityName, ReferenceConfigUnion],
|
|
56
|
+
submission_info: Optional[SubmissionInfo],
|
|
57
|
+
):
|
|
58
|
+
"""Load the reference data as specified in the reference entity config."""
|
|
59
|
+
sub_info_entity: Optional[DataFrame] = None
|
|
60
|
+
if submission_info:
|
|
61
|
+
sub_info_entity = self.convert_submission_info(submission_info)
|
|
62
|
+
|
|
63
|
+
reference_data_loader = SparkRefDataLoader(
|
|
64
|
+
spark=self.spark_session,
|
|
65
|
+
reference_data_config=reference_entity_config,
|
|
66
|
+
dataset_config_uri=self.dataset_config_uri, # type: ignore
|
|
67
|
+
)
|
|
68
|
+
if sub_info_entity is not None:
|
|
69
|
+
reference_data_loader.entity_cache["dve_submission_info"] = sub_info_entity
|
|
70
|
+
|
|
71
|
+
return reference_data_loader
|
|
54
72
|
|
|
55
73
|
def write_entities_to_parquet(
|
|
56
74
|
self, entities: SparkEntities, cache_prefix: URI
|
|
@@ -1,8 +1,6 @@
|
|
|
1
1
|
# pylint: disable=no-member
|
|
2
2
|
"""A reference data loader for Spark."""
|
|
3
3
|
|
|
4
|
-
from typing import Optional
|
|
5
|
-
|
|
6
4
|
from pyspark.sql import DataFrame, SparkSession
|
|
7
5
|
|
|
8
6
|
from dve.core_engine.backends.base.reference_data import (
|
|
@@ -19,17 +17,15 @@ from dve.parser.type_hints import URI
|
|
|
19
17
|
class SparkRefDataLoader(BaseRefDataLoader[DataFrame]):
|
|
20
18
|
"""A reference data loader using already existing Apache Spark Tables."""
|
|
21
19
|
|
|
22
|
-
spark: SparkSession
|
|
23
|
-
"""The Spark session for the backend."""
|
|
24
|
-
dataset_config_uri: Optional[URI] = None
|
|
25
|
-
"""The location of the dischema file defining business rules"""
|
|
26
|
-
|
|
27
20
|
def __init__(
|
|
28
21
|
self,
|
|
29
|
-
|
|
22
|
+
spark: SparkSession,
|
|
23
|
+
reference_data_config: dict[EntityName, ReferenceConfig],
|
|
24
|
+
dataset_config_uri: URI,
|
|
30
25
|
**kwargs,
|
|
31
26
|
) -> None:
|
|
32
|
-
super().__init__(
|
|
27
|
+
super().__init__(reference_data_config, dataset_config_uri, **kwargs)
|
|
28
|
+
self.spark = spark
|
|
33
29
|
if not self.spark:
|
|
34
30
|
raise AttributeError("Spark session must be provided")
|
|
35
31
|
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/pipeline/duckdb_pipeline.py
RENAMED
|
@@ -5,10 +5,12 @@ from typing import Optional
|
|
|
5
5
|
|
|
6
6
|
from duckdb import DuckDBPyConnection, DuckDBPyRelation
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
import dve.parser.file_handling as fh
|
|
9
|
+
from dve.core_engine.backends.base.reference_data import ReferenceConfig
|
|
9
10
|
from dve.core_engine.backends.implementations.duckdb.auditing import DDBAuditingManager
|
|
10
11
|
from dve.core_engine.backends.implementations.duckdb.contract import DuckDBDataContract
|
|
11
12
|
from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import duckdb_get_entity_count
|
|
13
|
+
from dve.core_engine.backends.implementations.duckdb.reference_data import DuckDBRefDataLoader
|
|
12
14
|
from dve.core_engine.backends.implementations.duckdb.rules import DuckDBStepImplementations
|
|
13
15
|
from dve.core_engine.models import SubmissionInfo
|
|
14
16
|
from dve.core_engine.type_hints import URI
|
|
@@ -30,7 +32,6 @@ class DDBDVEPipeline(BaseDVEPipeline):
|
|
|
30
32
|
connection: DuckDBPyConnection,
|
|
31
33
|
rules_path: Optional[URI],
|
|
32
34
|
submitted_files_path: Optional[URI],
|
|
33
|
-
reference_data_loader: Optional[type[BaseRefDataLoader]] = None,
|
|
34
35
|
job_run_id: Optional[int] = None,
|
|
35
36
|
logger: Optional[logging.Logger] = None,
|
|
36
37
|
):
|
|
@@ -42,11 +43,20 @@ class DDBDVEPipeline(BaseDVEPipeline):
|
|
|
42
43
|
DuckDBStepImplementations.register_udfs(connection=self._connection),
|
|
43
44
|
rules_path,
|
|
44
45
|
submitted_files_path,
|
|
45
|
-
reference_data_loader,
|
|
46
46
|
job_run_id,
|
|
47
47
|
logger,
|
|
48
48
|
)
|
|
49
49
|
|
|
50
|
+
def init_reference_data_loader(
|
|
51
|
+
self, reference_data_config: dict[str, ReferenceConfig], **kwargs
|
|
52
|
+
) -> DuckDBRefDataLoader:
|
|
53
|
+
return DuckDBRefDataLoader(
|
|
54
|
+
connection=self._connection,
|
|
55
|
+
reference_data_config=reference_data_config,
|
|
56
|
+
dataset_config_uri=fh.get_parent(self._rules_path), # type: ignore
|
|
57
|
+
**kwargs
|
|
58
|
+
)
|
|
59
|
+
|
|
50
60
|
# pylint: disable=arguments-differ
|
|
51
61
|
def write_file_to_parquet( # type: ignore
|
|
52
62
|
self, submission_file_uri: URI, submission_info: SubmissionInfo, output: URI
|
|
@@ -26,7 +26,7 @@ from dve.common.error_utils import (
|
|
|
26
26
|
from dve.core_engine.backends.base.auditing import BaseAuditingManager
|
|
27
27
|
from dve.core_engine.backends.base.contract import BaseDataContract
|
|
28
28
|
from dve.core_engine.backends.base.core import EntityManager
|
|
29
|
-
from dve.core_engine.backends.base.reference_data import BaseRefDataLoader
|
|
29
|
+
from dve.core_engine.backends.base.reference_data import BaseRefDataLoader, ReferenceConfig
|
|
30
30
|
from dve.core_engine.backends.base.rules import BaseStepImplementations
|
|
31
31
|
from dve.core_engine.backends.exceptions import MessageBearingError
|
|
32
32
|
from dve.core_engine.backends.readers import BaseFileReader
|
|
@@ -36,7 +36,7 @@ from dve.core_engine.exceptions import CriticalProcessingError
|
|
|
36
36
|
from dve.core_engine.loggers import get_logger
|
|
37
37
|
from dve.core_engine.message import FeedbackMessage
|
|
38
38
|
from dve.core_engine.models import SubmissionInfo, SubmissionStatisticsRecord
|
|
39
|
-
from dve.core_engine.type_hints import URI, DVEStageName, FileURI, InfoURI
|
|
39
|
+
from dve.core_engine.type_hints import URI, DVEStageName, EntityName, FileURI, InfoURI
|
|
40
40
|
from dve.parser import file_handling as fh
|
|
41
41
|
from dve.parser.file_handling.implementations.file import LocalFilesystemImplementation
|
|
42
42
|
from dve.parser.file_handling.service import _get_implementation
|
|
@@ -62,14 +62,12 @@ class BaseDVEPipeline:
|
|
|
62
62
|
step_implementations: Optional[BaseStepImplementations[EntityType]],
|
|
63
63
|
rules_path: Optional[URI],
|
|
64
64
|
submitted_files_path: Optional[URI],
|
|
65
|
-
reference_data_loader: Optional[type[BaseRefDataLoader]] = None,
|
|
66
65
|
job_run_id: Optional[int] = None,
|
|
67
66
|
logger: Optional[logging.Logger] = None,
|
|
68
67
|
):
|
|
69
68
|
self._submitted_files_path = submitted_files_path
|
|
70
69
|
self._processed_files_path = processed_files_path
|
|
71
70
|
self._rules_path = rules_path
|
|
72
|
-
self._reference_data_loader = reference_data_loader
|
|
73
71
|
self._job_run_id = job_run_id
|
|
74
72
|
self._audit_tables = audit_tables
|
|
75
73
|
self._data_contract = data_contract
|
|
@@ -114,6 +112,12 @@ class BaseDVEPipeline:
|
|
|
114
112
|
"""Get a row count of an entity stored as parquet"""
|
|
115
113
|
raise NotImplementedError()
|
|
116
114
|
|
|
115
|
+
def init_reference_data_loader(
|
|
116
|
+
self, reference_data_config: dict[EntityName, ReferenceConfig], **kwargs
|
|
117
|
+
) -> BaseRefDataLoader:
|
|
118
|
+
"""Get reference data loader if required for business rules"""
|
|
119
|
+
raise NotImplementedError()
|
|
120
|
+
|
|
117
121
|
def get_submission_status(
|
|
118
122
|
self, step_name: DVEStageName, submission_id: str
|
|
119
123
|
) -> SubmissionStatus:
|
|
@@ -527,7 +531,7 @@ class BaseDVEPipeline:
|
|
|
527
531
|
|
|
528
532
|
return processed_files, failed_processing
|
|
529
533
|
|
|
530
|
-
def apply_business_rules(
|
|
534
|
+
def apply_business_rules( # pylint: disable=R0914
|
|
531
535
|
self, submission_info: SubmissionInfo, submission_status: Optional[SubmissionStatus] = None
|
|
532
536
|
) -> tuple[SubmissionInfo, SubmissionStatus]:
|
|
533
537
|
"""Apply the business rules to a given submission, the submission may have failed at the
|
|
@@ -542,9 +546,6 @@ class BaseDVEPipeline:
|
|
|
542
546
|
if not self.rules_path:
|
|
543
547
|
raise AttributeError("business rules path not provided.")
|
|
544
548
|
|
|
545
|
-
if not self._reference_data_loader:
|
|
546
|
-
raise AttributeError("reference data loader not provided.")
|
|
547
|
-
|
|
548
549
|
if not self.processed_files_path:
|
|
549
550
|
raise AttributeError("processed files path has not been provided.")
|
|
550
551
|
|
|
@@ -556,8 +557,10 @@ class BaseDVEPipeline:
|
|
|
556
557
|
self._processed_files_path, submission_info.submission_id
|
|
557
558
|
)
|
|
558
559
|
ref_data = config.get_reference_data_config()
|
|
560
|
+
reference_data: BaseRefDataLoader = self.init_reference_data_loader(
|
|
561
|
+
reference_data_config=ref_data
|
|
562
|
+
)
|
|
559
563
|
rules = config.get_rule_metadata()
|
|
560
|
-
reference_data = self._reference_data_loader(ref_data) # type: ignore
|
|
561
564
|
entities = {}
|
|
562
565
|
contract = fh.joinuri(
|
|
563
566
|
self.processed_files_path, submission_info.submission_id, "data_contract"
|
|
@@ -582,10 +585,7 @@ class BaseDVEPipeline:
|
|
|
582
585
|
key_fields = {model: conf.reporting_fields for model, conf in model_config.items()}
|
|
583
586
|
|
|
584
587
|
_errors_uri, rules_success = self.step_implementations.apply_rules( # type: ignore
|
|
585
|
-
working_directory,
|
|
586
|
-
entity_manager,
|
|
587
|
-
rules,
|
|
588
|
-
key_fields
|
|
588
|
+
working_directory, entity_manager, rules, key_fields
|
|
589
589
|
)
|
|
590
590
|
|
|
591
591
|
rule_messages = load_feedback_messages(
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/pipeline/spark_pipeline.py
RENAMED
|
@@ -6,9 +6,11 @@ from typing import Optional
|
|
|
6
6
|
|
|
7
7
|
from pyspark.sql import DataFrame, SparkSession
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
import dve.parser.file_handling as fh
|
|
10
|
+
from dve.core_engine.backends.base.reference_data import ReferenceConfig
|
|
10
11
|
from dve.core_engine.backends.implementations.spark.auditing import SparkAuditingManager
|
|
11
12
|
from dve.core_engine.backends.implementations.spark.contract import SparkDataContract
|
|
13
|
+
from dve.core_engine.backends.implementations.spark.reference_data import SparkRefDataLoader
|
|
12
14
|
from dve.core_engine.backends.implementations.spark.rules import SparkStepImplementations
|
|
13
15
|
from dve.core_engine.backends.implementations.spark.spark_helpers import spark_get_entity_count
|
|
14
16
|
from dve.core_engine.models import SubmissionInfo
|
|
@@ -31,7 +33,6 @@ class SparkDVEPipeline(BaseDVEPipeline):
|
|
|
31
33
|
audit_tables: SparkAuditingManager,
|
|
32
34
|
rules_path: Optional[URI],
|
|
33
35
|
submitted_files_path: Optional[URI],
|
|
34
|
-
reference_data_loader: Optional[type[BaseRefDataLoader]] = None,
|
|
35
36
|
spark: Optional[SparkSession] = None,
|
|
36
37
|
job_run_id: Optional[int] = None,
|
|
37
38
|
logger: Optional[logging.Logger] = None,
|
|
@@ -44,11 +45,20 @@ class SparkDVEPipeline(BaseDVEPipeline):
|
|
|
44
45
|
SparkStepImplementations.register_udfs(self._spark),
|
|
45
46
|
rules_path,
|
|
46
47
|
submitted_files_path,
|
|
47
|
-
reference_data_loader,
|
|
48
48
|
job_run_id,
|
|
49
49
|
logger,
|
|
50
50
|
)
|
|
51
51
|
|
|
52
|
+
def init_reference_data_loader(
|
|
53
|
+
self, reference_data_config: dict[str, ReferenceConfig], **kwargs
|
|
54
|
+
) -> SparkRefDataLoader:
|
|
55
|
+
return SparkRefDataLoader(
|
|
56
|
+
spark=self._spark,
|
|
57
|
+
reference_data_config=reference_data_config,
|
|
58
|
+
dataset_config_uri=fh.get_parent(self._rules_path), # type: ignore
|
|
59
|
+
**kwargs
|
|
60
|
+
)
|
|
61
|
+
|
|
52
62
|
# pylint: disable=arguments-differ
|
|
53
63
|
def write_file_to_parquet( # type: ignore
|
|
54
64
|
self, submission_file_uri: URI, submission_info: SubmissionInfo, output: URI
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/types.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/constants.py
RENAMED
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/exceptions.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/loggers.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/message.py
RENAMED
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/templating.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/type_hints.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/core_engine/validation.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/metadata_parser/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/metadata_parser/exc.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/metadata_parser/models.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/metadata_parser/utilities.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/reporting/error_report.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.5}/src/dve/reporting/excel_report.py
RENAMED
|
File without changes
|