data-validation-engine 0.7.4__tar.gz → 0.7.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/PKG-INFO +1 -1
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/pyproject.toml +1 -1
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/base/backend.py +4 -32
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/base/reference_data.py +6 -3
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/exceptions.py +14 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +1 -1
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/readers/csv.py +3 -9
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/readers/json.py +7 -2
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/readers/xml.py +5 -4
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/reference_data.py +7 -10
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/backend.py +25 -7
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/reference_data.py +5 -9
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/pipeline/duckdb_pipeline.py +14 -3
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/pipeline/pipeline.py +26 -15
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/pipeline/spark_pipeline.py +13 -3
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/pipeline/utils.py +10 -3
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/LICENSE +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/README.md +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/common/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/common/error_utils.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/base/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/base/auditing.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/base/contract.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/base/core.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/base/reader.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/base/rules.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/base/utilities.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/auditing.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/contract.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/readers/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/rules.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/types.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/utilities.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/auditing.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/contract.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/readers/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/readers/csv.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/readers/json.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/readers/xml.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/rules.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/spark_helpers.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/types.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/utilities.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/metadata/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/metadata/contract.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/metadata/reporting.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/metadata/rules.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/readers/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/readers/csv.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/readers/utilities.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/readers/xml.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/readers/xml_linting.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/types.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/utilities.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/configuration/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/configuration/base.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/configuration/v1/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/configuration/v1/filters.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/configuration/v1/rule_stores/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/configuration/v1/rule_stores/models.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/configuration/v1/steps.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/constants.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/engine.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/exceptions.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/functions/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/functions/implementations.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/loggers.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/message.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/models.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/templating.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/type_hints.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/validation.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/metadata_parser/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/metadata_parser/domain_types.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/metadata_parser/exc.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/metadata_parser/function_library.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/metadata_parser/function_wrapper.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/metadata_parser/model_generator.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/metadata_parser/models.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/metadata_parser/utilities.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/exceptions.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/file_handling/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/file_handling/helpers.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/file_handling/implementations/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/file_handling/implementations/base.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/file_handling/implementations/dbfs.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/file_handling/implementations/file.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/file_handling/implementations/s3.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/file_handling/log_handler.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/file_handling/service.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/file_handling/utilities.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/type_hints.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/utilities.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/pipeline/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/pipeline/foundry_ddb_pipeline.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/reporting/__init__.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/reporting/error_report.py +0 -0
- {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/reporting/excel_report.py +0 -0
|
@@ -24,7 +24,7 @@ Issues = "https://github.com/NHSDigital/data-validation-engine/issues"
|
|
|
24
24
|
Changelog = "https://github.com/NHSDigital/data-validation-engine/blob/main/CHANGELOG.md"
|
|
25
25
|
|
|
26
26
|
[tool.poetry]
|
|
27
|
-
version = "0.7.4"
|
|
27
|
+
version = "0.7.6"
|
|
28
28
|
packages = [
|
|
29
29
|
{ include = "dve", from = "src" },
|
|
30
30
|
]
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
import logging
|
|
4
4
|
import warnings
|
|
5
5
|
from abc import ABC, abstractmethod
|
|
6
|
-
from collections.abc import
|
|
6
|
+
from collections.abc import MutableMapping
|
|
7
7
|
from typing import Any, ClassVar, Generic, Optional
|
|
8
8
|
|
|
9
9
|
from pyspark.sql import DataFrame, SparkSession
|
|
@@ -41,14 +41,12 @@ class BaseBackend(Generic[EntityType], ABC):
|
|
|
41
41
|
self,
|
|
42
42
|
contract: BaseDataContract[EntityType],
|
|
43
43
|
steps: BaseStepImplementations[EntityType],
|
|
44
|
-
reference_data_loader_type: Optional[type[BaseRefDataLoader[EntityType]]],
|
|
45
44
|
logger: Optional[logging.Logger] = None,
|
|
46
45
|
**kwargs: Any,
|
|
47
46
|
) -> None:
|
|
48
47
|
for component_name, component in (
|
|
49
48
|
("Contract", contract),
|
|
50
49
|
("Step implementation", steps),
|
|
51
|
-
("Reference data loader", reference_data_loader_type),
|
|
52
50
|
):
|
|
53
51
|
component_entity_type = getattr(component, "__entity_type__", None)
|
|
54
52
|
if component_entity_type != self.__entity_type__:
|
|
@@ -61,12 +59,6 @@ class BaseBackend(Generic[EntityType], ABC):
|
|
|
61
59
|
"""The data contract implementation used by the backend."""
|
|
62
60
|
self.step_implementations = steps
|
|
63
61
|
"""The step implementations used by the backend."""
|
|
64
|
-
self.reference_data_loader_type = reference_data_loader_type
|
|
65
|
-
"""
|
|
66
|
-
The loader type to use for the reference data. If `None`, do not
|
|
67
|
-
load any reference data and error if it is provided.
|
|
68
|
-
|
|
69
|
-
"""
|
|
70
62
|
self.logger = logger or get_logger(type(self).__name__)
|
|
71
63
|
"""The `logging.Logger instance for the backend."""
|
|
72
64
|
|
|
@@ -74,29 +66,9 @@ class BaseBackend(Generic[EntityType], ABC):
|
|
|
74
66
|
self,
|
|
75
67
|
reference_entity_config: dict[EntityName, ReferenceConfigUnion],
|
|
76
68
|
submission_info: Optional[SubmissionInfo],
|
|
77
|
-
) ->
|
|
78
|
-
"""
|
|
79
|
-
|
|
80
|
-
if submission_info:
|
|
81
|
-
sub_info_entity = self.convert_submission_info(submission_info)
|
|
82
|
-
|
|
83
|
-
if self.reference_data_loader_type is None:
|
|
84
|
-
if reference_entity_config:
|
|
85
|
-
raise ValueError(
|
|
86
|
-
"Reference data has been specified but no reference data loader is "
|
|
87
|
-
+ "configured for this backend"
|
|
88
|
-
)
|
|
89
|
-
|
|
90
|
-
reference_data_dict = {}
|
|
91
|
-
if sub_info_entity is not None:
|
|
92
|
-
reference_data_dict["dve_submission_info"] = sub_info_entity
|
|
93
|
-
return reference_data_dict
|
|
94
|
-
|
|
95
|
-
reference_data_loader = self.reference_data_loader_type(reference_entity_config)
|
|
96
|
-
if sub_info_entity is not None:
|
|
97
|
-
reference_data_loader.entity_cache["dve_submission_info"] = sub_info_entity
|
|
98
|
-
|
|
99
|
-
return reference_data_loader
|
|
69
|
+
) -> BaseRefDataLoader[EntityType]:
|
|
70
|
+
"""Supply configured reference data loader for use with business rules"""
|
|
71
|
+
raise NotImplementedError()
|
|
100
72
|
|
|
101
73
|
@abstractmethod
|
|
102
74
|
def convert_submission_info(self, submission_info: SubmissionInfo) -> EntityType:
|
|
@@ -11,6 +11,7 @@ import dve.parser.file_handling as fh
|
|
|
11
11
|
from dve.core_engine.backends.base.core import get_entity_type
|
|
12
12
|
from dve.core_engine.backends.exceptions import (
|
|
13
13
|
MissingRefDataEntity,
|
|
14
|
+
NoRefDataConfigSupplied,
|
|
14
15
|
RefdataLacksFileExtensionSupport,
|
|
15
16
|
)
|
|
16
17
|
from dve.core_engine.backends.types import EntityType
|
|
@@ -147,11 +148,11 @@ class BaseRefDataLoader(Generic[EntityType], Mapping[EntityName, EntityType], AB
|
|
|
147
148
|
# pylint: disable=unused-argument
|
|
148
149
|
def __init__(
|
|
149
150
|
self,
|
|
150
|
-
|
|
151
|
-
dataset_config_uri:
|
|
151
|
+
reference_data_config: dict[EntityName, ReferenceConfig],
|
|
152
|
+
dataset_config_uri: URI,
|
|
152
153
|
**kwargs,
|
|
153
154
|
) -> None:
|
|
154
|
-
self.reference_entity_config =
|
|
155
|
+
self.reference_entity_config = reference_data_config
|
|
155
156
|
self.dataset_config_uri = dataset_config_uri
|
|
156
157
|
"""
|
|
157
158
|
Configuration options for the reference data. This is likely to vary
|
|
@@ -207,6 +208,8 @@ class BaseRefDataLoader(Generic[EntityType], Mapping[EntityName, EntityType], AB
|
|
|
207
208
|
try:
|
|
208
209
|
config = self.reference_entity_config[key]
|
|
209
210
|
return self.load_entity(entity_name=key, config=config)
|
|
211
|
+
except TypeError as err:
|
|
212
|
+
raise NoRefDataConfigSupplied() from err
|
|
210
213
|
except Exception as err:
|
|
211
214
|
raise MissingRefDataEntity(entity_name=key) from err
|
|
212
215
|
|
|
@@ -119,6 +119,20 @@ class MissingRefDataEntity(MissingEntity, BackendErrorMixin): # pylint: disable
|
|
|
119
119
|
return f"Missing reference data entity {self.entity_name!r}"
|
|
120
120
|
|
|
121
121
|
|
|
122
|
+
class NoRefDataConfigSupplied(BackendError):
|
|
123
|
+
"""An error raised when trying to load a refdata entity when no refdata
|
|
124
|
+
config has been supplied.
|
|
125
|
+
|
|
126
|
+
"""
|
|
127
|
+
|
|
128
|
+
def __init__(self, *args: object) -> None:
|
|
129
|
+
super().__init__(*args)
|
|
130
|
+
|
|
131
|
+
def get_message_preamble(self) -> EntityName:
|
|
132
|
+
"""Message for logging purposes"""
|
|
133
|
+
return "Refdata loader not supplied with refdata config - unable to load refdata entities"
|
|
134
|
+
|
|
135
|
+
|
|
122
136
|
class ConstraintError(ValueError, BackendErrorMixin):
|
|
123
137
|
"""Raised when a given constraint is violated."""
|
|
124
138
|
|
|
@@ -411,7 +411,7 @@ def get_duckdb_cast_statement_from_annotation(
|
|
|
411
411
|
stmt = rf"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{date_regex}') THEN TRY_CAST(TRIM({quoted_name}) as DATE) ELSE NULL END" # pylint: disable=C0301
|
|
412
412
|
return stmt
|
|
413
413
|
if issubclass(type_, time):
|
|
414
|
-
stmt = rf"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{time_regex}') THEN TRY_CAST(TRIM({quoted_name}) as TIME) ELSE NULL END"
|
|
414
|
+
stmt = rf"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{time_regex}') THEN TRY_CAST(TRIM({quoted_name}) as TIME) ELSE NULL END" # pylint: disable=C0301
|
|
415
415
|
return stmt
|
|
416
416
|
duck_type = get_duckdb_type_from_annotation(type_)
|
|
417
417
|
if duck_type:
|
|
@@ -6,13 +6,7 @@ from typing import Any, Optional
|
|
|
6
6
|
|
|
7
7
|
import duckdb as ddb
|
|
8
8
|
import polars as pl
|
|
9
|
-
from duckdb import
|
|
10
|
-
DuckDBPyConnection,
|
|
11
|
-
DuckDBPyRelation,
|
|
12
|
-
StarExpression,
|
|
13
|
-
default_connection,
|
|
14
|
-
read_csv,
|
|
15
|
-
)
|
|
9
|
+
from duckdb import DuckDBPyConnection, DuckDBPyRelation, StarExpression, read_csv
|
|
16
10
|
from pydantic import BaseModel
|
|
17
11
|
|
|
18
12
|
from dve.core_engine.backends.base.reader import BaseFileReader, read_function
|
|
@@ -61,7 +55,7 @@ class DuckDBCSVReader(BaseFileReader):
|
|
|
61
55
|
self.header = header
|
|
62
56
|
self.delim = delim
|
|
63
57
|
self.quotechar = quotechar
|
|
64
|
-
self._connection = connection if connection else
|
|
58
|
+
self._connection = connection if connection else ddb.connect(":memory:")
|
|
65
59
|
self.field_check = field_check
|
|
66
60
|
self.field_check_error_code = field_check_error_code
|
|
67
61
|
self.field_check_error_message = field_check_error_message
|
|
@@ -181,7 +175,7 @@ class PolarsToDuckDBCSVReader(DuckDBCSVReader):
|
|
|
181
175
|
] + [pl.col(RECORD_INDEX_COLUMN_NAME)]
|
|
182
176
|
df = df.select(pl_exprs)
|
|
183
177
|
|
|
184
|
-
return
|
|
178
|
+
return self._connection.sql("SELECT * FROM df")
|
|
185
179
|
|
|
186
180
|
|
|
187
181
|
class DuckDBCSVRepeatingHeaderReader(PolarsToDuckDBCSVReader):
|
|
@@ -4,7 +4,8 @@
|
|
|
4
4
|
from collections.abc import Iterator
|
|
5
5
|
from typing import Any, Optional
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
import duckdb
|
|
8
|
+
from duckdb import DuckDBPyConnection, DuckDBPyRelation
|
|
8
9
|
from pydantic import BaseModel
|
|
9
10
|
|
|
10
11
|
from dve.core_engine.backends.base.reader import BaseFileReader, read_function
|
|
@@ -26,9 +27,11 @@ class DuckDBJSONReader(BaseFileReader):
|
|
|
26
27
|
self,
|
|
27
28
|
*,
|
|
28
29
|
json_format: Optional[str] = "array",
|
|
30
|
+
connection: Optional[DuckDBPyConnection] = None,
|
|
29
31
|
**_,
|
|
30
32
|
):
|
|
31
33
|
self._json_format = json_format
|
|
34
|
+
self._connection = duckdb.connect(":memory:") if not connection else connection
|
|
32
35
|
|
|
33
36
|
super().__init__()
|
|
34
37
|
|
|
@@ -50,5 +53,7 @@ class DuckDBJSONReader(BaseFileReader):
|
|
|
50
53
|
}
|
|
51
54
|
|
|
52
55
|
return self.add_record_index(
|
|
53
|
-
read_json(
|
|
56
|
+
self._connection.read_json(
|
|
57
|
+
resource, columns=ddb_schema, format=self._json_format # type: ignore
|
|
58
|
+
)
|
|
54
59
|
)
|
|
@@ -3,8 +3,9 @@
|
|
|
3
3
|
|
|
4
4
|
from typing import Optional
|
|
5
5
|
|
|
6
|
+
import duckdb
|
|
6
7
|
import polars as pl
|
|
7
|
-
from duckdb import DuckDBPyConnection, DuckDBPyRelation
|
|
8
|
+
from duckdb import DuckDBPyConnection, DuckDBPyRelation
|
|
8
9
|
from pydantic import BaseModel
|
|
9
10
|
|
|
10
11
|
from dve.core_engine.backends.base.reader import read_function
|
|
@@ -24,8 +25,8 @@ from dve.core_engine.type_hints import URI
|
|
|
24
25
|
class DuckDBXMLStreamReader(XMLStreamReader):
|
|
25
26
|
"""A reader for XML files"""
|
|
26
27
|
|
|
27
|
-
def __init__(self, *,
|
|
28
|
-
self.
|
|
28
|
+
def __init__(self, *, connection: Optional[DuckDBPyConnection] = None, **kwargs):
|
|
29
|
+
self._connection = connection if connection else duckdb.connect(":memory:")
|
|
29
30
|
super().__init__(**kwargs)
|
|
30
31
|
|
|
31
32
|
@read_function(DuckDBPyRelation)
|
|
@@ -49,4 +50,4 @@ class DuckDBXMLStreamReader(XMLStreamReader):
|
|
|
49
50
|
data=self.read_to_py_iterator(resource, entity_name, schema), schema=polars_schema
|
|
50
51
|
)
|
|
51
52
|
)
|
|
52
|
-
return self.
|
|
53
|
+
return self._connection.sql("select * from _lazy_frame")
|
|
@@ -1,13 +1,11 @@
|
|
|
1
1
|
"""A reference data loader for duckdb."""
|
|
2
2
|
|
|
3
|
-
from typing import Optional
|
|
4
|
-
|
|
5
3
|
from duckdb import DuckDBPyConnection, DuckDBPyRelation
|
|
6
4
|
from pyarrow import ipc # type: ignore
|
|
7
5
|
|
|
8
6
|
from dve.core_engine.backends.base.reference_data import (
|
|
9
7
|
BaseRefDataLoader,
|
|
10
|
-
|
|
8
|
+
ReferenceConfig,
|
|
11
9
|
ReferenceTable,
|
|
12
10
|
mark_refdata_file_extension,
|
|
13
11
|
)
|
|
@@ -19,17 +17,16 @@ from dve.parser.type_hints import URI
|
|
|
19
17
|
class DuckDBRefDataLoader(BaseRefDataLoader[DuckDBPyRelation]):
|
|
20
18
|
"""A reference data loader using already existing DuckDB tables."""
|
|
21
19
|
|
|
22
|
-
connection: DuckDBPyConnection
|
|
23
|
-
"""The DuckDB connection for the backend."""
|
|
24
|
-
dataset_config_uri: Optional[URI] = None
|
|
25
|
-
"""The location of the dischema file"""
|
|
26
|
-
|
|
27
20
|
def __init__(
|
|
28
21
|
self,
|
|
29
|
-
|
|
22
|
+
connection: DuckDBPyConnection,
|
|
23
|
+
reference_data_config: dict[EntityName, ReferenceConfig],
|
|
24
|
+
dataset_config_uri: URI,
|
|
30
25
|
**kwargs,
|
|
31
26
|
) -> None:
|
|
32
|
-
super().__init__(
|
|
27
|
+
super().__init__(reference_data_config, dataset_config_uri, **kwargs)
|
|
28
|
+
|
|
29
|
+
self.connection = connection
|
|
33
30
|
|
|
34
31
|
if not self.connection:
|
|
35
32
|
raise AttributeError("DuckDBConnection must be specified")
|
|
@@ -6,6 +6,7 @@ from typing import Any, Optional
|
|
|
6
6
|
from pyspark.sql import DataFrame, SparkSession
|
|
7
7
|
|
|
8
8
|
from dve.core_engine.backends.base.backend import BaseBackend
|
|
9
|
+
from dve.core_engine.backends.base.reference_data import ReferenceConfigUnion
|
|
9
10
|
from dve.core_engine.backends.implementations.spark.contract import SparkDataContract
|
|
10
11
|
from dve.core_engine.backends.implementations.spark.reference_data import SparkRefDataLoader
|
|
11
12
|
from dve.core_engine.backends.implementations.spark.rules import SparkStepImplementations
|
|
@@ -14,7 +15,7 @@ from dve.core_engine.backends.implementations.spark.types import SparkEntities
|
|
|
14
15
|
from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME
|
|
15
16
|
from dve.core_engine.loggers import get_child_logger, get_logger
|
|
16
17
|
from dve.core_engine.models import SubmissionInfo
|
|
17
|
-
from dve.core_engine.type_hints import URI, EntityParquetLocations
|
|
18
|
+
from dve.core_engine.type_hints import URI, EntityName, EntityParquetLocations
|
|
18
19
|
from dve.parser.file_handling import get_resource_exists, joinuri
|
|
19
20
|
|
|
20
21
|
|
|
@@ -26,7 +27,6 @@ class SparkBackend(BaseBackend[DataFrame]):
|
|
|
26
27
|
dataset_config_uri: Optional[URI] = None,
|
|
27
28
|
contract: Optional[SparkDataContract] = None,
|
|
28
29
|
steps: Optional[SparkStepImplementations] = None,
|
|
29
|
-
reference_data_loader: Optional[type[SparkRefDataLoader]] = None,
|
|
30
30
|
logger: Optional[logging.Logger] = None,
|
|
31
31
|
spark_session: Optional[SparkSession] = None,
|
|
32
32
|
**kwargs: Any,
|
|
@@ -36,6 +36,8 @@ class SparkBackend(BaseBackend[DataFrame]):
|
|
|
36
36
|
|
|
37
37
|
self.spark_session = spark_session or SparkSession.builder.getOrCreate()
|
|
38
38
|
"""The Spark session for the backend."""
|
|
39
|
+
self.dataset_config_uri = dataset_config_uri
|
|
40
|
+
"""The uri of the dischema specifying the DVE config"""
|
|
39
41
|
|
|
40
42
|
if contract is None:
|
|
41
43
|
contract = SparkDataContract(
|
|
@@ -46,11 +48,27 @@ class SparkBackend(BaseBackend[DataFrame]):
|
|
|
46
48
|
steps = SparkStepImplementations.register_udfs(
|
|
47
49
|
logger=get_child_logger("SparkStepImplementations", logger)
|
|
48
50
|
)
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
51
|
+
super().__init__(contract, steps, logger, **kwargs)
|
|
52
|
+
|
|
53
|
+
def load_reference_data(
|
|
54
|
+
self,
|
|
55
|
+
reference_entity_config: dict[EntityName, ReferenceConfigUnion],
|
|
56
|
+
submission_info: Optional[SubmissionInfo],
|
|
57
|
+
):
|
|
58
|
+
"""Load the reference data as specified in the reference entity config."""
|
|
59
|
+
sub_info_entity: Optional[DataFrame] = None
|
|
60
|
+
if submission_info:
|
|
61
|
+
sub_info_entity = self.convert_submission_info(submission_info)
|
|
62
|
+
|
|
63
|
+
reference_data_loader = SparkRefDataLoader(
|
|
64
|
+
spark=self.spark_session,
|
|
65
|
+
reference_data_config=reference_entity_config,
|
|
66
|
+
dataset_config_uri=self.dataset_config_uri, # type: ignore
|
|
67
|
+
)
|
|
68
|
+
if sub_info_entity is not None:
|
|
69
|
+
reference_data_loader.entity_cache["dve_submission_info"] = sub_info_entity
|
|
70
|
+
|
|
71
|
+
return reference_data_loader
|
|
54
72
|
|
|
55
73
|
def write_entities_to_parquet(
|
|
56
74
|
self, entities: SparkEntities, cache_prefix: URI
|
|
@@ -1,8 +1,6 @@
|
|
|
1
1
|
# pylint: disable=no-member
|
|
2
2
|
"""A reference data loader for Spark."""
|
|
3
3
|
|
|
4
|
-
from typing import Optional
|
|
5
|
-
|
|
6
4
|
from pyspark.sql import DataFrame, SparkSession
|
|
7
5
|
|
|
8
6
|
from dve.core_engine.backends.base.reference_data import (
|
|
@@ -19,17 +17,15 @@ from dve.parser.type_hints import URI
|
|
|
19
17
|
class SparkRefDataLoader(BaseRefDataLoader[DataFrame]):
|
|
20
18
|
"""A reference data loader using already existing Apache Spark Tables."""
|
|
21
19
|
|
|
22
|
-
spark: SparkSession
|
|
23
|
-
"""The Spark session for the backend."""
|
|
24
|
-
dataset_config_uri: Optional[URI] = None
|
|
25
|
-
"""The location of the dischema file defining business rules"""
|
|
26
|
-
|
|
27
20
|
def __init__(
|
|
28
21
|
self,
|
|
29
|
-
|
|
22
|
+
spark: SparkSession,
|
|
23
|
+
reference_data_config: dict[EntityName, ReferenceConfig],
|
|
24
|
+
dataset_config_uri: URI,
|
|
30
25
|
**kwargs,
|
|
31
26
|
) -> None:
|
|
32
|
-
super().__init__(
|
|
27
|
+
super().__init__(reference_data_config, dataset_config_uri, **kwargs)
|
|
28
|
+
self.spark = spark
|
|
33
29
|
if not self.spark:
|
|
34
30
|
raise AttributeError("Spark session must be provided")
|
|
35
31
|
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/pipeline/duckdb_pipeline.py
RENAMED
|
@@ -5,10 +5,12 @@ from typing import Optional
|
|
|
5
5
|
|
|
6
6
|
from duckdb import DuckDBPyConnection, DuckDBPyRelation
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
import dve.parser.file_handling as fh
|
|
9
|
+
from dve.core_engine.backends.base.reference_data import ReferenceConfig
|
|
9
10
|
from dve.core_engine.backends.implementations.duckdb.auditing import DDBAuditingManager
|
|
10
11
|
from dve.core_engine.backends.implementations.duckdb.contract import DuckDBDataContract
|
|
11
12
|
from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import duckdb_get_entity_count
|
|
13
|
+
from dve.core_engine.backends.implementations.duckdb.reference_data import DuckDBRefDataLoader
|
|
12
14
|
from dve.core_engine.backends.implementations.duckdb.rules import DuckDBStepImplementations
|
|
13
15
|
from dve.core_engine.models import SubmissionInfo
|
|
14
16
|
from dve.core_engine.type_hints import URI
|
|
@@ -30,7 +32,6 @@ class DDBDVEPipeline(BaseDVEPipeline):
|
|
|
30
32
|
connection: DuckDBPyConnection,
|
|
31
33
|
rules_path: Optional[URI],
|
|
32
34
|
submitted_files_path: Optional[URI],
|
|
33
|
-
reference_data_loader: Optional[type[BaseRefDataLoader]] = None,
|
|
34
35
|
job_run_id: Optional[int] = None,
|
|
35
36
|
logger: Optional[logging.Logger] = None,
|
|
36
37
|
):
|
|
@@ -42,9 +43,19 @@ class DDBDVEPipeline(BaseDVEPipeline):
|
|
|
42
43
|
DuckDBStepImplementations.register_udfs(connection=self._connection),
|
|
43
44
|
rules_path,
|
|
44
45
|
submitted_files_path,
|
|
45
|
-
reference_data_loader,
|
|
46
46
|
job_run_id,
|
|
47
47
|
logger,
|
|
48
|
+
{"connection": self._connection},
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
def init_reference_data_loader(
|
|
52
|
+
self, reference_data_config: dict[str, ReferenceConfig], **kwargs
|
|
53
|
+
) -> DuckDBRefDataLoader:
|
|
54
|
+
return DuckDBRefDataLoader(
|
|
55
|
+
connection=self._connection,
|
|
56
|
+
reference_data_config=reference_data_config,
|
|
57
|
+
dataset_config_uri=fh.get_parent(self._rules_path), # type: ignore
|
|
58
|
+
**kwargs
|
|
48
59
|
)
|
|
49
60
|
|
|
50
61
|
# pylint: disable=arguments-differ
|
|
@@ -9,7 +9,7 @@ from concurrent.futures import Executor, Future, ThreadPoolExecutor
|
|
|
9
9
|
from functools import lru_cache
|
|
10
10
|
from itertools import starmap
|
|
11
11
|
from threading import Lock
|
|
12
|
-
from typing import Optional, Union
|
|
12
|
+
from typing import Any, Optional, Union
|
|
13
13
|
from uuid import uuid4
|
|
14
14
|
|
|
15
15
|
import polars as pl
|
|
@@ -26,7 +26,7 @@ from dve.common.error_utils import (
|
|
|
26
26
|
from dve.core_engine.backends.base.auditing import BaseAuditingManager
|
|
27
27
|
from dve.core_engine.backends.base.contract import BaseDataContract
|
|
28
28
|
from dve.core_engine.backends.base.core import EntityManager
|
|
29
|
-
from dve.core_engine.backends.base.reference_data import BaseRefDataLoader
|
|
29
|
+
from dve.core_engine.backends.base.reference_data import BaseRefDataLoader, ReferenceConfig
|
|
30
30
|
from dve.core_engine.backends.base.rules import BaseStepImplementations
|
|
31
31
|
from dve.core_engine.backends.exceptions import MessageBearingError
|
|
32
32
|
from dve.core_engine.backends.readers import BaseFileReader
|
|
@@ -36,7 +36,7 @@ from dve.core_engine.exceptions import CriticalProcessingError
|
|
|
36
36
|
from dve.core_engine.loggers import get_logger
|
|
37
37
|
from dve.core_engine.message import FeedbackMessage
|
|
38
38
|
from dve.core_engine.models import SubmissionInfo, SubmissionStatisticsRecord
|
|
39
|
-
from dve.core_engine.type_hints import URI, DVEStageName, FileURI, InfoURI
|
|
39
|
+
from dve.core_engine.type_hints import URI, DVEStageName, EntityName, FileURI, InfoURI
|
|
40
40
|
from dve.parser import file_handling as fh
|
|
41
41
|
from dve.parser.file_handling.implementations.file import LocalFilesystemImplementation
|
|
42
42
|
from dve.parser.file_handling.service import _get_implementation
|
|
@@ -49,6 +49,7 @@ PERMISSIBLE_EXCEPTIONS: tuple[type[Exception]] = (
|
|
|
49
49
|
)
|
|
50
50
|
|
|
51
51
|
|
|
52
|
+
# pylint: disable=R0904
|
|
52
53
|
class BaseDVEPipeline:
|
|
53
54
|
"""
|
|
54
55
|
Base class for running a DVE Pipeline either by a given step or a full e2e process.
|
|
@@ -62,14 +63,13 @@ class BaseDVEPipeline:
|
|
|
62
63
|
step_implementations: Optional[BaseStepImplementations[EntityType]],
|
|
63
64
|
rules_path: Optional[URI],
|
|
64
65
|
submitted_files_path: Optional[URI],
|
|
65
|
-
reference_data_loader: Optional[type[BaseRefDataLoader]] = None,
|
|
66
66
|
job_run_id: Optional[int] = None,
|
|
67
67
|
logger: Optional[logging.Logger] = None,
|
|
68
|
+
backend_reader_kwargs: Optional[dict[str, Any]] = None,
|
|
68
69
|
):
|
|
69
70
|
self._submitted_files_path = submitted_files_path
|
|
70
71
|
self._processed_files_path = processed_files_path
|
|
71
72
|
self._rules_path = rules_path
|
|
72
|
-
self._reference_data_loader = reference_data_loader
|
|
73
73
|
self._job_run_id = job_run_id
|
|
74
74
|
self._audit_tables = audit_tables
|
|
75
75
|
self._data_contract = data_contract
|
|
@@ -78,6 +78,7 @@ class BaseDVEPipeline:
|
|
|
78
78
|
self._summary_lock = Lock()
|
|
79
79
|
self._rec_tracking_lock = Lock()
|
|
80
80
|
self._aggregates_lock = Lock()
|
|
81
|
+
self._backend_reader_kwargs = backend_reader_kwargs
|
|
81
82
|
|
|
82
83
|
if self._data_contract:
|
|
83
84
|
self._data_contract.logger = self._logger
|
|
@@ -109,11 +110,23 @@ class BaseDVEPipeline:
|
|
|
109
110
|
"""The step implementations to apply the business rules to a given dataset"""
|
|
110
111
|
return self._step_implementations
|
|
111
112
|
|
|
113
|
+
@property
|
|
114
|
+
def backend_reader_kwargs(self) -> dict[str, Any] | None:
|
|
115
|
+
"""Important required arguments for all readers related to the specific backend
|
|
116
|
+
that can't be specified at time of writing config eg. duckdb connection"""
|
|
117
|
+
return self._backend_reader_kwargs
|
|
118
|
+
|
|
112
119
|
@staticmethod
|
|
113
120
|
def get_entity_count(entity: EntityType) -> int:
|
|
114
121
|
"""Get a row count of an entity stored as parquet"""
|
|
115
122
|
raise NotImplementedError()
|
|
116
123
|
|
|
124
|
+
def init_reference_data_loader(
|
|
125
|
+
self, reference_data_config: dict[EntityName, ReferenceConfig], **kwargs
|
|
126
|
+
) -> BaseRefDataLoader:
|
|
127
|
+
"""Get reference data loader if required for business rules"""
|
|
128
|
+
raise NotImplementedError()
|
|
129
|
+
|
|
117
130
|
def get_submission_status(
|
|
118
131
|
self, step_name: DVEStageName, submission_id: str
|
|
119
132
|
) -> SubmissionStatus:
|
|
@@ -199,7 +212,9 @@ class BaseDVEPipeline:
|
|
|
199
212
|
|
|
200
213
|
for model_name, model in models.items():
|
|
201
214
|
self._logger.info(f"Transforming {model_name} to stringified parquet")
|
|
202
|
-
reader: BaseFileReader = load_reader(
|
|
215
|
+
reader: BaseFileReader = load_reader(
|
|
216
|
+
dataset, model_name, ext, self.backend_reader_kwargs
|
|
217
|
+
)
|
|
203
218
|
try:
|
|
204
219
|
if not entity_type:
|
|
205
220
|
reader.write_parquet(
|
|
@@ -527,7 +542,7 @@ class BaseDVEPipeline:
|
|
|
527
542
|
|
|
528
543
|
return processed_files, failed_processing
|
|
529
544
|
|
|
530
|
-
def apply_business_rules(
|
|
545
|
+
def apply_business_rules( # pylint: disable=R0914
|
|
531
546
|
self, submission_info: SubmissionInfo, submission_status: Optional[SubmissionStatus] = None
|
|
532
547
|
) -> tuple[SubmissionInfo, SubmissionStatus]:
|
|
533
548
|
"""Apply the business rules to a given submission, the submission may have failed at the
|
|
@@ -542,9 +557,6 @@ class BaseDVEPipeline:
|
|
|
542
557
|
if not self.rules_path:
|
|
543
558
|
raise AttributeError("business rules path not provided.")
|
|
544
559
|
|
|
545
|
-
if not self._reference_data_loader:
|
|
546
|
-
raise AttributeError("reference data loader not provided.")
|
|
547
|
-
|
|
548
560
|
if not self.processed_files_path:
|
|
549
561
|
raise AttributeError("processed files path has not been provided.")
|
|
550
562
|
|
|
@@ -556,8 +568,10 @@ class BaseDVEPipeline:
|
|
|
556
568
|
self._processed_files_path, submission_info.submission_id
|
|
557
569
|
)
|
|
558
570
|
ref_data = config.get_reference_data_config()
|
|
571
|
+
reference_data: BaseRefDataLoader = self.init_reference_data_loader(
|
|
572
|
+
reference_data_config=ref_data
|
|
573
|
+
)
|
|
559
574
|
rules = config.get_rule_metadata()
|
|
560
|
-
reference_data = self._reference_data_loader(ref_data) # type: ignore
|
|
561
575
|
entities = {}
|
|
562
576
|
contract = fh.joinuri(
|
|
563
577
|
self.processed_files_path, submission_info.submission_id, "data_contract"
|
|
@@ -582,10 +596,7 @@ class BaseDVEPipeline:
|
|
|
582
596
|
key_fields = {model: conf.reporting_fields for model, conf in model_config.items()}
|
|
583
597
|
|
|
584
598
|
_errors_uri, rules_success = self.step_implementations.apply_rules( # type: ignore
|
|
585
|
-
working_directory,
|
|
586
|
-
entity_manager,
|
|
587
|
-
rules,
|
|
588
|
-
key_fields
|
|
599
|
+
working_directory, entity_manager, rules, key_fields
|
|
589
600
|
)
|
|
590
601
|
|
|
591
602
|
rule_messages = load_feedback_messages(
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/pipeline/spark_pipeline.py
RENAMED
|
@@ -6,9 +6,11 @@ from typing import Optional
|
|
|
6
6
|
|
|
7
7
|
from pyspark.sql import DataFrame, SparkSession
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
import dve.parser.file_handling as fh
|
|
10
|
+
from dve.core_engine.backends.base.reference_data import ReferenceConfig
|
|
10
11
|
from dve.core_engine.backends.implementations.spark.auditing import SparkAuditingManager
|
|
11
12
|
from dve.core_engine.backends.implementations.spark.contract import SparkDataContract
|
|
13
|
+
from dve.core_engine.backends.implementations.spark.reference_data import SparkRefDataLoader
|
|
12
14
|
from dve.core_engine.backends.implementations.spark.rules import SparkStepImplementations
|
|
13
15
|
from dve.core_engine.backends.implementations.spark.spark_helpers import spark_get_entity_count
|
|
14
16
|
from dve.core_engine.models import SubmissionInfo
|
|
@@ -31,7 +33,6 @@ class SparkDVEPipeline(BaseDVEPipeline):
|
|
|
31
33
|
audit_tables: SparkAuditingManager,
|
|
32
34
|
rules_path: Optional[URI],
|
|
33
35
|
submitted_files_path: Optional[URI],
|
|
34
|
-
reference_data_loader: Optional[type[BaseRefDataLoader]] = None,
|
|
35
36
|
spark: Optional[SparkSession] = None,
|
|
36
37
|
job_run_id: Optional[int] = None,
|
|
37
38
|
logger: Optional[logging.Logger] = None,
|
|
@@ -44,11 +45,20 @@ class SparkDVEPipeline(BaseDVEPipeline):
|
|
|
44
45
|
SparkStepImplementations.register_udfs(self._spark),
|
|
45
46
|
rules_path,
|
|
46
47
|
submitted_files_path,
|
|
47
|
-
reference_data_loader,
|
|
48
48
|
job_run_id,
|
|
49
49
|
logger,
|
|
50
50
|
)
|
|
51
51
|
|
|
52
|
+
def init_reference_data_loader(
|
|
53
|
+
self, reference_data_config: dict[str, ReferenceConfig], **kwargs
|
|
54
|
+
) -> SparkRefDataLoader:
|
|
55
|
+
return SparkRefDataLoader(
|
|
56
|
+
spark=self._spark,
|
|
57
|
+
reference_data_config=reference_data_config,
|
|
58
|
+
dataset_config_uri=fh.get_parent(self._rules_path), # type: ignore
|
|
59
|
+
**kwargs
|
|
60
|
+
)
|
|
61
|
+
|
|
52
62
|
# pylint: disable=arguments-differ
|
|
53
63
|
def write_file_to_parquet( # type: ignore
|
|
54
64
|
self, submission_file_uri: URI, submission_info: SubmissionInfo, output: URI
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
|
|
4
4
|
import json
|
|
5
5
|
from threading import Lock
|
|
6
|
-
from typing import Optional
|
|
6
|
+
from typing import Any, Optional
|
|
7
7
|
|
|
8
8
|
from pydantic.main import ModelMetaclass
|
|
9
9
|
from pyspark.sql import SparkSession
|
|
@@ -45,10 +45,17 @@ def load_config(
|
|
|
45
45
|
return models, config, dataset
|
|
46
46
|
|
|
47
47
|
|
|
48
|
-
def load_reader(
|
|
48
|
+
def load_reader(
|
|
49
|
+
dataset: Dataset,
|
|
50
|
+
model_name: str,
|
|
51
|
+
file_extension: str,
|
|
52
|
+
backend_reader_kwargs: Optional[dict[str, Any]] = None,
|
|
53
|
+
):
|
|
49
54
|
"""Loads the readers for the diven feed, model name and file extension"""
|
|
50
55
|
reader_config = dataset[model_name].reader_config[f".{file_extension.lower()}"]
|
|
51
|
-
reader = _READER_REGISTRY[reader_config.reader](
|
|
56
|
+
reader = _READER_REGISTRY[reader_config.reader](
|
|
57
|
+
**reader_config.kwargs_, **backend_reader_kwargs if backend_reader_kwargs else {}
|
|
58
|
+
)
|
|
52
59
|
return reader
|
|
53
60
|
|
|
54
61
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/types.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/constants.py
RENAMED
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/exceptions.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/loggers.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/message.py
RENAMED
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/templating.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/type_hints.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/validation.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/metadata_parser/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/metadata_parser/exc.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/metadata_parser/models.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/metadata_parser/utilities.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/reporting/error_report.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/reporting/excel_report.py
RENAMED
|
File without changes
|