data-validation-engine 0.7.3__tar.gz → 0.7.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105):
  1. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/PKG-INFO +1 -1
  2. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/pyproject.toml +1 -1
  3. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/base/backend.py +4 -32
  4. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/base/reference_data.py +6 -3
  5. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/exceptions.py +14 -0
  6. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +1 -1
  7. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/reference_data.py +7 -10
  8. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/backend.py +25 -7
  9. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/reference_data.py +5 -9
  10. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/pipeline/duckdb_pipeline.py +13 -3
  11. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/pipeline/foundry_ddb_pipeline.py +3 -0
  12. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/pipeline/pipeline.py +20 -12
  13. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/pipeline/spark_pipeline.py +13 -3
  14. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/LICENSE +0 -0
  15. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/README.md +0 -0
  16. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/__init__.py +0 -0
  17. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/common/__init__.py +0 -0
  18. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/common/error_utils.py +0 -0
  19. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/__init__.py +0 -0
  20. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/__init__.py +0 -0
  21. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/base/__init__.py +0 -0
  22. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/base/auditing.py +0 -0
  23. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/base/contract.py +0 -0
  24. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/base/core.py +0 -0
  25. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/base/reader.py +0 -0
  26. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/base/rules.py +0 -0
  27. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/base/utilities.py +0 -0
  28. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/__init__.py +0 -0
  29. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/__init__.py +0 -0
  30. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/auditing.py +0 -0
  31. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/contract.py +0 -0
  32. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/readers/__init__.py +0 -0
  33. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/readers/csv.py +0 -0
  34. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/readers/json.py +0 -0
  35. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/readers/xml.py +0 -0
  36. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/rules.py +0 -0
  37. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/types.py +0 -0
  38. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/duckdb/utilities.py +0 -0
  39. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/__init__.py +0 -0
  40. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/auditing.py +0 -0
  41. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/contract.py +0 -0
  42. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/readers/__init__.py +0 -0
  43. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/readers/csv.py +0 -0
  44. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/readers/json.py +0 -0
  45. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/readers/xml.py +0 -0
  46. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/rules.py +0 -0
  47. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/spark_helpers.py +0 -0
  48. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/types.py +0 -0
  49. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/implementations/spark/utilities.py +0 -0
  50. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/metadata/__init__.py +0 -0
  51. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/metadata/contract.py +0 -0
  52. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/metadata/reporting.py +0 -0
  53. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/metadata/rules.py +0 -0
  54. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/readers/__init__.py +0 -0
  55. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/readers/csv.py +0 -0
  56. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/readers/utilities.py +0 -0
  57. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/readers/xml.py +0 -0
  58. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/readers/xml_linting.py +0 -0
  59. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/types.py +0 -0
  60. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/backends/utilities.py +0 -0
  61. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/configuration/__init__.py +0 -0
  62. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/configuration/base.py +0 -0
  63. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/configuration/v1/__init__.py +0 -0
  64. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/configuration/v1/filters.py +0 -0
  65. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/configuration/v1/rule_stores/__init__.py +0 -0
  66. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/configuration/v1/rule_stores/models.py +0 -0
  67. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/configuration/v1/steps.py +0 -0
  68. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/constants.py +0 -0
  69. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/engine.py +0 -0
  70. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/exceptions.py +0 -0
  71. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/functions/__init__.py +0 -0
  72. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/functions/implementations.py +0 -0
  73. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/loggers.py +0 -0
  74. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/message.py +0 -0
  75. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/models.py +0 -0
  76. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/templating.py +0 -0
  77. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/type_hints.py +0 -0
  78. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/core_engine/validation.py +0 -0
  79. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/metadata_parser/__init__.py +0 -0
  80. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/metadata_parser/domain_types.py +0 -0
  81. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/metadata_parser/exc.py +0 -0
  82. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/metadata_parser/function_library.py +0 -0
  83. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/metadata_parser/function_wrapper.py +0 -0
  84. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/metadata_parser/model_generator.py +0 -0
  85. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/metadata_parser/models.py +0 -0
  86. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/metadata_parser/utilities.py +0 -0
  87. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/parser/__init__.py +0 -0
  88. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/parser/exceptions.py +0 -0
  89. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/parser/file_handling/__init__.py +0 -0
  90. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/parser/file_handling/helpers.py +0 -0
  91. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/parser/file_handling/implementations/__init__.py +0 -0
  92. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/parser/file_handling/implementations/base.py +0 -0
  93. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/parser/file_handling/implementations/dbfs.py +0 -0
  94. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/parser/file_handling/implementations/file.py +0 -0
  95. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/parser/file_handling/implementations/s3.py +0 -0
  96. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/parser/file_handling/log_handler.py +0 -0
  97. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/parser/file_handling/service.py +0 -0
  98. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/parser/file_handling/utilities.py +0 -0
  99. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/parser/type_hints.py +0 -0
  100. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/parser/utilities.py +0 -0
  101. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/pipeline/__init__.py +0 -0
  102. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/pipeline/utils.py +0 -0
  103. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/reporting/__init__.py +0 -0
  104. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/reporting/error_report.py +0 -0
  105. {data_validation_engine-0.7.3 → data_validation_engine-0.7.5}/src/dve/reporting/excel_report.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-validation-engine
3
- Version: 0.7.3
3
+ Version: 0.7.5
4
4
  Summary: `nhs data validation engine` is a framework used to validate data
5
5
  License-Expression: MIT
6
6
  License-File: LICENSE
@@ -24,7 +24,7 @@ Issues = "https://github.com/NHSDigital/data-validation-engine/issues"
24
24
  Changelog = "https://github.com/NHSDigital/data-validation-engine/blob/main/CHANGELOG.md"
25
25
 
26
26
  [tool.poetry]
27
- version = "0.7.3"
27
+ version = "0.7.5"
28
28
  packages = [
29
29
  { include = "dve", from = "src" },
30
30
  ]
@@ -3,7 +3,7 @@
3
3
  import logging
4
4
  import warnings
5
5
  from abc import ABC, abstractmethod
6
- from collections.abc import Mapping, MutableMapping
6
+ from collections.abc import MutableMapping
7
7
  from typing import Any, ClassVar, Generic, Optional
8
8
 
9
9
  from pyspark.sql import DataFrame, SparkSession
@@ -41,14 +41,12 @@ class BaseBackend(Generic[EntityType], ABC):
41
41
  self,
42
42
  contract: BaseDataContract[EntityType],
43
43
  steps: BaseStepImplementations[EntityType],
44
- reference_data_loader_type: Optional[type[BaseRefDataLoader[EntityType]]],
45
44
  logger: Optional[logging.Logger] = None,
46
45
  **kwargs: Any,
47
46
  ) -> None:
48
47
  for component_name, component in (
49
48
  ("Contract", contract),
50
49
  ("Step implementation", steps),
51
- ("Reference data loader", reference_data_loader_type),
52
50
  ):
53
51
  component_entity_type = getattr(component, "__entity_type__", None)
54
52
  if component_entity_type != self.__entity_type__:
@@ -61,12 +59,6 @@ class BaseBackend(Generic[EntityType], ABC):
61
59
  """The data contract implementation used by the backend."""
62
60
  self.step_implementations = steps
63
61
  """The step implementations used by the backend."""
64
- self.reference_data_loader_type = reference_data_loader_type
65
- """
66
- The loader type to use for the reference data. If `None`, do not
67
- load any reference data and error if it is provided.
68
-
69
- """
70
62
  self.logger = logger or get_logger(type(self).__name__)
71
63
  """The `logging.Logger instance for the backend."""
72
64
 
@@ -74,29 +66,9 @@ class BaseBackend(Generic[EntityType], ABC):
74
66
  self,
75
67
  reference_entity_config: dict[EntityName, ReferenceConfigUnion],
76
68
  submission_info: Optional[SubmissionInfo],
77
- ) -> Mapping[EntityName, EntityType]:
78
- """Load the reference data as specified in the reference entity config."""
79
- sub_info_entity: Optional[EntityType] = None
80
- if submission_info:
81
- sub_info_entity = self.convert_submission_info(submission_info)
82
-
83
- if self.reference_data_loader_type is None:
84
- if reference_entity_config:
85
- raise ValueError(
86
- "Reference data has been specified but no reference data loader is "
87
- + "configured for this backend"
88
- )
89
-
90
- reference_data_dict = {}
91
- if sub_info_entity is not None:
92
- reference_data_dict["dve_submission_info"] = sub_info_entity
93
- return reference_data_dict
94
-
95
- reference_data_loader = self.reference_data_loader_type(reference_entity_config)
96
- if sub_info_entity is not None:
97
- reference_data_loader.entity_cache["dve_submission_info"] = sub_info_entity
98
-
99
- return reference_data_loader
69
+ ) -> BaseRefDataLoader[EntityType]:
70
+ """Supply configured reference data loader for use with business rules"""
71
+ raise NotImplementedError()
100
72
 
101
73
  @abstractmethod
102
74
  def convert_submission_info(self, submission_info: SubmissionInfo) -> EntityType:
@@ -11,6 +11,7 @@ import dve.parser.file_handling as fh
11
11
  from dve.core_engine.backends.base.core import get_entity_type
12
12
  from dve.core_engine.backends.exceptions import (
13
13
  MissingRefDataEntity,
14
+ NoRefDataConfigSupplied,
14
15
  RefdataLacksFileExtensionSupport,
15
16
  )
16
17
  from dve.core_engine.backends.types import EntityType
@@ -147,11 +148,11 @@ class BaseRefDataLoader(Generic[EntityType], Mapping[EntityName, EntityType], AB
147
148
  # pylint: disable=unused-argument
148
149
  def __init__(
149
150
  self,
150
- reference_entity_config: dict[EntityName, ReferenceConfig],
151
- dataset_config_uri: Optional[URI] = None,
151
+ reference_data_config: dict[EntityName, ReferenceConfig],
152
+ dataset_config_uri: URI,
152
153
  **kwargs,
153
154
  ) -> None:
154
- self.reference_entity_config = reference_entity_config
155
+ self.reference_entity_config = reference_data_config
155
156
  self.dataset_config_uri = dataset_config_uri
156
157
  """
157
158
  Configuration options for the reference data. This is likely to vary
@@ -207,6 +208,8 @@ class BaseRefDataLoader(Generic[EntityType], Mapping[EntityName, EntityType], AB
207
208
  try:
208
209
  config = self.reference_entity_config[key]
209
210
  return self.load_entity(entity_name=key, config=config)
211
+ except TypeError as err:
212
+ raise NoRefDataConfigSupplied() from err
210
213
  except Exception as err:
211
214
  raise MissingRefDataEntity(entity_name=key) from err
212
215
 
@@ -119,6 +119,20 @@ class MissingRefDataEntity(MissingEntity, BackendErrorMixin): # pylint: disable
119
119
  return f"Missing reference data entity {self.entity_name!r}"
120
120
 
121
121
 
122
+ class NoRefDataConfigSupplied(BackendError):
123
+ """An error raised when trying to load a refdata entity when no refdata
124
+ config has been supplied.
125
+
126
+ """
127
+
128
+ def __init__(self, *args: object) -> None:
129
+ super().__init__(*args)
130
+
131
+ def get_message_preamble(self) -> EntityName:
132
+ """Message for logging purposes"""
133
+ return "Refdata loader not supplied with refdata config - unable to load refdata entities"
134
+
135
+
122
136
  class ConstraintError(ValueError, BackendErrorMixin):
123
137
  """Raised when a given constraint is violated."""
124
138
 
@@ -411,7 +411,7 @@ def get_duckdb_cast_statement_from_annotation(
411
411
  stmt = rf"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{date_regex}') THEN TRY_CAST(TRIM({quoted_name}) as DATE) ELSE NULL END" # pylint: disable=C0301
412
412
  return stmt
413
413
  if issubclass(type_, time):
414
- stmt = rf"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{time_regex}') THEN TRY_CAST(TRIM({quoted_name}) as TIME) ELSE NULL END" # pylint: disable=C0301
414
+ stmt = rf"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{time_regex}') THEN TRY_CAST(TRIM({quoted_name}) as TIME) ELSE NULL END" # pylint: disable=C0301
415
415
  return stmt
416
416
  duck_type = get_duckdb_type_from_annotation(type_)
417
417
  if duck_type:
@@ -1,13 +1,11 @@
1
1
  """A reference data loader for duckdb."""
2
2
 
3
- from typing import Optional
4
-
5
3
  from duckdb import DuckDBPyConnection, DuckDBPyRelation
6
4
  from pyarrow import ipc # type: ignore
7
5
 
8
6
  from dve.core_engine.backends.base.reference_data import (
9
7
  BaseRefDataLoader,
10
- ReferenceConfigUnion,
8
+ ReferenceConfig,
11
9
  ReferenceTable,
12
10
  mark_refdata_file_extension,
13
11
  )
@@ -19,17 +17,16 @@ from dve.parser.type_hints import URI
19
17
  class DuckDBRefDataLoader(BaseRefDataLoader[DuckDBPyRelation]):
20
18
  """A reference data loader using already existing DuckDB tables."""
21
19
 
22
- connection: DuckDBPyConnection
23
- """The DuckDB connection for the backend."""
24
- dataset_config_uri: Optional[URI] = None
25
- """The location of the dischema file"""
26
-
27
20
  def __init__(
28
21
  self,
29
- reference_entity_config: dict[EntityName, ReferenceConfigUnion],
22
+ connection: DuckDBPyConnection,
23
+ reference_data_config: dict[EntityName, ReferenceConfig],
24
+ dataset_config_uri: URI,
30
25
  **kwargs,
31
26
  ) -> None:
32
- super().__init__(reference_entity_config, self.dataset_config_uri, **kwargs)
27
+ super().__init__(reference_data_config, dataset_config_uri, **kwargs)
28
+
29
+ self.connection = connection
33
30
 
34
31
  if not self.connection:
35
32
  raise AttributeError("DuckDBConnection must be specified")
@@ -6,6 +6,7 @@ from typing import Any, Optional
6
6
  from pyspark.sql import DataFrame, SparkSession
7
7
 
8
8
  from dve.core_engine.backends.base.backend import BaseBackend
9
+ from dve.core_engine.backends.base.reference_data import ReferenceConfigUnion
9
10
  from dve.core_engine.backends.implementations.spark.contract import SparkDataContract
10
11
  from dve.core_engine.backends.implementations.spark.reference_data import SparkRefDataLoader
11
12
  from dve.core_engine.backends.implementations.spark.rules import SparkStepImplementations
@@ -14,7 +15,7 @@ from dve.core_engine.backends.implementations.spark.types import SparkEntities
14
15
  from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME
15
16
  from dve.core_engine.loggers import get_child_logger, get_logger
16
17
  from dve.core_engine.models import SubmissionInfo
17
- from dve.core_engine.type_hints import URI, EntityParquetLocations
18
+ from dve.core_engine.type_hints import URI, EntityName, EntityParquetLocations
18
19
  from dve.parser.file_handling import get_resource_exists, joinuri
19
20
 
20
21
 
@@ -26,7 +27,6 @@ class SparkBackend(BaseBackend[DataFrame]):
26
27
  dataset_config_uri: Optional[URI] = None,
27
28
  contract: Optional[SparkDataContract] = None,
28
29
  steps: Optional[SparkStepImplementations] = None,
29
- reference_data_loader: Optional[type[SparkRefDataLoader]] = None,
30
30
  logger: Optional[logging.Logger] = None,
31
31
  spark_session: Optional[SparkSession] = None,
32
32
  **kwargs: Any,
@@ -36,6 +36,8 @@ class SparkBackend(BaseBackend[DataFrame]):
36
36
 
37
37
  self.spark_session = spark_session or SparkSession.builder.getOrCreate()
38
38
  """The Spark session for the backend."""
39
+ self.dataset_config_uri = dataset_config_uri
40
+ """The uri of the dischema specifying the DVE config"""
39
41
 
40
42
  if contract is None:
41
43
  contract = SparkDataContract(
@@ -46,11 +48,27 @@ class SparkBackend(BaseBackend[DataFrame]):
46
48
  steps = SparkStepImplementations.register_udfs(
47
49
  logger=get_child_logger("SparkStepImplementations", logger)
48
50
  )
49
- if reference_data_loader is None:
50
- reference_data_loader = SparkRefDataLoader
51
- reference_data_loader.spark = self.spark_session
52
- reference_data_loader.dataset_config_uri = dataset_config_uri
53
- super().__init__(contract, steps, reference_data_loader, logger, **kwargs)
51
+ super().__init__(contract, steps, logger, **kwargs)
52
+
53
+ def load_reference_data(
54
+ self,
55
+ reference_entity_config: dict[EntityName, ReferenceConfigUnion],
56
+ submission_info: Optional[SubmissionInfo],
57
+ ):
58
+ """Load the reference data as specified in the reference entity config."""
59
+ sub_info_entity: Optional[DataFrame] = None
60
+ if submission_info:
61
+ sub_info_entity = self.convert_submission_info(submission_info)
62
+
63
+ reference_data_loader = SparkRefDataLoader(
64
+ spark=self.spark_session,
65
+ reference_data_config=reference_entity_config,
66
+ dataset_config_uri=self.dataset_config_uri, # type: ignore
67
+ )
68
+ if sub_info_entity is not None:
69
+ reference_data_loader.entity_cache["dve_submission_info"] = sub_info_entity
70
+
71
+ return reference_data_loader
54
72
 
55
73
  def write_entities_to_parquet(
56
74
  self, entities: SparkEntities, cache_prefix: URI
@@ -1,8 +1,6 @@
1
1
  # pylint: disable=no-member
2
2
  """A reference data loader for Spark."""
3
3
 
4
- from typing import Optional
5
-
6
4
  from pyspark.sql import DataFrame, SparkSession
7
5
 
8
6
  from dve.core_engine.backends.base.reference_data import (
@@ -19,17 +17,15 @@ from dve.parser.type_hints import URI
19
17
  class SparkRefDataLoader(BaseRefDataLoader[DataFrame]):
20
18
  """A reference data loader using already existing Apache Spark Tables."""
21
19
 
22
- spark: SparkSession
23
- """The Spark session for the backend."""
24
- dataset_config_uri: Optional[URI] = None
25
- """The location of the dischema file defining business rules"""
26
-
27
20
  def __init__(
28
21
  self,
29
- reference_entity_config: dict[EntityName, ReferenceConfig],
22
+ spark: SparkSession,
23
+ reference_data_config: dict[EntityName, ReferenceConfig],
24
+ dataset_config_uri: URI,
30
25
  **kwargs,
31
26
  ) -> None:
32
- super().__init__(reference_entity_config, self.dataset_config_uri, **kwargs)
27
+ super().__init__(reference_data_config, dataset_config_uri, **kwargs)
28
+ self.spark = spark
33
29
  if not self.spark:
34
30
  raise AttributeError("Spark session must be provided")
35
31
 
@@ -5,10 +5,12 @@ from typing import Optional
5
5
 
6
6
  from duckdb import DuckDBPyConnection, DuckDBPyRelation
7
7
 
8
- from dve.core_engine.backends.base.reference_data import BaseRefDataLoader
8
+ import dve.parser.file_handling as fh
9
+ from dve.core_engine.backends.base.reference_data import ReferenceConfig
9
10
  from dve.core_engine.backends.implementations.duckdb.auditing import DDBAuditingManager
10
11
  from dve.core_engine.backends.implementations.duckdb.contract import DuckDBDataContract
11
12
  from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import duckdb_get_entity_count
13
+ from dve.core_engine.backends.implementations.duckdb.reference_data import DuckDBRefDataLoader
12
14
  from dve.core_engine.backends.implementations.duckdb.rules import DuckDBStepImplementations
13
15
  from dve.core_engine.models import SubmissionInfo
14
16
  from dve.core_engine.type_hints import URI
@@ -30,7 +32,6 @@ class DDBDVEPipeline(BaseDVEPipeline):
30
32
  connection: DuckDBPyConnection,
31
33
  rules_path: Optional[URI],
32
34
  submitted_files_path: Optional[URI],
33
- reference_data_loader: Optional[type[BaseRefDataLoader]] = None,
34
35
  job_run_id: Optional[int] = None,
35
36
  logger: Optional[logging.Logger] = None,
36
37
  ):
@@ -42,11 +43,20 @@ class DDBDVEPipeline(BaseDVEPipeline):
42
43
  DuckDBStepImplementations.register_udfs(connection=self._connection),
43
44
  rules_path,
44
45
  submitted_files_path,
45
- reference_data_loader,
46
46
  job_run_id,
47
47
  logger,
48
48
  )
49
49
 
50
+ def init_reference_data_loader(
51
+ self, reference_data_config: dict[str, ReferenceConfig], **kwargs
52
+ ) -> DuckDBRefDataLoader:
53
+ return DuckDBRefDataLoader(
54
+ connection=self._connection,
55
+ reference_data_config=reference_data_config,
56
+ dataset_config_uri=fh.get_parent(self._rules_path), # type: ignore
57
+ **kwargs
58
+ )
59
+
50
60
  # pylint: disable=arguments-differ
51
61
  def write_file_to_parquet( # type: ignore
52
62
  self, submission_file_uri: URI, submission_info: SubmissionInfo, output: URI
@@ -152,6 +152,9 @@ class FoundryDDBPipeline(DDBDVEPipeline):
152
152
  )
153
153
  if sub_stats:
154
154
  self._audit_tables.add_submission_statistics_records(sub_stats=[sub_stats])
155
+ else:
156
+ self._audit_tables.mark_failed(submissions=[sub_id])
157
+
155
158
  except Exception as err: # pylint: disable=W0718
156
159
  self._logger.exception(
157
160
  f"During processing of submission_id: {sub_id}, this exception was raised:"
@@ -26,7 +26,7 @@ from dve.common.error_utils import (
26
26
  from dve.core_engine.backends.base.auditing import BaseAuditingManager
27
27
  from dve.core_engine.backends.base.contract import BaseDataContract
28
28
  from dve.core_engine.backends.base.core import EntityManager
29
- from dve.core_engine.backends.base.reference_data import BaseRefDataLoader
29
+ from dve.core_engine.backends.base.reference_data import BaseRefDataLoader, ReferenceConfig
30
30
  from dve.core_engine.backends.base.rules import BaseStepImplementations
31
31
  from dve.core_engine.backends.exceptions import MessageBearingError
32
32
  from dve.core_engine.backends.readers import BaseFileReader
@@ -36,7 +36,7 @@ from dve.core_engine.exceptions import CriticalProcessingError
36
36
  from dve.core_engine.loggers import get_logger
37
37
  from dve.core_engine.message import FeedbackMessage
38
38
  from dve.core_engine.models import SubmissionInfo, SubmissionStatisticsRecord
39
- from dve.core_engine.type_hints import URI, DVEStageName, FileURI, InfoURI
39
+ from dve.core_engine.type_hints import URI, DVEStageName, EntityName, FileURI, InfoURI
40
40
  from dve.parser import file_handling as fh
41
41
  from dve.parser.file_handling.implementations.file import LocalFilesystemImplementation
42
42
  from dve.parser.file_handling.service import _get_implementation
@@ -62,14 +62,12 @@ class BaseDVEPipeline:
62
62
  step_implementations: Optional[BaseStepImplementations[EntityType]],
63
63
  rules_path: Optional[URI],
64
64
  submitted_files_path: Optional[URI],
65
- reference_data_loader: Optional[type[BaseRefDataLoader]] = None,
66
65
  job_run_id: Optional[int] = None,
67
66
  logger: Optional[logging.Logger] = None,
68
67
  ):
69
68
  self._submitted_files_path = submitted_files_path
70
69
  self._processed_files_path = processed_files_path
71
70
  self._rules_path = rules_path
72
- self._reference_data_loader = reference_data_loader
73
71
  self._job_run_id = job_run_id
74
72
  self._audit_tables = audit_tables
75
73
  self._data_contract = data_contract
@@ -114,6 +112,12 @@ class BaseDVEPipeline:
114
112
  """Get a row count of an entity stored as parquet"""
115
113
  raise NotImplementedError()
116
114
 
115
+ def init_reference_data_loader(
116
+ self, reference_data_config: dict[EntityName, ReferenceConfig], **kwargs
117
+ ) -> BaseRefDataLoader:
118
+ """Get reference data loader if required for business rules"""
119
+ raise NotImplementedError()
120
+
117
121
  def get_submission_status(
118
122
  self, step_name: DVEStageName, submission_id: str
119
123
  ) -> SubmissionStatus:
@@ -527,7 +531,7 @@ class BaseDVEPipeline:
527
531
 
528
532
  return processed_files, failed_processing
529
533
 
530
- def apply_business_rules(
534
+ def apply_business_rules( # pylint: disable=R0914
531
535
  self, submission_info: SubmissionInfo, submission_status: Optional[SubmissionStatus] = None
532
536
  ) -> tuple[SubmissionInfo, SubmissionStatus]:
533
537
  """Apply the business rules to a given submission, the submission may have failed at the
@@ -542,9 +546,6 @@ class BaseDVEPipeline:
542
546
  if not self.rules_path:
543
547
  raise AttributeError("business rules path not provided.")
544
548
 
545
- if not self._reference_data_loader:
546
- raise AttributeError("reference data loader not provided.")
547
-
548
549
  if not self.processed_files_path:
549
550
  raise AttributeError("processed files path has not been provided.")
550
551
 
@@ -556,8 +557,10 @@ class BaseDVEPipeline:
556
557
  self._processed_files_path, submission_info.submission_id
557
558
  )
558
559
  ref_data = config.get_reference_data_config()
560
+ reference_data: BaseRefDataLoader = self.init_reference_data_loader(
561
+ reference_data_config=ref_data
562
+ )
559
563
  rules = config.get_rule_metadata()
560
- reference_data = self._reference_data_loader(ref_data) # type: ignore
561
564
  entities = {}
562
565
  contract = fh.joinuri(
563
566
  self.processed_files_path, submission_info.submission_id, "data_contract"
@@ -581,15 +584,20 @@ class BaseDVEPipeline:
581
584
 
582
585
  key_fields = {model: conf.reporting_fields for model, conf in model_config.items()}
583
586
 
584
- self.step_implementations.apply_rules(working_directory, entity_manager, rules, key_fields) # type: ignore
587
+ _errors_uri, rules_success = self.step_implementations.apply_rules( # type: ignore
588
+ working_directory, entity_manager, rules, key_fields
589
+ )
585
590
 
586
591
  rule_messages = load_feedback_messages(
587
592
  get_feedback_errors_uri(working_directory, "business_rules")
588
593
  )
589
- submission_status.validation_failed = (
594
+ if (
590
595
  any(not rule_message.is_informational for rule_message in rule_messages)
591
596
  or submission_status.validation_failed
592
- )
597
+ ):
598
+ submission_status.validation_failed = True
599
+ elif not rules_success:
600
+ submission_status.processing_failed = True
593
601
 
594
602
  for entity_name, entity in entity_manager.entities.items():
595
603
  projected = self._step_implementations.write_parquet( # type: ignore
@@ -6,9 +6,11 @@ from typing import Optional
6
6
 
7
7
  from pyspark.sql import DataFrame, SparkSession
8
8
 
9
- from dve.core_engine.backends.base.reference_data import BaseRefDataLoader
9
+ import dve.parser.file_handling as fh
10
+ from dve.core_engine.backends.base.reference_data import ReferenceConfig
10
11
  from dve.core_engine.backends.implementations.spark.auditing import SparkAuditingManager
11
12
  from dve.core_engine.backends.implementations.spark.contract import SparkDataContract
13
+ from dve.core_engine.backends.implementations.spark.reference_data import SparkRefDataLoader
12
14
  from dve.core_engine.backends.implementations.spark.rules import SparkStepImplementations
13
15
  from dve.core_engine.backends.implementations.spark.spark_helpers import spark_get_entity_count
14
16
  from dve.core_engine.models import SubmissionInfo
@@ -31,7 +33,6 @@ class SparkDVEPipeline(BaseDVEPipeline):
31
33
  audit_tables: SparkAuditingManager,
32
34
  rules_path: Optional[URI],
33
35
  submitted_files_path: Optional[URI],
34
- reference_data_loader: Optional[type[BaseRefDataLoader]] = None,
35
36
  spark: Optional[SparkSession] = None,
36
37
  job_run_id: Optional[int] = None,
37
38
  logger: Optional[logging.Logger] = None,
@@ -44,11 +45,20 @@ class SparkDVEPipeline(BaseDVEPipeline):
44
45
  SparkStepImplementations.register_udfs(self._spark),
45
46
  rules_path,
46
47
  submitted_files_path,
47
- reference_data_loader,
48
48
  job_run_id,
49
49
  logger,
50
50
  )
51
51
 
52
+ def init_reference_data_loader(
53
+ self, reference_data_config: dict[str, ReferenceConfig], **kwargs
54
+ ) -> SparkRefDataLoader:
55
+ return SparkRefDataLoader(
56
+ spark=self._spark,
57
+ reference_data_config=reference_data_config,
58
+ dataset_config_uri=fh.get_parent(self._rules_path), # type: ignore
59
+ **kwargs
60
+ )
61
+
52
62
  # pylint: disable=arguments-differ
53
63
  def write_file_to_parquet( # type: ignore
54
64
  self, submission_file_uri: URI, submission_info: SubmissionInfo, output: URI