data-validation-engine 0.7.4__tar.gz → 0.7.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/PKG-INFO +1 -1
  2. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/pyproject.toml +1 -1
  3. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/base/backend.py +4 -32
  4. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/base/reference_data.py +6 -3
  5. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/exceptions.py +14 -0
  6. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +1 -1
  7. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/readers/csv.py +3 -9
  8. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/readers/json.py +7 -2
  9. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/readers/xml.py +5 -4
  10. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/reference_data.py +7 -10
  11. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/backend.py +25 -7
  12. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/reference_data.py +5 -9
  13. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/pipeline/duckdb_pipeline.py +14 -3
  14. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/pipeline/pipeline.py +26 -15
  15. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/pipeline/spark_pipeline.py +13 -3
  16. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/pipeline/utils.py +10 -3
  17. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/LICENSE +0 -0
  18. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/README.md +0 -0
  19. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/__init__.py +0 -0
  20. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/common/__init__.py +0 -0
  21. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/common/error_utils.py +0 -0
  22. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/__init__.py +0 -0
  23. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/__init__.py +0 -0
  24. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/base/__init__.py +0 -0
  25. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/base/auditing.py +0 -0
  26. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/base/contract.py +0 -0
  27. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/base/core.py +0 -0
  28. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/base/reader.py +0 -0
  29. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/base/rules.py +0 -0
  30. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/base/utilities.py +0 -0
  31. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/__init__.py +0 -0
  32. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/__init__.py +0 -0
  33. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/auditing.py +0 -0
  34. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/contract.py +0 -0
  35. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/readers/__init__.py +0 -0
  36. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/rules.py +0 -0
  37. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/types.py +0 -0
  38. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/duckdb/utilities.py +0 -0
  39. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/__init__.py +0 -0
  40. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/auditing.py +0 -0
  41. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/contract.py +0 -0
  42. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/readers/__init__.py +0 -0
  43. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/readers/csv.py +0 -0
  44. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/readers/json.py +0 -0
  45. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/readers/xml.py +0 -0
  46. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/rules.py +0 -0
  47. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/spark_helpers.py +0 -0
  48. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/types.py +0 -0
  49. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/implementations/spark/utilities.py +0 -0
  50. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/metadata/__init__.py +0 -0
  51. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/metadata/contract.py +0 -0
  52. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/metadata/reporting.py +0 -0
  53. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/metadata/rules.py +0 -0
  54. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/readers/__init__.py +0 -0
  55. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/readers/csv.py +0 -0
  56. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/readers/utilities.py +0 -0
  57. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/readers/xml.py +0 -0
  58. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/readers/xml_linting.py +0 -0
  59. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/types.py +0 -0
  60. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/backends/utilities.py +0 -0
  61. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/configuration/__init__.py +0 -0
  62. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/configuration/base.py +0 -0
  63. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/configuration/v1/__init__.py +0 -0
  64. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/configuration/v1/filters.py +0 -0
  65. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/configuration/v1/rule_stores/__init__.py +0 -0
  66. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/configuration/v1/rule_stores/models.py +0 -0
  67. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/configuration/v1/steps.py +0 -0
  68. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/constants.py +0 -0
  69. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/engine.py +0 -0
  70. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/exceptions.py +0 -0
  71. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/functions/__init__.py +0 -0
  72. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/functions/implementations.py +0 -0
  73. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/loggers.py +0 -0
  74. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/message.py +0 -0
  75. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/models.py +0 -0
  76. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/templating.py +0 -0
  77. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/type_hints.py +0 -0
  78. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/core_engine/validation.py +0 -0
  79. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/metadata_parser/__init__.py +0 -0
  80. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/metadata_parser/domain_types.py +0 -0
  81. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/metadata_parser/exc.py +0 -0
  82. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/metadata_parser/function_library.py +0 -0
  83. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/metadata_parser/function_wrapper.py +0 -0
  84. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/metadata_parser/model_generator.py +0 -0
  85. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/metadata_parser/models.py +0 -0
  86. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/metadata_parser/utilities.py +0 -0
  87. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/__init__.py +0 -0
  88. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/exceptions.py +0 -0
  89. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/file_handling/__init__.py +0 -0
  90. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/file_handling/helpers.py +0 -0
  91. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/file_handling/implementations/__init__.py +0 -0
  92. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/file_handling/implementations/base.py +0 -0
  93. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/file_handling/implementations/dbfs.py +0 -0
  94. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/file_handling/implementations/file.py +0 -0
  95. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/file_handling/implementations/s3.py +0 -0
  96. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/file_handling/log_handler.py +0 -0
  97. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/file_handling/service.py +0 -0
  98. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/file_handling/utilities.py +0 -0
  99. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/type_hints.py +0 -0
  100. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/parser/utilities.py +0 -0
  101. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/pipeline/__init__.py +0 -0
  102. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/pipeline/foundry_ddb_pipeline.py +0 -0
  103. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/reporting/__init__.py +0 -0
  104. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/reporting/error_report.py +0 -0
  105. {data_validation_engine-0.7.4 → data_validation_engine-0.7.6}/src/dve/reporting/excel_report.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-validation-engine
3
- Version: 0.7.4
3
+ Version: 0.7.6
4
4
  Summary: `nhs data validation engine` is a framework used to validate data
5
5
  License-Expression: MIT
6
6
  License-File: LICENSE
@@ -24,7 +24,7 @@ Issues = "https://github.com/NHSDigital/data-validation-engine/issues"
24
24
  Changelog = "https://github.com/NHSDigital/data-validation-engine/blob/main/CHANGELOG.md"
25
25
 
26
26
  [tool.poetry]
27
- version = "0.7.4"
27
+ version = "0.7.6"
28
28
  packages = [
29
29
  { include = "dve", from = "src" },
30
30
  ]
@@ -3,7 +3,7 @@
3
3
  import logging
4
4
  import warnings
5
5
  from abc import ABC, abstractmethod
6
- from collections.abc import Mapping, MutableMapping
6
+ from collections.abc import MutableMapping
7
7
  from typing import Any, ClassVar, Generic, Optional
8
8
 
9
9
  from pyspark.sql import DataFrame, SparkSession
@@ -41,14 +41,12 @@ class BaseBackend(Generic[EntityType], ABC):
41
41
  self,
42
42
  contract: BaseDataContract[EntityType],
43
43
  steps: BaseStepImplementations[EntityType],
44
- reference_data_loader_type: Optional[type[BaseRefDataLoader[EntityType]]],
45
44
  logger: Optional[logging.Logger] = None,
46
45
  **kwargs: Any,
47
46
  ) -> None:
48
47
  for component_name, component in (
49
48
  ("Contract", contract),
50
49
  ("Step implementation", steps),
51
- ("Reference data loader", reference_data_loader_type),
52
50
  ):
53
51
  component_entity_type = getattr(component, "__entity_type__", None)
54
52
  if component_entity_type != self.__entity_type__:
@@ -61,12 +59,6 @@ class BaseBackend(Generic[EntityType], ABC):
61
59
  """The data contract implementation used by the backend."""
62
60
  self.step_implementations = steps
63
61
  """The step implementations used by the backend."""
64
- self.reference_data_loader_type = reference_data_loader_type
65
- """
66
- The loader type to use for the reference data. If `None`, do not
67
- load any reference data and error if it is provided.
68
-
69
- """
70
62
  self.logger = logger or get_logger(type(self).__name__)
71
63
  """The `logging.Logger instance for the backend."""
72
64
 
@@ -74,29 +66,9 @@ class BaseBackend(Generic[EntityType], ABC):
74
66
  self,
75
67
  reference_entity_config: dict[EntityName, ReferenceConfigUnion],
76
68
  submission_info: Optional[SubmissionInfo],
77
- ) -> Mapping[EntityName, EntityType]:
78
- """Load the reference data as specified in the reference entity config."""
79
- sub_info_entity: Optional[EntityType] = None
80
- if submission_info:
81
- sub_info_entity = self.convert_submission_info(submission_info)
82
-
83
- if self.reference_data_loader_type is None:
84
- if reference_entity_config:
85
- raise ValueError(
86
- "Reference data has been specified but no reference data loader is "
87
- + "configured for this backend"
88
- )
89
-
90
- reference_data_dict = {}
91
- if sub_info_entity is not None:
92
- reference_data_dict["dve_submission_info"] = sub_info_entity
93
- return reference_data_dict
94
-
95
- reference_data_loader = self.reference_data_loader_type(reference_entity_config)
96
- if sub_info_entity is not None:
97
- reference_data_loader.entity_cache["dve_submission_info"] = sub_info_entity
98
-
99
- return reference_data_loader
69
+ ) -> BaseRefDataLoader[EntityType]:
70
+ """Supply configured reference data loader for use with business rules"""
71
+ raise NotImplementedError()
100
72
 
101
73
  @abstractmethod
102
74
  def convert_submission_info(self, submission_info: SubmissionInfo) -> EntityType:
@@ -11,6 +11,7 @@ import dve.parser.file_handling as fh
11
11
  from dve.core_engine.backends.base.core import get_entity_type
12
12
  from dve.core_engine.backends.exceptions import (
13
13
  MissingRefDataEntity,
14
+ NoRefDataConfigSupplied,
14
15
  RefdataLacksFileExtensionSupport,
15
16
  )
16
17
  from dve.core_engine.backends.types import EntityType
@@ -147,11 +148,11 @@ class BaseRefDataLoader(Generic[EntityType], Mapping[EntityName, EntityType], AB
147
148
  # pylint: disable=unused-argument
148
149
  def __init__(
149
150
  self,
150
- reference_entity_config: dict[EntityName, ReferenceConfig],
151
- dataset_config_uri: Optional[URI] = None,
151
+ reference_data_config: dict[EntityName, ReferenceConfig],
152
+ dataset_config_uri: URI,
152
153
  **kwargs,
153
154
  ) -> None:
154
- self.reference_entity_config = reference_entity_config
155
+ self.reference_entity_config = reference_data_config
155
156
  self.dataset_config_uri = dataset_config_uri
156
157
  """
157
158
  Configuration options for the reference data. This is likely to vary
@@ -207,6 +208,8 @@ class BaseRefDataLoader(Generic[EntityType], Mapping[EntityName, EntityType], AB
207
208
  try:
208
209
  config = self.reference_entity_config[key]
209
210
  return self.load_entity(entity_name=key, config=config)
211
+ except TypeError as err:
212
+ raise NoRefDataConfigSupplied() from err
210
213
  except Exception as err:
211
214
  raise MissingRefDataEntity(entity_name=key) from err
212
215
 
@@ -119,6 +119,20 @@ class MissingRefDataEntity(MissingEntity, BackendErrorMixin): # pylint: disable
119
119
  return f"Missing reference data entity {self.entity_name!r}"
120
120
 
121
121
 
122
+ class NoRefDataConfigSupplied(BackendError):
123
+ """An error raised when trying to load a refdata entity when no refdata
124
+ config has been supplied.
125
+
126
+ """
127
+
128
+ def __init__(self, *args: object) -> None:
129
+ super().__init__(*args)
130
+
131
+ def get_message_preamble(self) -> EntityName:
132
+ """Message for logging purposes"""
133
+ return "Refdata loader not supplied with refdata config - unable to load refdata entities"
134
+
135
+
122
136
  class ConstraintError(ValueError, BackendErrorMixin):
123
137
  """Raised when a given constraint is violated."""
124
138
 
@@ -411,7 +411,7 @@ def get_duckdb_cast_statement_from_annotation(
411
411
  stmt = rf"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{date_regex}') THEN TRY_CAST(TRIM({quoted_name}) as DATE) ELSE NULL END" # pylint: disable=C0301
412
412
  return stmt
413
413
  if issubclass(type_, time):
414
- stmt = rf"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{time_regex}') THEN TRY_CAST(TRIM({quoted_name}) as TIME) ELSE NULL END" # pylint: disable=C0301
414
+ stmt = rf"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{time_regex}') THEN TRY_CAST(TRIM({quoted_name}) as TIME) ELSE NULL END" # pylint: disable=C0301
415
415
  return stmt
416
416
  duck_type = get_duckdb_type_from_annotation(type_)
417
417
  if duck_type:
@@ -6,13 +6,7 @@ from typing import Any, Optional
6
6
 
7
7
  import duckdb as ddb
8
8
  import polars as pl
9
- from duckdb import (
10
- DuckDBPyConnection,
11
- DuckDBPyRelation,
12
- StarExpression,
13
- default_connection,
14
- read_csv,
15
- )
9
+ from duckdb import DuckDBPyConnection, DuckDBPyRelation, StarExpression, read_csv
16
10
  from pydantic import BaseModel
17
11
 
18
12
  from dve.core_engine.backends.base.reader import BaseFileReader, read_function
@@ -61,7 +55,7 @@ class DuckDBCSVReader(BaseFileReader):
61
55
  self.header = header
62
56
  self.delim = delim
63
57
  self.quotechar = quotechar
64
- self._connection = connection if connection else default_connection
58
+ self._connection = connection if connection else ddb.connect(":memory:")
65
59
  self.field_check = field_check
66
60
  self.field_check_error_code = field_check_error_code
67
61
  self.field_check_error_message = field_check_error_message
@@ -181,7 +175,7 @@ class PolarsToDuckDBCSVReader(DuckDBCSVReader):
181
175
  ] + [pl.col(RECORD_INDEX_COLUMN_NAME)]
182
176
  df = df.select(pl_exprs)
183
177
 
184
- return ddb.sql("SELECT * FROM df")
178
+ return self._connection.sql("SELECT * FROM df")
185
179
 
186
180
 
187
181
  class DuckDBCSVRepeatingHeaderReader(PolarsToDuckDBCSVReader):
@@ -4,7 +4,8 @@
4
4
  from collections.abc import Iterator
5
5
  from typing import Any, Optional
6
6
 
7
- from duckdb import DuckDBPyRelation, read_json
7
+ import duckdb
8
+ from duckdb import DuckDBPyConnection, DuckDBPyRelation
8
9
  from pydantic import BaseModel
9
10
 
10
11
  from dve.core_engine.backends.base.reader import BaseFileReader, read_function
@@ -26,9 +27,11 @@ class DuckDBJSONReader(BaseFileReader):
26
27
  self,
27
28
  *,
28
29
  json_format: Optional[str] = "array",
30
+ connection: Optional[DuckDBPyConnection] = None,
29
31
  **_,
30
32
  ):
31
33
  self._json_format = json_format
34
+ self._connection = duckdb.connect(":memory:") if not connection else connection
32
35
 
33
36
  super().__init__()
34
37
 
@@ -50,5 +53,7 @@ class DuckDBJSONReader(BaseFileReader):
50
53
  }
51
54
 
52
55
  return self.add_record_index(
53
- read_json(resource, columns=ddb_schema, format=self._json_format) # type: ignore
56
+ self._connection.read_json(
57
+ resource, columns=ddb_schema, format=self._json_format # type: ignore
58
+ )
54
59
  )
@@ -3,8 +3,9 @@
3
3
 
4
4
  from typing import Optional
5
5
 
6
+ import duckdb
6
7
  import polars as pl
7
- from duckdb import DuckDBPyConnection, DuckDBPyRelation, default_connection
8
+ from duckdb import DuckDBPyConnection, DuckDBPyRelation
8
9
  from pydantic import BaseModel
9
10
 
10
11
  from dve.core_engine.backends.base.reader import read_function
@@ -24,8 +25,8 @@ from dve.core_engine.type_hints import URI
24
25
  class DuckDBXMLStreamReader(XMLStreamReader):
25
26
  """A reader for XML files"""
26
27
 
27
- def __init__(self, *, ddb_connection: Optional[DuckDBPyConnection] = None, **kwargs):
28
- self.ddb_connection = ddb_connection if ddb_connection else default_connection
28
+ def __init__(self, *, connection: Optional[DuckDBPyConnection] = None, **kwargs):
29
+ self._connection = connection if connection else duckdb.connect(":memory:")
29
30
  super().__init__(**kwargs)
30
31
 
31
32
  @read_function(DuckDBPyRelation)
@@ -49,4 +50,4 @@ class DuckDBXMLStreamReader(XMLStreamReader):
49
50
  data=self.read_to_py_iterator(resource, entity_name, schema), schema=polars_schema
50
51
  )
51
52
  )
52
- return self.ddb_connection.sql("select * from _lazy_frame")
53
+ return self._connection.sql("select * from _lazy_frame")
@@ -1,13 +1,11 @@
1
1
  """A reference data loader for duckdb."""
2
2
 
3
- from typing import Optional
4
-
5
3
  from duckdb import DuckDBPyConnection, DuckDBPyRelation
6
4
  from pyarrow import ipc # type: ignore
7
5
 
8
6
  from dve.core_engine.backends.base.reference_data import (
9
7
  BaseRefDataLoader,
10
- ReferenceConfigUnion,
8
+ ReferenceConfig,
11
9
  ReferenceTable,
12
10
  mark_refdata_file_extension,
13
11
  )
@@ -19,17 +17,16 @@ from dve.parser.type_hints import URI
19
17
  class DuckDBRefDataLoader(BaseRefDataLoader[DuckDBPyRelation]):
20
18
  """A reference data loader using already existing DuckDB tables."""
21
19
 
22
- connection: DuckDBPyConnection
23
- """The DuckDB connection for the backend."""
24
- dataset_config_uri: Optional[URI] = None
25
- """The location of the dischema file"""
26
-
27
20
  def __init__(
28
21
  self,
29
- reference_entity_config: dict[EntityName, ReferenceConfigUnion],
22
+ connection: DuckDBPyConnection,
23
+ reference_data_config: dict[EntityName, ReferenceConfig],
24
+ dataset_config_uri: URI,
30
25
  **kwargs,
31
26
  ) -> None:
32
- super().__init__(reference_entity_config, self.dataset_config_uri, **kwargs)
27
+ super().__init__(reference_data_config, dataset_config_uri, **kwargs)
28
+
29
+ self.connection = connection
33
30
 
34
31
  if not self.connection:
35
32
  raise AttributeError("DuckDBConnection must be specified")
@@ -6,6 +6,7 @@ from typing import Any, Optional
6
6
  from pyspark.sql import DataFrame, SparkSession
7
7
 
8
8
  from dve.core_engine.backends.base.backend import BaseBackend
9
+ from dve.core_engine.backends.base.reference_data import ReferenceConfigUnion
9
10
  from dve.core_engine.backends.implementations.spark.contract import SparkDataContract
10
11
  from dve.core_engine.backends.implementations.spark.reference_data import SparkRefDataLoader
11
12
  from dve.core_engine.backends.implementations.spark.rules import SparkStepImplementations
@@ -14,7 +15,7 @@ from dve.core_engine.backends.implementations.spark.types import SparkEntities
14
15
  from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME
15
16
  from dve.core_engine.loggers import get_child_logger, get_logger
16
17
  from dve.core_engine.models import SubmissionInfo
17
- from dve.core_engine.type_hints import URI, EntityParquetLocations
18
+ from dve.core_engine.type_hints import URI, EntityName, EntityParquetLocations
18
19
  from dve.parser.file_handling import get_resource_exists, joinuri
19
20
 
20
21
 
@@ -26,7 +27,6 @@ class SparkBackend(BaseBackend[DataFrame]):
26
27
  dataset_config_uri: Optional[URI] = None,
27
28
  contract: Optional[SparkDataContract] = None,
28
29
  steps: Optional[SparkStepImplementations] = None,
29
- reference_data_loader: Optional[type[SparkRefDataLoader]] = None,
30
30
  logger: Optional[logging.Logger] = None,
31
31
  spark_session: Optional[SparkSession] = None,
32
32
  **kwargs: Any,
@@ -36,6 +36,8 @@ class SparkBackend(BaseBackend[DataFrame]):
36
36
 
37
37
  self.spark_session = spark_session or SparkSession.builder.getOrCreate()
38
38
  """The Spark session for the backend."""
39
+ self.dataset_config_uri = dataset_config_uri
40
+ """The uri of the dischema specifying the DVE config"""
39
41
 
40
42
  if contract is None:
41
43
  contract = SparkDataContract(
@@ -46,11 +48,27 @@ class SparkBackend(BaseBackend[DataFrame]):
46
48
  steps = SparkStepImplementations.register_udfs(
47
49
  logger=get_child_logger("SparkStepImplementations", logger)
48
50
  )
49
- if reference_data_loader is None:
50
- reference_data_loader = SparkRefDataLoader
51
- reference_data_loader.spark = self.spark_session
52
- reference_data_loader.dataset_config_uri = dataset_config_uri
53
- super().__init__(contract, steps, reference_data_loader, logger, **kwargs)
51
+ super().__init__(contract, steps, logger, **kwargs)
52
+
53
+ def load_reference_data(
54
+ self,
55
+ reference_entity_config: dict[EntityName, ReferenceConfigUnion],
56
+ submission_info: Optional[SubmissionInfo],
57
+ ):
58
+ """Load the reference data as specified in the reference entity config."""
59
+ sub_info_entity: Optional[DataFrame] = None
60
+ if submission_info:
61
+ sub_info_entity = self.convert_submission_info(submission_info)
62
+
63
+ reference_data_loader = SparkRefDataLoader(
64
+ spark=self.spark_session,
65
+ reference_data_config=reference_entity_config,
66
+ dataset_config_uri=self.dataset_config_uri, # type: ignore
67
+ )
68
+ if sub_info_entity is not None:
69
+ reference_data_loader.entity_cache["dve_submission_info"] = sub_info_entity
70
+
71
+ return reference_data_loader
54
72
 
55
73
  def write_entities_to_parquet(
56
74
  self, entities: SparkEntities, cache_prefix: URI
@@ -1,8 +1,6 @@
1
1
  # pylint: disable=no-member
2
2
  """A reference data loader for Spark."""
3
3
 
4
- from typing import Optional
5
-
6
4
  from pyspark.sql import DataFrame, SparkSession
7
5
 
8
6
  from dve.core_engine.backends.base.reference_data import (
@@ -19,17 +17,15 @@ from dve.parser.type_hints import URI
19
17
  class SparkRefDataLoader(BaseRefDataLoader[DataFrame]):
20
18
  """A reference data loader using already existing Apache Spark Tables."""
21
19
 
22
- spark: SparkSession
23
- """The Spark session for the backend."""
24
- dataset_config_uri: Optional[URI] = None
25
- """The location of the dischema file defining business rules"""
26
-
27
20
  def __init__(
28
21
  self,
29
- reference_entity_config: dict[EntityName, ReferenceConfig],
22
+ spark: SparkSession,
23
+ reference_data_config: dict[EntityName, ReferenceConfig],
24
+ dataset_config_uri: URI,
30
25
  **kwargs,
31
26
  ) -> None:
32
- super().__init__(reference_entity_config, self.dataset_config_uri, **kwargs)
27
+ super().__init__(reference_data_config, dataset_config_uri, **kwargs)
28
+ self.spark = spark
33
29
  if not self.spark:
34
30
  raise AttributeError("Spark session must be provided")
35
31
 
@@ -5,10 +5,12 @@ from typing import Optional
5
5
 
6
6
  from duckdb import DuckDBPyConnection, DuckDBPyRelation
7
7
 
8
- from dve.core_engine.backends.base.reference_data import BaseRefDataLoader
8
+ import dve.parser.file_handling as fh
9
+ from dve.core_engine.backends.base.reference_data import ReferenceConfig
9
10
  from dve.core_engine.backends.implementations.duckdb.auditing import DDBAuditingManager
10
11
  from dve.core_engine.backends.implementations.duckdb.contract import DuckDBDataContract
11
12
  from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import duckdb_get_entity_count
13
+ from dve.core_engine.backends.implementations.duckdb.reference_data import DuckDBRefDataLoader
12
14
  from dve.core_engine.backends.implementations.duckdb.rules import DuckDBStepImplementations
13
15
  from dve.core_engine.models import SubmissionInfo
14
16
  from dve.core_engine.type_hints import URI
@@ -30,7 +32,6 @@ class DDBDVEPipeline(BaseDVEPipeline):
30
32
  connection: DuckDBPyConnection,
31
33
  rules_path: Optional[URI],
32
34
  submitted_files_path: Optional[URI],
33
- reference_data_loader: Optional[type[BaseRefDataLoader]] = None,
34
35
  job_run_id: Optional[int] = None,
35
36
  logger: Optional[logging.Logger] = None,
36
37
  ):
@@ -42,9 +43,19 @@ class DDBDVEPipeline(BaseDVEPipeline):
42
43
  DuckDBStepImplementations.register_udfs(connection=self._connection),
43
44
  rules_path,
44
45
  submitted_files_path,
45
- reference_data_loader,
46
46
  job_run_id,
47
47
  logger,
48
+ {"connection": self._connection},
49
+ )
50
+
51
+ def init_reference_data_loader(
52
+ self, reference_data_config: dict[str, ReferenceConfig], **kwargs
53
+ ) -> DuckDBRefDataLoader:
54
+ return DuckDBRefDataLoader(
55
+ connection=self._connection,
56
+ reference_data_config=reference_data_config,
57
+ dataset_config_uri=fh.get_parent(self._rules_path), # type: ignore
58
+ **kwargs
48
59
  )
49
60
 
50
61
  # pylint: disable=arguments-differ
@@ -9,7 +9,7 @@ from concurrent.futures import Executor, Future, ThreadPoolExecutor
9
9
  from functools import lru_cache
10
10
  from itertools import starmap
11
11
  from threading import Lock
12
- from typing import Optional, Union
12
+ from typing import Any, Optional, Union
13
13
  from uuid import uuid4
14
14
 
15
15
  import polars as pl
@@ -26,7 +26,7 @@ from dve.common.error_utils import (
26
26
  from dve.core_engine.backends.base.auditing import BaseAuditingManager
27
27
  from dve.core_engine.backends.base.contract import BaseDataContract
28
28
  from dve.core_engine.backends.base.core import EntityManager
29
- from dve.core_engine.backends.base.reference_data import BaseRefDataLoader
29
+ from dve.core_engine.backends.base.reference_data import BaseRefDataLoader, ReferenceConfig
30
30
  from dve.core_engine.backends.base.rules import BaseStepImplementations
31
31
  from dve.core_engine.backends.exceptions import MessageBearingError
32
32
  from dve.core_engine.backends.readers import BaseFileReader
@@ -36,7 +36,7 @@ from dve.core_engine.exceptions import CriticalProcessingError
36
36
  from dve.core_engine.loggers import get_logger
37
37
  from dve.core_engine.message import FeedbackMessage
38
38
  from dve.core_engine.models import SubmissionInfo, SubmissionStatisticsRecord
39
- from dve.core_engine.type_hints import URI, DVEStageName, FileURI, InfoURI
39
+ from dve.core_engine.type_hints import URI, DVEStageName, EntityName, FileURI, InfoURI
40
40
  from dve.parser import file_handling as fh
41
41
  from dve.parser.file_handling.implementations.file import LocalFilesystemImplementation
42
42
  from dve.parser.file_handling.service import _get_implementation
@@ -49,6 +49,7 @@ PERMISSIBLE_EXCEPTIONS: tuple[type[Exception]] = (
49
49
  )
50
50
 
51
51
 
52
+ # pylint: disable=R0904
52
53
  class BaseDVEPipeline:
53
54
  """
54
55
  Base class for running a DVE Pipeline either by a given step or a full e2e process.
@@ -62,14 +63,13 @@ class BaseDVEPipeline:
62
63
  step_implementations: Optional[BaseStepImplementations[EntityType]],
63
64
  rules_path: Optional[URI],
64
65
  submitted_files_path: Optional[URI],
65
- reference_data_loader: Optional[type[BaseRefDataLoader]] = None,
66
66
  job_run_id: Optional[int] = None,
67
67
  logger: Optional[logging.Logger] = None,
68
+ backend_reader_kwargs: Optional[dict[str, Any]] = None,
68
69
  ):
69
70
  self._submitted_files_path = submitted_files_path
70
71
  self._processed_files_path = processed_files_path
71
72
  self._rules_path = rules_path
72
- self._reference_data_loader = reference_data_loader
73
73
  self._job_run_id = job_run_id
74
74
  self._audit_tables = audit_tables
75
75
  self._data_contract = data_contract
@@ -78,6 +78,7 @@ class BaseDVEPipeline:
78
78
  self._summary_lock = Lock()
79
79
  self._rec_tracking_lock = Lock()
80
80
  self._aggregates_lock = Lock()
81
+ self._backend_reader_kwargs = backend_reader_kwargs
81
82
 
82
83
  if self._data_contract:
83
84
  self._data_contract.logger = self._logger
@@ -109,11 +110,23 @@ class BaseDVEPipeline:
109
110
  """The step implementations to apply the business rules to a given dataset"""
110
111
  return self._step_implementations
111
112
 
113
+ @property
114
+ def backend_reader_kwargs(self) -> dict[str, Any] | None:
115
+ """Important required arguments for all readers related to the specific backend
116
+ that can't be specified at time of writing config eg. duckdb connection"""
117
+ return self._backend_reader_kwargs
118
+
112
119
  @staticmethod
113
120
  def get_entity_count(entity: EntityType) -> int:
114
121
  """Get a row count of an entity stored as parquet"""
115
122
  raise NotImplementedError()
116
123
 
124
+ def init_reference_data_loader(
125
+ self, reference_data_config: dict[EntityName, ReferenceConfig], **kwargs
126
+ ) -> BaseRefDataLoader:
127
+ """Get reference data loader if required for business rules"""
128
+ raise NotImplementedError()
129
+
117
130
  def get_submission_status(
118
131
  self, step_name: DVEStageName, submission_id: str
119
132
  ) -> SubmissionStatus:
@@ -199,7 +212,9 @@ class BaseDVEPipeline:
199
212
 
200
213
  for model_name, model in models.items():
201
214
  self._logger.info(f"Transforming {model_name} to stringified parquet")
202
- reader: BaseFileReader = load_reader(dataset, model_name, ext)
215
+ reader: BaseFileReader = load_reader(
216
+ dataset, model_name, ext, self.backend_reader_kwargs
217
+ )
203
218
  try:
204
219
  if not entity_type:
205
220
  reader.write_parquet(
@@ -527,7 +542,7 @@ class BaseDVEPipeline:
527
542
 
528
543
  return processed_files, failed_processing
529
544
 
530
- def apply_business_rules( # pylint: disable=R0914
545
+ def apply_business_rules( # pylint: disable=R0914
531
546
  self, submission_info: SubmissionInfo, submission_status: Optional[SubmissionStatus] = None
532
547
  ) -> tuple[SubmissionInfo, SubmissionStatus]:
533
548
  """Apply the business rules to a given submission, the submission may have failed at the
@@ -542,9 +557,6 @@ class BaseDVEPipeline:
542
557
  if not self.rules_path:
543
558
  raise AttributeError("business rules path not provided.")
544
559
 
545
- if not self._reference_data_loader:
546
- raise AttributeError("reference data loader not provided.")
547
-
548
560
  if not self.processed_files_path:
549
561
  raise AttributeError("processed files path has not been provided.")
550
562
 
@@ -556,8 +568,10 @@ class BaseDVEPipeline:
556
568
  self._processed_files_path, submission_info.submission_id
557
569
  )
558
570
  ref_data = config.get_reference_data_config()
571
+ reference_data: BaseRefDataLoader = self.init_reference_data_loader(
572
+ reference_data_config=ref_data
573
+ )
559
574
  rules = config.get_rule_metadata()
560
- reference_data = self._reference_data_loader(ref_data) # type: ignore
561
575
  entities = {}
562
576
  contract = fh.joinuri(
563
577
  self.processed_files_path, submission_info.submission_id, "data_contract"
@@ -582,10 +596,7 @@ class BaseDVEPipeline:
582
596
  key_fields = {model: conf.reporting_fields for model, conf in model_config.items()}
583
597
 
584
598
  _errors_uri, rules_success = self.step_implementations.apply_rules( # type: ignore
585
- working_directory,
586
- entity_manager,
587
- rules,
588
- key_fields
599
+ working_directory, entity_manager, rules, key_fields
589
600
  )
590
601
 
591
602
  rule_messages = load_feedback_messages(
@@ -6,9 +6,11 @@ from typing import Optional
6
6
 
7
7
  from pyspark.sql import DataFrame, SparkSession
8
8
 
9
- from dve.core_engine.backends.base.reference_data import BaseRefDataLoader
9
+ import dve.parser.file_handling as fh
10
+ from dve.core_engine.backends.base.reference_data import ReferenceConfig
10
11
  from dve.core_engine.backends.implementations.spark.auditing import SparkAuditingManager
11
12
  from dve.core_engine.backends.implementations.spark.contract import SparkDataContract
13
+ from dve.core_engine.backends.implementations.spark.reference_data import SparkRefDataLoader
12
14
  from dve.core_engine.backends.implementations.spark.rules import SparkStepImplementations
13
15
  from dve.core_engine.backends.implementations.spark.spark_helpers import spark_get_entity_count
14
16
  from dve.core_engine.models import SubmissionInfo
@@ -31,7 +33,6 @@ class SparkDVEPipeline(BaseDVEPipeline):
31
33
  audit_tables: SparkAuditingManager,
32
34
  rules_path: Optional[URI],
33
35
  submitted_files_path: Optional[URI],
34
- reference_data_loader: Optional[type[BaseRefDataLoader]] = None,
35
36
  spark: Optional[SparkSession] = None,
36
37
  job_run_id: Optional[int] = None,
37
38
  logger: Optional[logging.Logger] = None,
@@ -44,11 +45,20 @@ class SparkDVEPipeline(BaseDVEPipeline):
44
45
  SparkStepImplementations.register_udfs(self._spark),
45
46
  rules_path,
46
47
  submitted_files_path,
47
- reference_data_loader,
48
48
  job_run_id,
49
49
  logger,
50
50
  )
51
51
 
52
+ def init_reference_data_loader(
53
+ self, reference_data_config: dict[str, ReferenceConfig], **kwargs
54
+ ) -> SparkRefDataLoader:
55
+ return SparkRefDataLoader(
56
+ spark=self._spark,
57
+ reference_data_config=reference_data_config,
58
+ dataset_config_uri=fh.get_parent(self._rules_path), # type: ignore
59
+ **kwargs
60
+ )
61
+
52
62
  # pylint: disable=arguments-differ
53
63
  def write_file_to_parquet( # type: ignore
54
64
  self, submission_file_uri: URI, submission_info: SubmissionInfo, output: URI
@@ -3,7 +3,7 @@
3
3
 
4
4
  import json
5
5
  from threading import Lock
6
- from typing import Optional
6
+ from typing import Any, Optional
7
7
 
8
8
  from pydantic.main import ModelMetaclass
9
9
  from pyspark.sql import SparkSession
@@ -45,10 +45,17 @@ def load_config(
45
45
  return models, config, dataset
46
46
 
47
47
 
48
- def load_reader(dataset: Dataset, model_name: str, file_extension: str):
48
+ def load_reader(
49
+ dataset: Dataset,
50
+ model_name: str,
51
+ file_extension: str,
52
+ backend_reader_kwargs: Optional[dict[str, Any]] = None,
53
+ ):
49
54
  """Loads the readers for the diven feed, model name and file extension"""
50
55
  reader_config = dataset[model_name].reader_config[f".{file_extension.lower()}"]
51
- reader = _READER_REGISTRY[reader_config.reader](**reader_config.kwargs_)
56
+ reader = _READER_REGISTRY[reader_config.reader](
57
+ **reader_config.kwargs_, **backend_reader_kwargs if backend_reader_kwargs else {}
58
+ )
52
59
  return reader
53
60
 
54
61