data-validation-engine 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. data_validation_engine-0.6.2.dist-info/METADATA +104 -0
  2. data_validation_engine-0.6.2.dist-info/RECORD +105 -0
  3. data_validation_engine-0.6.2.dist-info/WHEEL +4 -0
  4. data_validation_engine-0.6.2.dist-info/licenses/LICENSE +21 -0
  5. dve/__init__.py +0 -0
  6. dve/common/__init__.py +0 -0
  7. dve/common/error_utils.py +189 -0
  8. dve/core_engine/__init__.py +0 -0
  9. dve/core_engine/backends/__init__.py +1 -0
  10. dve/core_engine/backends/base/__init__.py +1 -0
  11. dve/core_engine/backends/base/auditing.py +618 -0
  12. dve/core_engine/backends/base/backend.py +240 -0
  13. dve/core_engine/backends/base/contract.py +454 -0
  14. dve/core_engine/backends/base/core.py +124 -0
  15. dve/core_engine/backends/base/reader.py +176 -0
  16. dve/core_engine/backends/base/reference_data.py +217 -0
  17. dve/core_engine/backends/base/rules.py +685 -0
  18. dve/core_engine/backends/base/utilities.py +146 -0
  19. dve/core_engine/backends/exceptions.py +311 -0
  20. dve/core_engine/backends/implementations/__init__.py +1 -0
  21. dve/core_engine/backends/implementations/duckdb/__init__.py +26 -0
  22. dve/core_engine/backends/implementations/duckdb/auditing.py +234 -0
  23. dve/core_engine/backends/implementations/duckdb/contract.py +213 -0
  24. dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +288 -0
  25. dve/core_engine/backends/implementations/duckdb/readers/__init__.py +13 -0
  26. dve/core_engine/backends/implementations/duckdb/readers/csv.py +222 -0
  27. dve/core_engine/backends/implementations/duckdb/readers/json.py +50 -0
  28. dve/core_engine/backends/implementations/duckdb/readers/xml.py +45 -0
  29. dve/core_engine/backends/implementations/duckdb/reference_data.py +49 -0
  30. dve/core_engine/backends/implementations/duckdb/rules.py +534 -0
  31. dve/core_engine/backends/implementations/duckdb/types.py +47 -0
  32. dve/core_engine/backends/implementations/duckdb/utilities.py +41 -0
  33. dve/core_engine/backends/implementations/spark/__init__.py +22 -0
  34. dve/core_engine/backends/implementations/spark/auditing.py +230 -0
  35. dve/core_engine/backends/implementations/spark/backend.py +78 -0
  36. dve/core_engine/backends/implementations/spark/contract.py +241 -0
  37. dve/core_engine/backends/implementations/spark/readers/__init__.py +15 -0
  38. dve/core_engine/backends/implementations/spark/readers/csv.py +77 -0
  39. dve/core_engine/backends/implementations/spark/readers/json.py +66 -0
  40. dve/core_engine/backends/implementations/spark/readers/xml.py +202 -0
  41. dve/core_engine/backends/implementations/spark/reference_data.py +42 -0
  42. dve/core_engine/backends/implementations/spark/rules.py +430 -0
  43. dve/core_engine/backends/implementations/spark/spark_helpers.py +412 -0
  44. dve/core_engine/backends/implementations/spark/types.py +21 -0
  45. dve/core_engine/backends/implementations/spark/utilities.py +144 -0
  46. dve/core_engine/backends/metadata/__init__.py +47 -0
  47. dve/core_engine/backends/metadata/contract.py +80 -0
  48. dve/core_engine/backends/metadata/reporting.py +374 -0
  49. dve/core_engine/backends/metadata/rules.py +737 -0
  50. dve/core_engine/backends/readers/__init__.py +41 -0
  51. dve/core_engine/backends/readers/csv.py +232 -0
  52. dve/core_engine/backends/readers/utilities.py +21 -0
  53. dve/core_engine/backends/readers/xml.py +432 -0
  54. dve/core_engine/backends/readers/xml_linting.py +142 -0
  55. dve/core_engine/backends/types.py +26 -0
  56. dve/core_engine/backends/utilities.py +177 -0
  57. dve/core_engine/configuration/__init__.py +1 -0
  58. dve/core_engine/configuration/base.py +56 -0
  59. dve/core_engine/configuration/v1/__init__.py +351 -0
  60. dve/core_engine/configuration/v1/filters.py +60 -0
  61. dve/core_engine/configuration/v1/rule_stores/__init__.py +1 -0
  62. dve/core_engine/configuration/v1/rule_stores/models.py +57 -0
  63. dve/core_engine/configuration/v1/steps.py +365 -0
  64. dve/core_engine/constants.py +8 -0
  65. dve/core_engine/engine.py +265 -0
  66. dve/core_engine/exceptions.py +29 -0
  67. dve/core_engine/functions/__init__.py +6 -0
  68. dve/core_engine/functions/implementations.py +200 -0
  69. dve/core_engine/loggers.py +57 -0
  70. dve/core_engine/message.py +512 -0
  71. dve/core_engine/models.py +196 -0
  72. dve/core_engine/templating.py +114 -0
  73. dve/core_engine/type_hints.py +255 -0
  74. dve/core_engine/validation.py +160 -0
  75. dve/metadata_parser/__init__.py +2 -0
  76. dve/metadata_parser/domain_types.py +682 -0
  77. dve/metadata_parser/exc.py +44 -0
  78. dve/metadata_parser/function_library.py +64 -0
  79. dve/metadata_parser/function_wrapper.py +201 -0
  80. dve/metadata_parser/model_generator.py +119 -0
  81. dve/metadata_parser/models.py +410 -0
  82. dve/metadata_parser/utilities.py +54 -0
  83. dve/parser/__init__.py +1 -0
  84. dve/parser/exceptions.py +50 -0
  85. dve/parser/file_handling/__init__.py +31 -0
  86. dve/parser/file_handling/helpers.py +29 -0
  87. dve/parser/file_handling/implementations/__init__.py +7 -0
  88. dve/parser/file_handling/implementations/base.py +97 -0
  89. dve/parser/file_handling/implementations/dbfs.py +81 -0
  90. dve/parser/file_handling/implementations/file.py +203 -0
  91. dve/parser/file_handling/implementations/s3.py +371 -0
  92. dve/parser/file_handling/log_handler.py +215 -0
  93. dve/parser/file_handling/service.py +441 -0
  94. dve/parser/file_handling/utilities.py +53 -0
  95. dve/parser/type_hints.py +46 -0
  96. dve/parser/utilities.py +113 -0
  97. dve/pipeline/__init__.py +0 -0
  98. dve/pipeline/duckdb_pipeline.py +56 -0
  99. dve/pipeline/foundry_ddb_pipeline.py +171 -0
  100. dve/pipeline/pipeline.py +935 -0
  101. dve/pipeline/spark_pipeline.py +69 -0
  102. dve/pipeline/utils.py +96 -0
  103. dve/reporting/__init__.py +1 -0
  104. dve/reporting/error_report.py +153 -0
  105. dve/reporting/excel_report.py +319 -0
@@ -0,0 +1,44 @@
1
+ """Exceptions around entities and for use in validators"""
2
+
3
+
4
class EntityNotFoundError(KeyError):
    """Raised when an entity is missing."""
6
+
7
+
8
class LocWarning(UserWarning):
    """Warning that carries a message and an optional location.

    The values are available as the ``msg`` and ``loc`` attributes, and
    ``str()`` renders them as ``"<loc> <msg>"``.
    """

    def __init__(self, msg, loc=""):
        """Store the message and the location it relates to.

        Args:
            msg: text describing the problem.
            loc: where the problem occurred. Defaults to an empty string.

        """
        self.msg = msg
        self.loc = loc
        # Forward both values so ``args`` mirrors the constructor signature.
        # With empty ``args`` the warning cannot be copied or pickled, because
        # reconstruction calls ``LocWarning(*args)`` and ``msg`` is required.
        super().__init__(msg, loc)

    def __str__(self):
        """Return the location and the message separated by one space."""
        return f"{self.loc} {self.msg}"
19
+
20
+
21
class RecordRejection(ValueError):
    """Signals that a single record has been rejected."""
23
+
24
+
25
class FileRejection(ValueError):
    """Signals that a whole file has been rejected."""
27
+
28
+
29
class GroupRejection(ValueError):
    """Signals that a chain of records has been rejected.

    Normally used in a referential integrity check or for duplicated records.
    """
33
+
34
+
35
class TypeNotFoundError(KeyError):
    """Raised when the correct type cannot be located from the passed in metadata."""
37
+
38
+
39
# Lookup from lower-case error-type name to the exception/warning class it selects.
ERRORS = dict(
    recordrejection=RecordRejection,
    filerejection=FileRejection,
    warning=LocWarning,
    grouprejection=GroupRejection,
)
@@ -0,0 +1,64 @@
1
+ """Function library for wrapping into pydantic validators"""
2
+
3
+ import functools
4
+
5
+ import pydantic
6
+
7
+
8
def _nullcheck(func):
    """Decorate ``func`` so that null-like first arguments short-circuit to ``None``.

    The wrapped function is only called when its first argument (``value``)
    is neither ``None`` nor blank once converted to a string and stripped.
    Only ``value`` is checked; any additional arguments are passed through
    untouched, so the wrapped function may still need its own null checks
    for those.

    e.g.
    ```python
    @_nullcheck  # nullchecks value but not other_field
    def func(value, other_field):
        if other_field is None:
            # do null thing
        # logic
    ```
    """

    @functools.wraps(func)
    def inner(value, *args, **kwargs):
        is_blank = value is None or str(value).strip() == ""
        return None if is_blank else func(value, *args, **kwargs)

    return inner
30
+
31
+
32
+ # demo function
33
@_nullcheck
@pydantic.validate_arguments
def normalise(value, capitalize: bool = False):  # pragma: no cover
    """Return ``value`` capitalised as a string, or unchanged.

    When ``capitalize`` is true the value is converted to ``str`` and
    capitalised; otherwise it is returned as received.
    """
    return str(value).capitalize() if capitalize else value
40
+
41
+
42
@_nullcheck
def exclude_word(value, word: str):
    """Return ``None`` if ``word`` occurs in ``value``, otherwise return ``value``.

    The comparison is case-insensitive and is made on the string forms of
    both arguments.

    Args:
        value: the value being validated (already null-checked by the decorator).
        word: the word that must not appear in ``value``.

    Returns:
        ``None`` when ``word`` is found in ``value``; ``value`` otherwise,
        including when there is no word to look for.

    """
    # ``_nullcheck`` only guards ``value``. A missing word used to raise an
    # AttributeError, and an empty word is a substring of every string and so
    # nulled every value. With nothing to exclude, keep the value as it is.
    if word is None or str(word) == "":
        return value
    if str(word).lower() in str(value).lower():
        return None
    return value
48
+
49
+
50
@_nullcheck
@pydantic.validate_arguments
def split(value, split_on: str, keep: int = 0):
    """Split the string form of ``value`` on ``split_on`` and return one piece.

    ``keep`` is the index of the piece to return and defaults to 0. An index
    outside the split result is reported as a ``ValueError``.
    """
    pieces = str(value).split(split_on)
    try:
        return pieces[keep]
    except IndexError as exc:
        raise ValueError from exc
60
+
61
+
62
def static_key(value):  # pylint: disable=W0613
    """Return the constant 1 regardless of ``value``, for use as a static join key."""
    constant_key = 1
    return constant_key
@@ -0,0 +1,201 @@
1
+ """Wrapping functions for wrapping generic functions"""
2
+
3
+ import warnings
4
+ from collections.abc import Callable, Iterable
5
+ from typing import Any, Optional, Union
6
+
7
+ import pydantic
8
+
9
+ from dve.metadata_parser import exc
10
+
11
PydanticCompatible = Callable[
    [Any, dict[str, Any], pydantic.fields.ModelField, pydantic.BaseConfig],
    Any,
]
"""Function compatible with pydantic

Args:
    value (Any): Value to be validated
    values (dict[str, Any]): dict of previously validated fields
    field (pydantic.fields.ModelField): field object containing field name and type
    config (pydantic.BaseConfig): the config that determines things like aliases

"""
22
+
23
+
24
def error_handler(
    error_type: Union[type[Exception], type[Warning]],
    error_message: str,
    field: pydantic.fields.ModelField,
):
    """Raise or warn with ``error_message`` depending on ``error_type``.

    Warning types are emitted through ``warnings.warn``; ``LocWarning``
    subclasses are additionally given the field name as their location.
    Any other type is raised as an exception.

    Args:
        error_type (Union[type[Exception], type[Warning]]): type of error to raise
        error_message (str): message to apply
        field (pydantic.fields.ModelField): field that caused the error to be raised

    Raises:
        error_type: when it is not a Warning subclass

    """
    # Anything that is not a warning is a hard failure.
    if not issubclass(error_type, Warning):
        raise error_type(error_message)

    # Location-aware warnings are built here so the field name can be attached.
    if issubclass(error_type, exc.LocWarning):
        warnings.warn(error_type(msg=error_message, loc=field.name))
        return

    warnings.warn(error_message, error_type)
46
+
47
+
48
def pydantic_wrapper(
    error_type: Union[type[Exception], type[Warning]],
    error_message: str,
    *field_names: str,
    failure_function: Callable = lambda x: x is False,
    return_result: bool = True,
    **kwargs,
) -> Callable[[Callable], PydanticCompatible]:
    """Wraps generic functions and returns a pydantic compatible function

    Takes an error type and error_message to allow the failure of a function to be
    customised.

    Takes field_names of other fields to be passed to the generic function. They are
    looked up in the previously validated values and passed positionally after the
    value, so they should be in the order they appear in the function signature.

    Args:
        error_type (type[Exception]): The exception (or warning) type used when the
            wrapped function fails
        error_message (str): Message to be passed to the above exception
        *field_names (str): names of previously validated fields to pass to the function
        failure_function (Optional[Callable]): A callable that when it evaluates to True
            raises the above exception. Only consulted when return_result is False.
            Defaults to lambda x: x is False.
        return_result (bool): Whether to return the result from the wrapped function
            or the original value that was passed.
            True should be used when the function transforms the data in some way
            e.g. stripping whitespace from NHS numbers.
            False should be used when the function is a comparison
            e.g. x > y. in this case the value of the field is returned rather than the bool.
        **kwargs: extra keyword arguments forwarded to the wrapped function

    """

    def wrapper(
        func: Callable,
    ) -> PydanticCompatible:
        """Wraps the passed function and returns a function with a pydantic compatible calling
        signature

        Args:
            func (Callable): function to be wrapped

        Raises:
            error_type: error passed in with the message given

        Returns:
            Callable: wrapped function with call signature:
                (value: Any, values: dict, field: ModelField, config: BaseConfig) -> Any

        """

        def inner(
            value: Any,
            values: dict[str, Any],
            field: pydantic.fields.ModelField,  # pylint: disable=unused-argument
            config: pydantic.BaseConfig,  # pylint: disable=unused-argument
        ) -> Any:
            fields = [values.get(name) for name in field_names]
            # Stays None if the function raises and error_handler only warns, so a
            # transforming validator (return_result=True) then returns None.
            result = None
            try:
                result = func(value, *fields, **kwargs)
            except (ValueError, TypeError, AssertionError):
                error_handler(error_type, error_message, field)
            else:
                # The failure check only applies when the result is NOT returned,
                # i.e. the function is a comparison whose outcome decides pass/fail.
                if not return_result and failure_function(result):
                    error_handler(error_type, error_message, field)
            if return_result:
                return result
            return value

        # Callables such as functools.partial objects have no __name__; fall back to
        # the type name rather than failing while the validator is being built.
        inner.__name__ = f"wrapped_{getattr(func, '__name__', type(func).__name__)}"
        return inner

    return wrapper
121
+
122
+
123
# Copy of the keyword-only defaults of ``pydantic.validator``. ``create_validator``
# iterates over these names and defaults to pull validator options out of its
# ``**kwargs``.
validator_args = pydantic.validator.__kwdefaults__.copy()
124
+
125
+
126
def create_validator(
    function: Callable,
    field: str,
    error_type: type[Exception],
    error_message: str,
    fields: Optional[Iterable[str]] = None,
    return_result=True,
    **kwargs,
):
    """Build a pydantic validator for ``field`` out of a generic function.

    Args:
        function (Callable): function to wrap
        field (str): field the validator is applied to
        error_type (type[Exception]): Error to be raised on failure
        error_message (str): Message to be raised on failure
        fields (Iterable[str]): other fields to be included in validation
            (in order of arguments)
        return_result (bool): whether to return the result of the wrapped function
            or the unchanged value. Defaults to True.
        kwargs:
            pydantic_wrapper kwargs:
                failure_function: function called to cause the function to raise the
                    passed error when it evaluates to True

            pydantic.validator kwargs:
                pre: bool = False
                each_item: bool = False
                always: bool = False
                check_fields: bool = True
                whole: bool = None
                allow_reuse: bool = False

            function kwargs

    Returns:
        classmethod: wrapped function wrapped by pydantic's validator

    """
    # Move every pydantic.validator option out of kwargs (falling back to the
    # validator's own default) so only wrapper/function kwargs remain.
    raw_options = {}
    for option, default in validator_args.items():
        raw_options[option] = kwargs.pop(option, default)
    validator_kwargs = _validator_kwargs(**raw_options)

    other_fields = [] if fields is None else fields

    decorate = pydantic_wrapper(
        error_type,
        error_message,
        *other_fields,
        return_result=return_result,
        **kwargs,
    )
    wrapped = decorate(function)

    # Reuse is always enabled, whatever the caller asked for.
    validator_kwargs["allow_reuse"] = True
    return pydantic.validator(field, **validator_kwargs)(wrapped)
182
+
183
+
184
@pydantic.validate_arguments
def _validator_kwargs(
    pre: bool = False,
    each_item: bool = False,
    always: bool = False,
    check_fields: bool = True,
    whole: Optional[bool] = None,
    allow_reuse: bool = False,
    **kwargs,  # pylint: disable=unused-argument
):
    """Return the recognised validator options as a dict.

    The arguments pass through ``pydantic.validate_arguments``; any extra
    keyword arguments are accepted and dropped.
    """
    return dict(
        pre=pre,
        each_item=each_item,
        always=always,
        check_fields=check_fields,
        whole=whole,
        allow_reuse=allow_reuse,
    )
@@ -0,0 +1,119 @@
1
+ """Tools for parsing metadata to pydantic Models"""
2
+
3
+ # pylint: disable=super-init-not-called
4
+ import warnings
5
+ from abc import ABCMeta, abstractmethod
6
+ from collections.abc import Mapping
7
+ from copy import deepcopy
8
+
9
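+ # This _needs_ to be `typing.Mapping`, or pydantic complains.
+ # NOTE(review): `Mapping` is imported from `collections.abc` above and is not part of the `typing` import below - confirm which is intended.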
10
+ from typing import Any, Optional, Union
11
+
12
+ import pydantic as pyd
13
+ from typing_extensions import Literal
14
+
15
+ from dve.metadata_parser import domain_types
16
+ from dve.metadata_parser.models import DatasetSpecification
17
+ from dve.metadata_parser.utilities import FieldTypeOption
18
+
19
+ # Type doesn't like Ellipsis. the type hint this is used in doesn't like type(Ellipsis)
20
# Alias used wherever an annotation needs to refer to the type of ``...``.
EllipsisType = Any
"""3.9 compatible ellipsis type"""

# Either a concrete class or any other annotation object.
TypeAnnotation = Union[type, Any]  # pylint: disable=invalid-name
"""An arbitrary type annotation"""
# The two kinds of 'table' a schema may declare.
TableType = Literal["schema", "table"]
"""
The type of 'table' specified in the schema.

Values:

 - 'table': the table is a 'real' table, and should be expected
   to be created by the engine.
 - 'schema': the 'table' is a schema definition for a complex field
   within a table. These should be defined before the tables they
   are used in.

"""
38
+
39
+
40
@pyd.validate_arguments
def constr(
    *,
    strip_whitespace: bool = False,
    to_lower: bool = False,
    strict: bool = False,
    min_length: Optional[int] = None,
    max_length: Optional[int] = None,
    curtail_length: Optional[int] = None,
    regex: Optional[str] = None,
):
    """Call ``pyd.constr`` with arguments that have been validated first.

    All options are keyword-only and are forwarded to ``pyd.constr`` unchanged.
    """
    constraints = {
        "strip_whitespace": strip_whitespace,
        "to_lower": to_lower,
        "strict": strict,
        "min_length": min_length,
        "max_length": max_length,
        "curtail_length": curtail_length,
        "regex": regex,
    }
    return pyd.constr(**constraints)  # type: ignore
61
+
62
+
63
# Default lookup from the type names used in a contract to the object that
# provides the field type. JSONtoPyd deep-copies this mapping when it is not
# given a type_map of its own.
STR_TO_PY_MAPPING: Mapping[str, FieldTypeOption] = {
    # Constrained-type factories with argument validation applied.
    "constr": constr,
    "conint": pyd.validate_arguments(pyd.conint),
    "condate": pyd.validate_arguments(pyd.condate),
    "condecimal": pyd.validate_arguments(pyd.condecimal),
    # Entries taken from dve.metadata_parser.domain_types.
    "postcode": domain_types.Postcode,
    "nhsnumber": domain_types.NHSNumber,
    # Called once here, at import time; the mapping stores the returned object.
    "permissivenhsno": domain_types.permissive_nhs_number(),
    "alphanumeric": domain_types.alphanumeric,
    "identifier": domain_types.identifier,
    "orgid": domain_types.OrgID,
    "formatteddatetime": domain_types.formatteddatetime,
    "formattedtime": domain_types.formattedtime,
    "conformatteddate": domain_types.conformatteddate,
    # Also called at import time, once per reporting period type.
    "reportingperiodstart": domain_types.reportingperiod(reporting_period_type="start"),
    "reportingperiodend": domain_types.reportingperiod(reporting_period_type="end"),
}
80
+
81
+
82
class ModelLoader(metaclass=ABCMeta):  # pylint: disable=too-few-public-methods
    """Abstract base for loaders that turn contract contents into models."""

    @abstractmethod
    def __init__(self, contract_contents: dict[str, Any], type_map: Optional[dict] = None):
        raise NotImplementedError

    @abstractmethod
    def generate_models(
        self, additional_validators: Optional[dict] = None
    ) -> dict[str, pyd.main.ModelMetaclass]:
        """Generate models from the instance schema.

        Args:
            additional_validators (Optional[dict], optional): Any validation rules aside
                from those described in the schema. Defaults to None [DEPRECATED]

        Returns:
            dict[str, model]: dict of table names to pydantic models

        """
        raise NotImplementedError
104
+
105
+
106
class JSONtoPyd(ModelLoader):  # pylint: disable=too-few-public-methods
    """Model loader that builds pydantic models from a JSON schema."""

    def __init__(self, contract_contents: dict[str, Any], type_map: Optional[dict] = None):
        self.contract_contents = contract_contents
        # A missing or empty type_map falls back to the module default. A deep
        # copy is stored either way, so the source mapping is never shared.
        source_map = type_map if type_map else STR_TO_PY_MAPPING
        self.type_map = deepcopy(source_map)

    def generate_models(
        self, additional_validators: Optional[dict] = None
    ) -> dict[str, pyd.main.ModelMetaclass]:
        """Build the pydantic models described by the loaded contract contents.

        ``additional_validators`` is not used; supplying any only triggers a warning.
        """
        if additional_validators:
            warnings.warn("Ignoring additional validator functions")
        specification = DatasetSpecification(**self.contract_contents)
        return specification.load_models(self.type_map)