data-validation-engine 0.7.2__tar.gz → 0.7.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/PKG-INFO +17 -13
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/README.md +15 -12
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/pyproject.toml +9 -3
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/contract.py +9 -15
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +105 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/spark_helpers.py +100 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/metadata_parser/domain_types.py +2 -1
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/pipeline/foundry_ddb_pipeline.py +5 -2
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/pipeline/pipeline.py +12 -4
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/LICENSE +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/common/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/common/error_utils.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/base/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/base/auditing.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/base/backend.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/base/contract.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/base/core.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/base/reader.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/base/reference_data.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/base/rules.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/base/utilities.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/exceptions.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/auditing.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/readers/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/readers/csv.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/readers/json.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/readers/xml.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/reference_data.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/rules.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/types.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/utilities.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/auditing.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/backend.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/contract.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/readers/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/readers/csv.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/readers/json.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/readers/xml.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/reference_data.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/rules.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/types.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/utilities.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/metadata/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/metadata/contract.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/metadata/reporting.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/metadata/rules.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/readers/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/readers/csv.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/readers/utilities.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/readers/xml.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/readers/xml_linting.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/types.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/utilities.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/configuration/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/configuration/base.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/configuration/v1/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/configuration/v1/filters.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/configuration/v1/rule_stores/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/configuration/v1/rule_stores/models.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/configuration/v1/steps.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/constants.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/engine.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/exceptions.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/functions/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/functions/implementations.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/loggers.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/message.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/models.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/templating.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/type_hints.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/validation.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/metadata_parser/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/metadata_parser/exc.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/metadata_parser/function_library.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/metadata_parser/function_wrapper.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/metadata_parser/model_generator.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/metadata_parser/models.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/metadata_parser/utilities.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/exceptions.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/file_handling/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/file_handling/helpers.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/file_handling/implementations/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/file_handling/implementations/base.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/file_handling/implementations/dbfs.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/file_handling/implementations/file.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/file_handling/implementations/s3.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/file_handling/log_handler.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/file_handling/service.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/file_handling/utilities.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/type_hints.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/utilities.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/pipeline/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/pipeline/duckdb_pipeline.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/pipeline/spark_pipeline.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/pipeline/utils.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/reporting/__init__.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/reporting/error_report.py +0 -0
- {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/reporting/excel_report.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: data-validation-engine
|
|
3
|
-
Version: 0.7.2
|
|
3
|
+
Version: 0.7.4
|
|
4
4
|
Summary: `nhs data validation engine` is a framework used to validate data
|
|
5
5
|
License-Expression: MIT
|
|
6
6
|
License-File: LICENSE
|
|
@@ -19,6 +19,7 @@ Requires-Dist: botocore (>=1.34.162,<1.36)
|
|
|
19
19
|
Requires-Dist: delta-spark (==2.4.*)
|
|
20
20
|
Requires-Dist: duckdb (==1.1.*)
|
|
21
21
|
Requires-Dist: lxml (>=4.9.1,<5.0.0)
|
|
22
|
+
Requires-Dist: numpy (==1.26.4)
|
|
22
23
|
Requires-Dist: openpyxl (>=3.1,<4.0)
|
|
23
24
|
Requires-Dist: pandas (>=2.2.2,<3.0.0)
|
|
24
25
|
Requires-Dist: polars (==0.20.*)
|
|
@@ -45,7 +46,7 @@ Description-Content-Type: text/markdown
|
|
|
45
46
|
|
|
46
47
|
The Data Validation Engine (DVE) is a configuration driven data validation library built and utilised by NHS England. Currently the package has been reverted from v1.0.0 release to a 0.x as we feel the package is not yet mature enough to be considered a 1.0.0 release. So please bear this in mind if reading through the commits and references to a v1+ release when on v0.x.
|
|
47
48
|
|
|
48
|
-
As mentioned above, the DVE is "configuration driven" which means the majority of development for you as a user will be building a JSON document to describe how the data will be validated. The JSON document is known as a `dischema` file and example files can be accessed [here](
|
|
49
|
+
As mentioned above, the DVE is "configuration driven" which means the majority of development for you as a user will be building a JSON document to describe how the data will be validated. The JSON document is known as a `dischema` file and example files can be accessed [here](https://github.com/NHSDigital/data-validation-engine/tree/main/tests/testdata). If you'd like to learn more about the JSON document and how to build one from scratch, then please read the documentation [here](https://nhsdigital.github.io/data-validation-engine/).
|
|
49
50
|
|
|
50
51
|
Once a dischema file has been defined, you are ready to use the DVE. The DVE is typically orchestrated based on four key "services". These are...
|
|
51
52
|
|
|
@@ -56,7 +57,7 @@ Once a dischema file has been defined, you are ready to use the DVE. The DVE is
|
|
|
56
57
|
| 3. | Business Rules | The business rules service will perform more complex validations such as comparisons between fields and tables, aggregations, filters etc to generate new entities. |
|
|
57
58
|
| 4. | Error Reports | The error reports service will take all the errors raised in previous services and surface them into a readable format for downstream users/services. Currently, this is implemented as an Excel spreadsheet but could be reconfigured to meet other requirements/use cases. |
|
|
58
59
|
|
|
59
|
-
If you'd like more detailed documentation around these services the please read the extended documentation [here](
|
|
60
|
+
If you'd like more detailed documentation around these services then please read the extended documentation [here](https://nhsdigital.github.io/data-validation-engine/).
|
|
60
61
|
|
|
61
62
|
The DVE has been designed in a way that's modular and can support users who just want to utilise specific "services" from the DVE (i.e. just the file transformation + data contract). Additionally, the DVE is designed to support different backend implementations. As part of the base installation of DVE, you will find backend support for `Spark` and `DuckDB`. So, if you need a `MySQL` backend implementation, you can implement this yourself. Given our organisation's requirements, it is unlikely that we will add any more specific backend implementations into the base package beyond Spark and DuckDB. So, if you are unable to implement this yourself, I would recommend reading the guidance on [requesting new features and raising bug reports here](#requesting-new-features-and-raising-bug-reports).
|
|
62
63
|
|
|
@@ -78,7 +79,7 @@ pip install data-validation-engine
|
|
|
78
79
|
|
|
79
80
|
*Note - Only versions >=0.6.2 are available on PyPi. For older versions please install directly from the git repo or build from source.*
|
|
80
81
|
|
|
81
|
-
Once you have installed the DVE you are ready to use it. For guidance on how to create your dischema JSON document (configuration), please read the [documentation](
|
|
82
|
+
Once you have installed the DVE you are ready to use it. For guidance on how to create your dischema JSON document (configuration), please read the [documentation](https://nhsdigital.github.io/data-validation-engine/).
|
|
82
83
|
|
|
83
84
|
Version 0.0.1 does support a working Python 3.7 installation. However, we will not be supporting any issues with that version of the DVE if you choose to use it. __Use at your own risk__.
|
|
84
85
|
|
|
@@ -91,17 +92,20 @@ If you have feature request then please follow the same process whilst using the
|
|
|
91
92
|
|
|
92
93
|
## Upcoming features
|
|
93
94
|
Below is a list of features that we would like to implement or have been requested.
|
|
94
|
-
| Feature
|
|
95
|
-
|
|
|
96
|
-
| Open source release
|
|
97
|
-
| Uplift to Python 3.11
|
|
98
|
-
|
|
|
99
|
-
|
|
|
100
|
-
|
|
101
|
-
|
|
95
|
+
| Feature | Release Version | Released? |
|
|
96
|
+
| ------------------------------------------------------------------------------- | ----------------- | --------- |
|
|
97
|
+
| Open source release | 0.1.0 | Yes |
|
|
98
|
+
| Uplift to Python 3.11 | 0.2.0 | Yes |
|
|
99
|
+
| Uplift Pyspark to 3.5 | TBA | No |
|
|
100
|
+
| Allow DVE to run on Python 3.12+ | TBA | No |
|
|
101
|
+
| Upgrade to Pydantic 2.0 | TBA | No |
|
|
102
|
+
| Uplift Pyspark to 4.0+ | TBA | No |
|
|
103
|
+
| Create a more user friendly interface for building and modifying dischema files | Not yet confirmed | No |
|
|
104
|
+
|
|
105
|
+
Beyond the Python and Pydantic upgrade, we cannot confirm the other features will be made available anytime soon. Therefore, if you have the interest and desire to make these features available, then please read the [Contributing](#Contributing) section and get involved.
|
|
102
106
|
|
|
103
107
|
## Contributing
|
|
104
|
-
Please see guidance [here](
|
|
108
|
+
Please see guidance [here](https://github.com/NHSDigital/data-validation-engine/blob/main/CONTRIBUTE.md).
|
|
105
109
|
|
|
106
110
|
## Legal
|
|
107
111
|
This codebase is released under the MIT License. This covers both the codebase and any sample code in the documentation.
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
|
|
11
11
|
The Data Validation Engine (DVE) is a configuration driven data validation library built and utilised by NHS England. Currently the package has been reverted from v1.0.0 release to a 0.x as we feel the package is not yet mature enough to be considered a 1.0.0 release. So please bear this in mind if reading through the commits and references to a v1+ release when on v0.x.
|
|
12
12
|
|
|
13
|
-
As mentioned above, the DVE is "configuration driven" which means the majority of development for you as a user will be building a JSON document to describe how the data will be validated. The JSON document is known as a `dischema` file and example files can be accessed [here](
|
|
13
|
+
As mentioned above, the DVE is "configuration driven" which means the majority of development for you as a user will be building a JSON document to describe how the data will be validated. The JSON document is known as a `dischema` file and example files can be accessed [here](https://github.com/NHSDigital/data-validation-engine/tree/main/tests/testdata). If you'd like to learn more about the JSON document and how to build one from scratch, then please read the documentation [here](https://nhsdigital.github.io/data-validation-engine/).
|
|
14
14
|
|
|
15
15
|
Once a dischema file has been defined, you are ready to use the DVE. The DVE is typically orchestrated based on four key "services". These are...
|
|
16
16
|
|
|
@@ -21,7 +21,7 @@ Once a dischema file has been defined, you are ready to use the DVE. The DVE is
|
|
|
21
21
|
| 3. | Business Rules | The business rules service will perform more complex validations such as comparisons between fields and tables, aggregations, filters etc to generate new entities. |
|
|
22
22
|
| 4. | Error Reports | The error reports service will take all the errors raised in previous services and surface them into a readable format for downstream users/services. Currently, this is implemented as an Excel spreadsheet but could be reconfigured to meet other requirements/use cases. |
|
|
23
23
|
|
|
24
|
-
If you'd like more detailed documentation around these services the please read the extended documentation [here](
|
|
24
|
+
If you'd like more detailed documentation around these services then please read the extended documentation [here](https://nhsdigital.github.io/data-validation-engine/).
|
|
25
25
|
|
|
26
26
|
The DVE has been designed in a way that's modular and can support users who just want to utilise specific "services" from the DVE (i.e. just the file transformation + data contract). Additionally, the DVE is designed to support different backend implementations. As part of the base installation of DVE, you will find backend support for `Spark` and `DuckDB`. So, if you need a `MySQL` backend implementation, you can implement this yourself. Given our organisation's requirements, it is unlikely that we will add any more specific backend implementations into the base package beyond Spark and DuckDB. So, if you are unable to implement this yourself, I would recommend reading the guidance on [requesting new features and raising bug reports here](#requesting-new-features-and-raising-bug-reports).
|
|
27
27
|
|
|
@@ -43,7 +43,7 @@ pip install data-validation-engine
|
|
|
43
43
|
|
|
44
44
|
*Note - Only versions >=0.6.2 are available on PyPi. For older versions please install directly from the git repo or build from source.*
|
|
45
45
|
|
|
46
|
-
Once you have installed the DVE you are ready to use it. For guidance on how to create your dischema JSON document (configuration), please read the [documentation](
|
|
46
|
+
Once you have installed the DVE you are ready to use it. For guidance on how to create your dischema JSON document (configuration), please read the [documentation](https://nhsdigital.github.io/data-validation-engine/).
|
|
47
47
|
|
|
48
48
|
Version 0.0.1 does support a working Python 3.7 installation. However, we will not be supporting any issues with that version of the DVE if you choose to use it. __Use at your own risk__.
|
|
49
49
|
|
|
@@ -56,17 +56,20 @@ If you have feature request then please follow the same process whilst using the
|
|
|
56
56
|
|
|
57
57
|
## Upcoming features
|
|
58
58
|
Below is a list of features that we would like to implement or have been requested.
|
|
59
|
-
| Feature
|
|
60
|
-
|
|
|
61
|
-
| Open source release
|
|
62
|
-
| Uplift to Python 3.11
|
|
63
|
-
|
|
|
64
|
-
|
|
|
65
|
-
|
|
66
|
-
|
|
59
|
+
| Feature | Release Version | Released? |
|
|
60
|
+
| ------------------------------------------------------------------------------- | ----------------- | --------- |
|
|
61
|
+
| Open source release | 0.1.0 | Yes |
|
|
62
|
+
| Uplift to Python 3.11 | 0.2.0 | Yes |
|
|
63
|
+
| Uplift Pyspark to 3.5 | TBA | No |
|
|
64
|
+
| Allow DVE to run on Python 3.12+ | TBA | No |
|
|
65
|
+
| Upgrade to Pydantic 2.0 | TBA | No |
|
|
66
|
+
| Uplift Pyspark to 4.0+ | TBA | No |
|
|
67
|
+
| Create a more user friendly interface for building and modifying dischema files | Not yet confirmed | No |
|
|
68
|
+
|
|
69
|
+
Beyond the Python and Pydantic upgrade, we cannot confirm the other features will be made available anytime soon. Therefore, if you have the interest and desire to make these features available, then please read the [Contributing](#Contributing) section and get involved.
|
|
67
70
|
|
|
68
71
|
## Contributing
|
|
69
|
-
Please see guidance [here](
|
|
72
|
+
Please see guidance [here](https://github.com/NHSDigital/data-validation-engine/blob/main/CONTRIBUTE.md).
|
|
70
73
|
|
|
71
74
|
## Legal
|
|
72
75
|
This codebase is released under the MIT License. This covers both the codebase and any sample code in the documentation.
|
|
@@ -24,7 +24,7 @@ Issues = "https://github.com/NHSDigital/data-validation-engine/issues"
|
|
|
24
24
|
Changelog = "https://github.com/NHSDigital/data-validation-engine/blob/main/CHANGELOG.md"
|
|
25
25
|
|
|
26
26
|
[tool.poetry]
|
|
27
|
-
version = "0.7.2"
|
|
27
|
+
version = "0.7.4"
|
|
28
28
|
packages = [
|
|
29
29
|
{ include = "dve", from = "src" },
|
|
30
30
|
]
|
|
@@ -37,6 +37,7 @@ delta-spark = "2.4.*"
|
|
|
37
37
|
duckdb = "1.1.*" # breaking changes beyond 1.1
|
|
38
38
|
Jinja2 = "3.1.*"
|
|
39
39
|
lxml = "^4.9.1"
|
|
40
|
+
numpy = "1.26.4"
|
|
40
41
|
openpyxl = "^3.1"
|
|
41
42
|
pandas = "^2.2.2"
|
|
42
43
|
polars = "0.20.*"
|
|
@@ -55,6 +56,9 @@ include-groups = [
|
|
|
55
56
|
[tool.poetry.group.dev.dependencies]
|
|
56
57
|
commitizen = "4.9.1"
|
|
57
58
|
pre-commit = "4.3.0"
|
|
59
|
+
charset-normalizer = "3.4.6"
|
|
60
|
+
python-discovery = "1.2.0"
|
|
61
|
+
requests = "2.33.0"
|
|
58
62
|
|
|
59
63
|
[tool.poetry.group.test]
|
|
60
64
|
optional = true
|
|
@@ -97,8 +101,10 @@ optional = true
|
|
|
97
101
|
[tool.poetry.group.docs.dependencies]
|
|
98
102
|
click = "8.2.1"
|
|
99
103
|
mkdocs = "^1.6.1"
|
|
100
|
-
mkdocstrings = { version = "
|
|
101
|
-
|
|
104
|
+
mkdocstrings = { version = "1.0.3", extras = ["python"] }
|
|
105
|
+
griffelib = "2.0.1"
|
|
106
|
+
pymdown-extensions = "10.21.2"
|
|
107
|
+
zensical = "0.0.31"
|
|
102
108
|
|
|
103
109
|
[tool.ruff]
|
|
104
110
|
line-length = 100
|
|
@@ -31,6 +31,7 @@ from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import (
|
|
|
31
31
|
duckdb_read_parquet,
|
|
32
32
|
duckdb_record_index,
|
|
33
33
|
duckdb_write_parquet,
|
|
34
|
+
get_duckdb_cast_statement_from_annotation,
|
|
34
35
|
get_duckdb_type_from_annotation,
|
|
35
36
|
relation_is_empty,
|
|
36
37
|
)
|
|
@@ -101,18 +102,7 @@ class DuckDBDataContract(BaseDataContract[DuckDBPyRelation]):
|
|
|
101
102
|
_lazy_df = pl.LazyFrame(records, polars_schema) # type: ignore # pylint: disable=unused-variable
|
|
102
103
|
return self._connection.sql("select * from _lazy_df")
|
|
103
104
|
|
|
104
|
-
|
|
105
|
-
def generate_ddb_cast_statement(
|
|
106
|
-
column_name: str, dtype: DuckDBPyType, null_flag: bool = False
|
|
107
|
-
) -> str:
|
|
108
|
-
"""Helper method to generate sql statements for casting datatypes (permissively).
|
|
109
|
-
Current duckdb python API doesn't play well with this currently.
|
|
110
|
-
"""
|
|
111
|
-
if not null_flag:
|
|
112
|
-
return f'try_cast("{column_name}" AS {dtype}) AS "{column_name}"'
|
|
113
|
-
return f'cast(NULL AS {dtype}) AS "{column_name}"'
|
|
114
|
-
|
|
115
|
-
# pylint: disable=R0914
|
|
105
|
+
# pylint: disable=R0914,R0915
|
|
116
106
|
def apply_data_contract(
|
|
117
107
|
self,
|
|
118
108
|
working_dir: URI,
|
|
@@ -180,12 +170,16 @@ class DuckDBDataContract(BaseDataContract[DuckDBPyRelation]):
|
|
|
180
170
|
|
|
181
171
|
casting_statements = [
|
|
182
172
|
(
|
|
183
|
-
|
|
173
|
+
get_duckdb_cast_statement_from_annotation(column, mdl_fld.annotation)
|
|
174
|
+
+ f""" AS "{column}" """
|
|
184
175
|
if column in relation.columns
|
|
185
|
-
else
|
|
176
|
+
else f"CAST(NULL AS {ddb_schema[column]}) AS {column}"
|
|
186
177
|
)
|
|
187
|
-
for column,
|
|
178
|
+
for column, mdl_fld in entity_fields.items()
|
|
188
179
|
]
|
|
180
|
+
casting_statements.append(
|
|
181
|
+
f"CAST({RECORD_INDEX_COLUMN_NAME} AS {get_duckdb_type_from_annotation(int)}) AS {RECORD_INDEX_COLUMN_NAME}" # pylint: disable=C0301
|
|
182
|
+
)
|
|
189
183
|
try:
|
|
190
184
|
relation = relation.project(", ".join(casting_statements))
|
|
191
185
|
except Exception as err: # pylint: disable=broad-except
|
|
@@ -313,3 +313,108 @@ def duckdb_record_index(cls):
|
|
|
313
313
|
setattr(cls, "add_record_index", _add_duckdb_record_index)
|
|
314
314
|
setattr(cls, "drop_record_index", _drop_duckdb_record_index)
|
|
315
315
|
return cls
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def _cast_as_ddb_type(field_expr: str, type_annotation: Any) -> str:
    """Wrap a SQL expression in a DuckDB ``try_cast``.

    Args:
        field_expr: The SQL expression to be cast.
        type_annotation: Python type annotation; the cast target is whatever
            ``get_duckdb_type_from_annotation`` returns for it.

    Returns:
        A ``try_cast(<expr> as <type>)`` SQL fragment.
    """
    duck_type = get_duckdb_type_from_annotation(type_annotation)
    return f"try_cast({field_expr} as {duck_type})"
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def _ddb_safely_quote_name(field_name: str) -> str:
|
|
324
|
+
"""Quote field names in case reserved"""
|
|
325
|
+
try:
|
|
326
|
+
sep_idx = field_name.index(".")
|
|
327
|
+
return f'"{field_name[: sep_idx]}"' + field_name[sep_idx:]
|
|
328
|
+
except ValueError:
|
|
329
|
+
return f'"{field_name}"'
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
# pylint: disable=R0801,R0911,R0912
|
|
333
|
+
def get_duckdb_cast_statement_from_annotation(
    element_name: str,
    type_annotation: Any,
    parent_element: bool = True,
    date_regex: str = r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$",
    timestamp_regex: str = r"^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}((\+|\-)[0-9]{2}:[0-9]{2})?$",  # pylint: disable=C0301
    time_regex: str = r"^[0-9]{2}:[0-9]{2}:[0-9]{2}$",
) -> str:
    """Generate casting statements for duckdb relations from type annotations.

    The annotation is unwrapped recursively:

    * ``Union``/``Optional`` and ``Annotated`` are reduced to their underlying
      type and processed again.
    * ``List[...]`` becomes a ``list_transform`` over the element cast.
    * ``dict`` subclasses (other than ``dict``), dataclasses and pydantic
      models become a ``struct_pack`` of their per-field casts.
    * ``datetime``, ``date`` and ``time`` become a regex-guarded ``TRY_CAST``
      that yields ``NULL`` when the trimmed value does not match the pattern.
    * Any other type with a DuckDB equivalent becomes ``trim(<name>)``.

    Args:
        element_name: Column name or dotted path of the element to cast.
        type_annotation: Python type annotation describing the target type.
        parent_element: When true, list/struct/scalar expressions are wrapped
            in a ``try_cast`` to the full DuckDB type; nested calls pass
            ``False`` and return the bare expression.
        date_regex: Pattern a value must match before it is cast to ``DATE``.
        timestamp_regex: Pattern a value must match before it is cast to
            ``TIMESTAMP``.
        time_regex: Pattern a value must match before it is cast to ``TIME``.

    Returns:
        A SQL expression (without an ``AS`` alias) that performs the cast.

    Raises:
        ValueError: If the annotation is not a concrete type, is a bare
            ``list`` or a plain ``dict``, is a struct-like type without
            annotations, or has no equivalent DuckDB type.
    """
    type_origin = get_origin(type_annotation)

    quoted_name = _ddb_safely_quote_name(element_name)

    # An `Optional` or `Union` type, check to ensure non-heterogenity.
    if type_origin is Union:
        python_type = _get_non_heterogenous_type(get_args(type_annotation))
        # All regexes (including `time_regex`) are forwarded so that caller
        # overrides still apply to wrapped types.
        return get_duckdb_cast_statement_from_annotation(
            element_name, python_type, parent_element, date_regex, timestamp_regex, time_regex
        )

    # Type hint is e.g. `List[str]`, check to ensure non-heterogenity.
    if type_origin is list or (isinstance(type_origin, type) and issubclass(type_origin, list)):
        element_type = _get_non_heterogenous_type(get_args(type_annotation))
        # The lambda parameter is always named `x`; the element cast is built
        # as a nested (non-parent) expression.
        element_stmt = get_duckdb_cast_statement_from_annotation(
            "x", element_type, False, date_regex, timestamp_regex, time_regex
        )
        stmt = f"list_transform({quoted_name}, x -> {element_stmt})"
        return stmt if not parent_element else _cast_as_ddb_type(stmt, type_annotation)

    if type_origin is Annotated:
        # Only the underlying type is used; the extra metadata is ignored.
        python_type = get_args(type_annotation)[0]
        return get_duckdb_cast_statement_from_annotation(
            element_name, python_type, parent_element, date_regex, timestamp_regex, time_regex
        )  # add other expected params here
    # Ensure that we have a concrete type at this point.
    if not isinstance(type_annotation, type):
        raise ValueError(f"Unsupported type annotation {type_annotation!r}")

    if (
        # Type hint is a dict subclass, but not dict. Possibly a `TypedDict`.
        (issubclass(type_annotation, dict) and type_annotation is not dict)
        # Type hint is a dataclass.
        or is_dataclass(type_annotation)
        # Type hint is a `pydantic` model.
        or (type_origin is None and issubclass(type_annotation, BaseModel))
    ):
        fields: dict[str, str] = {}
        for field_name, field_annotation in get_type_hints(type_annotation).items():
            # Technically non-string keys are disallowed, but people are bad.
            if not isinstance(field_name, str):
                raise ValueError(
                    f"Dictionary/Dataclass keys must be strings, got {type_annotation!r}"
                )  # pragma: no cover
            if get_origin(field_annotation) is ClassVar:
                continue

            fields[field_name] = get_duckdb_cast_statement_from_annotation(
                f"{element_name}.{field_name}",
                field_annotation,
                False,
                date_regex,
                timestamp_regex,
                time_regex,
            )

        if not fields:
            raise ValueError(
                f"No type annotations in dict/dataclass type (got {type_annotation!r})"
            )
        cast_exprs = ",".join([f'"{nme}":= {stmt}' for nme, stmt in fields.items()])
        stmt = f"struct_pack({cast_exprs})"
        return stmt if not parent_element else _cast_as_ddb_type(stmt, type_annotation)

    if type_annotation is list:
        raise ValueError(
            f"List must have type annotation (e.g. `List[str]`), got {type_annotation!r}"
        )
    if type_annotation is dict or type_origin is dict:
        raise ValueError(f"dict must be `typing.TypedDict` subclass, got {type_annotation!r}")

    for type_ in type_annotation.mro():
        # datetime is subclass of date, so needs to be handled first
        if issubclass(type_, datetime):
            stmt = rf"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{timestamp_regex}') THEN TRY_CAST(TRIM({quoted_name}) as TIMESTAMP) ELSE NULL END"  # pylint: disable=C0301
            return stmt
        if issubclass(type_, date):
            stmt = rf"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{date_regex}') THEN TRY_CAST(TRIM({quoted_name}) as DATE) ELSE NULL END"  # pylint: disable=C0301
            return stmt
        if issubclass(type_, time):
            stmt = rf"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{time_regex}') THEN TRY_CAST(TRIM({quoted_name}) as TIME) ELSE NULL END"  # pylint: disable=C0301
            return stmt
        duck_type = get_duckdb_type_from_annotation(type_)
        if duck_type:
            stmt = f"trim({quoted_name})"
            return _cast_as_ddb_type(stmt, type_) if parent_element else stmt
    raise ValueError(f"No equivalent DuckDB type for {type_annotation!r}")
|
|
@@ -439,3 +439,103 @@ def spark_record_index(cls):
|
|
|
439
439
|
setattr(cls, "add_record_index", _add_spark_record_index)
|
|
440
440
|
setattr(cls, "drop_record_index", _drop_spark_record_index)
|
|
441
441
|
return cls
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def _cast_as_spark_type(field_expr: str, field_type: Any) -> Column:
    """Turn a SQL expression string into a column cast to the annotated type.

    Args:
        field_expr: SQL expression text, handed to `sf.expr`.
        field_type: Python type annotation; the cast target is whatever
            `get_type_from_annotation` resolves it to.

    Returns:
        The expression column with `.cast(...)` applied.
    """
    expression_column = sf.expr(field_expr)
    target_type = get_type_from_annotation(field_type)
    return expression_column.cast(target_type)
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
def _spark_safely_quote_name(field_name: str) -> str:
    """Backtick-quote the leading segment of a (possibly dotted) field name.

    Only the text before the first "." is wrapped in backticks; the first dot
    and everything after it are appended unchanged. A name without a dot is
    quoted as a whole.
    """
    # partition() yields ("name", "", "") when there is no dot, so a single
    # expression covers both the dotted and the plain case.
    leading, separator, remainder = field_name.partition(".")
    return f"`{leading}`{separator}{remainder}"
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
# pylint: disable=R0801
|
|
459
|
+
def get_spark_cast_statement_from_annotation(
    element_name: str,
    type_annotation: Any,
    parent_element: bool = True,
    date_regex: str = r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$",
    timestamp_regex: str = r"^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}((\\+|\\-)[0-9]{2}:[0-9]{2})?$",  # pylint: disable=C0301
):
    """Build the Spark cast for `element_name` from a Python type annotation.

    The annotation is unwrapped recursively: `Union`/`Optional` and
    `Annotated` defer to their underlying type, typed lists become a
    `transform(...)` over their elements, and TypedDict / dataclass /
    pydantic model annotations become a `struct(...)` of per-field
    expressions. Leaf values are trimmed; date and datetime leaves are
    additionally replaced by NULL unless the trimmed text matches the
    corresponding regex.

    Args:
        element_name: Column (or dotted nested field) to read from.
        type_annotation: Annotation describing the target type.
        parent_element: When true the generated SQL is wrapped via
            `_cast_as_spark_type`; nested calls pass false and get the raw
            SQL string back so that only the outermost level casts.
        date_regex: Pattern a trimmed value must match to be kept as a date.
        timestamp_regex: Pattern a trimmed value must match to be kept as a
            timestamp.

    Returns:
        The result of `_cast_as_spark_type` when `parent_element` is true,
        otherwise the SQL expression string.

    Raises:
        ValueError: For annotations that are not concrete types, bare
            `list`/`dict`, struct-like types without usable annotations, or
            types for which no Spark type is found.
    """
    type_origin = get_origin(type_annotation)
    quoted_name = _spark_safely_quote_name(element_name)

    def _finalise(expression: str, target_type: Any):
        """Cast at the top level only; nested levels return the SQL text."""
        if parent_element:
            return _cast_as_spark_type(expression, target_type)
        return expression

    # `Optional`/`Union`: collapse to the single underlying type and retry.
    if type_origin is Union:
        return get_spark_cast_statement_from_annotation(
            element_name,
            _get_non_heterogenous_type(get_args(type_annotation)),
            parent_element,
            date_regex,
            timestamp_regex,
        )

    # Parametrised list (e.g. `List[str]`): map the element cast over the array.
    origin_is_list = type_origin is list or (
        isinstance(type_origin, type) and issubclass(type_origin, list)
    )
    if origin_is_list:
        element_type = _get_non_heterogenous_type(get_args(type_annotation))
        element_expr = get_spark_cast_statement_from_annotation(
            "x", element_type, False, date_regex, timestamp_regex
        )
        return _finalise(f"transform({quoted_name}, x -> {element_expr})", type_annotation)

    # `Annotated[T, ...]`: the metadata is irrelevant here, only `T` matters.
    if type_origin is Annotated:
        wrapped_type, *_metadata = get_args(type_annotation)  # pylint: disable=unused-variable
        return get_spark_cast_statement_from_annotation(
            element_name, wrapped_type, parent_element, date_regex, timestamp_regex
        )  # add other expected params here

    # Everything below requires a concrete class.
    if not isinstance(type_annotation, type):
        raise ValueError(f"Unsupported type annotation {type_annotation!r}")

    is_struct_like = (
        # A dict subclass other than dict itself, possibly a `TypedDict`.
        (issubclass(type_annotation, dict) and type_annotation is not dict)
        # A dataclass.
        or is_dataclass(type_annotation)
        # A `pydantic` model.
        or (type_origin is None and issubclass(type_annotation, BaseModel))
    )
    if is_struct_like:
        field_exprs: dict[str, str] = {}
        for field_name, field_annotation in get_type_hints(type_annotation).items():
            # Technically non-string keys are disallowed, but guard anyway.
            if not isinstance(field_name, str):
                raise ValueError(
                    f"Dictionary/Dataclass keys must be strings, got {type_annotation!r}"
                )  # pragma: no cover
            # Class-level attributes are not part of the record.
            if get_origin(field_annotation) is ClassVar:
                continue

            field_exprs[field_name] = get_spark_cast_statement_from_annotation(
                f"{element_name}.{field_name}",
                field_annotation,
                False,
                date_regex,
                timestamp_regex,
            )

        if not field_exprs:
            raise ValueError(
                f"No type annotations in dict/dataclass type (got {type_annotation!r})"
            )
        aliased = ",".join(f"{expr} AS `{name}`" for name, expr in field_exprs.items())
        return _finalise(f"struct({aliased})", type_annotation)

    if type_annotation is list:
        raise ValueError(
            f"List must have type annotation (e.g. `List[str]`), got {type_annotation!r}"
        )
    if type_annotation is dict or type_origin is dict:
        raise ValueError(f"dict must be `typing.TypedDict` subclass, got {type_annotation!r}")

    # Walk the MRO so subclasses of supported types resolve via their bases.
    for candidate in type_annotation.mro():
        # datetime is a subclass of date, so it has to be tested first.
        if issubclass(candidate, dt.datetime):
            guard_regex = timestamp_regex
        elif issubclass(candidate, dt.date):
            guard_regex = date_regex
        else:
            if get_type_from_annotation(candidate):
                return _finalise(f"trim({quoted_name})", candidate)
            continue

        # Keep the trimmed text only when it matches the expected format.
        trimmed = f"TRIM({quoted_name})"
        guarded = f"CASE WHEN REGEXP({trimmed}, '{guard_regex}') THEN {trimmed} ELSE NULL END"
        return _finalise(guarded, candidate)

    raise ValueError(f"No equivalent Spark type for {type_annotation!r}")
|
|
@@ -519,7 +519,8 @@ class FormattedTime(dt.time):
|
|
|
519
519
|
raise ValueError("Provided time has timezone, but this is forbidden for this field")
|
|
520
520
|
if cls.TIMEZONE_TREATMENT == "require" and not new_time.tzinfo:
|
|
521
521
|
raise ValueError("Provided time missing timezone, but this is required for this field")
|
|
522
|
-
|
|
522
|
+
if isinstance(value, str) and cls.TIME_FORMAT and value != str(new_time):
|
|
523
|
+
raise ValueError("Provided time is not matching expected time format supplied.")
|
|
523
524
|
return new_time
|
|
524
525
|
|
|
525
526
|
@classmethod
|
|
@@ -42,13 +42,13 @@ class FoundryDDBPipeline(DDBDVEPipeline):
|
|
|
42
42
|
write_to.parent.mkdir(parents=True, exist_ok=True)
|
|
43
43
|
write_to = write_to.as_posix()
|
|
44
44
|
self.write_parquet( # type: ignore # pylint: disable=E1101
|
|
45
|
-
self._audit_tables._processing_status.get_relation().filter(
|
|
45
|
+
self._audit_tables._processing_status.get_relation().filter( # pylint: disable=W0212
|
|
46
46
|
f"submission_id = '{submission_info.submission_id}'"
|
|
47
47
|
),
|
|
48
48
|
fh.joinuri(write_to, "processing_status.parquet"),
|
|
49
49
|
)
|
|
50
50
|
self.write_parquet( # type: ignore # pylint: disable=E1101
|
|
51
|
-
self._audit_tables._submission_statistics.get_relation().filter(
|
|
51
|
+
self._audit_tables._submission_statistics.get_relation().filter( # pylint: disable=W0212
|
|
52
52
|
f"submission_id = '{submission_info.submission_id}'"
|
|
53
53
|
),
|
|
54
54
|
fh.joinuri(write_to, "submission_statistics.parquet"),
|
|
@@ -152,6 +152,9 @@ class FoundryDDBPipeline(DDBDVEPipeline):
|
|
|
152
152
|
)
|
|
153
153
|
if sub_stats:
|
|
154
154
|
self._audit_tables.add_submission_statistics_records(sub_stats=[sub_stats])
|
|
155
|
+
else:
|
|
156
|
+
self._audit_tables.mark_failed(submissions=[sub_id])
|
|
157
|
+
|
|
155
158
|
except Exception as err: # pylint: disable=W0718
|
|
156
159
|
self._logger.exception(
|
|
157
160
|
f"During processing of submission_id: {sub_id}, this exception was raised:"
|
|
@@ -527,7 +527,7 @@ class BaseDVEPipeline:
|
|
|
527
527
|
|
|
528
528
|
return processed_files, failed_processing
|
|
529
529
|
|
|
530
|
-
def apply_business_rules(
|
|
530
|
+
def apply_business_rules( # pylint: disable=R0914
|
|
531
531
|
self, submission_info: SubmissionInfo, submission_status: Optional[SubmissionStatus] = None
|
|
532
532
|
) -> tuple[SubmissionInfo, SubmissionStatus]:
|
|
533
533
|
"""Apply the business rules to a given submission, the submission may have failed at the
|
|
@@ -581,15 +581,23 @@ class BaseDVEPipeline:
|
|
|
581
581
|
|
|
582
582
|
key_fields = {model: conf.reporting_fields for model, conf in model_config.items()}
|
|
583
583
|
|
|
584
|
-
self.step_implementations.apply_rules(
|
|
584
|
+
_errors_uri, rules_success = self.step_implementations.apply_rules( # type: ignore
|
|
585
|
+
working_directory,
|
|
586
|
+
entity_manager,
|
|
587
|
+
rules,
|
|
588
|
+
key_fields
|
|
589
|
+
)
|
|
585
590
|
|
|
586
591
|
rule_messages = load_feedback_messages(
|
|
587
592
|
get_feedback_errors_uri(working_directory, "business_rules")
|
|
588
593
|
)
|
|
589
|
-
|
|
594
|
+
if (
|
|
590
595
|
any(not rule_message.is_informational for rule_message in rule_messages)
|
|
591
596
|
or submission_status.validation_failed
|
|
592
|
-
)
|
|
597
|
+
):
|
|
598
|
+
submission_status.validation_failed = True
|
|
599
|
+
elif not rules_success:
|
|
600
|
+
submission_status.processing_failed = True
|
|
593
601
|
|
|
594
602
|
for entity_name, entity in entity_manager.entities.items():
|
|
595
603
|
projected = self._step_implementations.write_parquet( # type: ignore
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/types.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/constants.py
RENAMED
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/exceptions.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/loggers.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/message.py
RENAMED
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/templating.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/type_hints.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/validation.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/metadata_parser/__init__.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/metadata_parser/exc.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/metadata_parser/models.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/metadata_parser/utilities.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/pipeline/duckdb_pipeline.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/pipeline/spark_pipeline.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/reporting/error_report.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/reporting/excel_report.py
RENAMED
|
File without changes
|