data-validation-engine 0.7.1__tar.gz → 0.7.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/PKG-INFO +25 -15
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/README.md +15 -12
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/pyproject.toml +35 -7
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/duckdb/contract.py +9 -15
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +105 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/duckdb/readers/csv.py +4 -2
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/spark/spark_helpers.py +100 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/metadata_parser/domain_types.py +2 -1
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/pipeline/foundry_ddb_pipeline.py +6 -2
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/reporting/excel_report.py +4 -1
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/LICENSE +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/common/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/common/error_utils.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/base/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/base/auditing.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/base/backend.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/base/contract.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/base/core.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/base/reader.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/base/reference_data.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/base/rules.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/base/utilities.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/exceptions.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/duckdb/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/duckdb/auditing.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/duckdb/readers/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/duckdb/readers/json.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/duckdb/readers/xml.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/duckdb/reference_data.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/duckdb/rules.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/duckdb/types.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/duckdb/utilities.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/spark/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/spark/auditing.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/spark/backend.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/spark/contract.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/spark/readers/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/spark/readers/csv.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/spark/readers/json.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/spark/readers/xml.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/spark/reference_data.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/spark/rules.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/spark/types.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/implementations/spark/utilities.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/metadata/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/metadata/contract.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/metadata/reporting.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/metadata/rules.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/readers/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/readers/csv.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/readers/utilities.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/readers/xml.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/readers/xml_linting.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/types.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/utilities.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/configuration/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/configuration/base.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/configuration/v1/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/configuration/v1/filters.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/configuration/v1/rule_stores/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/configuration/v1/rule_stores/models.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/configuration/v1/steps.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/constants.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/engine.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/exceptions.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/functions/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/functions/implementations.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/loggers.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/message.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/models.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/templating.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/type_hints.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/validation.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/metadata_parser/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/metadata_parser/exc.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/metadata_parser/function_library.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/metadata_parser/function_wrapper.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/metadata_parser/model_generator.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/metadata_parser/models.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/metadata_parser/utilities.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/parser/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/parser/exceptions.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/parser/file_handling/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/parser/file_handling/helpers.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/parser/file_handling/implementations/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/parser/file_handling/implementations/base.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/parser/file_handling/implementations/dbfs.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/parser/file_handling/implementations/file.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/parser/file_handling/implementations/s3.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/parser/file_handling/log_handler.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/parser/file_handling/service.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/parser/file_handling/utilities.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/parser/type_hints.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/parser/utilities.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/pipeline/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/pipeline/duckdb_pipeline.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/pipeline/pipeline.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/pipeline/spark_pipeline.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/pipeline/utils.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/reporting/__init__.py +0 -0
- {data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/reporting/error_report.py +0 -0
|
@@ -1,15 +1,16 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: data-validation-engine
|
|
3
|
-
Version: 0.7.1
|
|
3
|
+
Version: 0.7.3
|
|
4
4
|
Summary: `nhs data validation engine` is a framework used to validate data
|
|
5
|
+
License-Expression: MIT
|
|
5
6
|
License-File: LICENSE
|
|
6
7
|
Author: NHS England
|
|
7
8
|
Author-email: england.contactus@nhs.net
|
|
8
9
|
Requires-Python: >=3.10,<3.12
|
|
9
|
-
Classifier: Operating System :: OS Independent
|
|
10
10
|
Classifier: Programming Language :: Python :: 3
|
|
11
11
|
Classifier: Programming Language :: Python :: 3.10
|
|
12
12
|
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
13
14
|
Classifier: Topic :: Software Development :: Libraries
|
|
14
15
|
Classifier: Typing :: Typed
|
|
15
16
|
Requires-Dist: Jinja2 (==3.1.*)
|
|
@@ -18,13 +19,19 @@ Requires-Dist: botocore (>=1.34.162,<1.36)
|
|
|
18
19
|
Requires-Dist: delta-spark (==2.4.*)
|
|
19
20
|
Requires-Dist: duckdb (==1.1.*)
|
|
20
21
|
Requires-Dist: lxml (>=4.9.1,<5.0.0)
|
|
22
|
+
Requires-Dist: numpy (==1.26.4)
|
|
21
23
|
Requires-Dist: openpyxl (>=3.1,<4.0)
|
|
22
24
|
Requires-Dist: pandas (>=2.2.2,<3.0.0)
|
|
23
25
|
Requires-Dist: polars (==0.20.*)
|
|
24
26
|
Requires-Dist: pyarrow (>=17.0.0,<18.0.0)
|
|
25
|
-
Requires-Dist: pydantic (==1.10.
|
|
27
|
+
Requires-Dist: pydantic (==1.10.16)
|
|
26
28
|
Requires-Dist: pyspark (==3.4.*)
|
|
27
29
|
Requires-Dist: typing_extensions (>=4.6.2,<5.0.0)
|
|
30
|
+
Project-URL: Changelog, https://github.com/NHSDigital/data-validation-engine/blob/main/CHANGELOG.md
|
|
31
|
+
Project-URL: Documentation, https://nhsdigital.github.io/data-validation-engine/
|
|
32
|
+
Project-URL: Homepage, https://github.com/NHSDigital/data-validation-engine
|
|
33
|
+
Project-URL: Issues, https://github.com/NHSDigital/data-validation-engine/issues
|
|
34
|
+
Project-URL: Repository, https://github.com/NHSDigital/data-validation-engine.git
|
|
28
35
|
Description-Content-Type: text/markdown
|
|
29
36
|
|
|
30
37
|
<h1 style="display: flex; align-items: center; gap: 10px;">
|
|
@@ -39,7 +46,7 @@ Description-Content-Type: text/markdown
|
|
|
39
46
|
|
|
40
47
|
The Data Validation Engine (DVE) is a configuration driven data validation library built and utilised by NHS England. Currently the package has been reverted from v1.0.0 release to a 0.x as we feel the package is not yet mature enough to be considered a 1.0.0 release. So please bear this in mind if reading through the commits and references to a v1+ release when on v0.x.
|
|
41
48
|
|
|
42
|
-
As mentioned above, the DVE is "configuration driven" which means the majority of development for you as a user will be building a JSON document to describe how the data will be validated. The JSON document is known as a `dischema` file and example files can be accessed [here](
|
|
49
|
+
As mentioned above, the DVE is "configuration driven" which means the majority of development for you as a user will be building a JSON document to describe how the data will be validated. The JSON document is known as a `dischema` file and example files can be accessed [here](https://github.com/NHSDigital/data-validation-engine/tree/main/tests/testdata). If you'd like to learn more about the JSON document and how to build one from scratch, then please read the documentation [here](https://nhsdigital.github.io/data-validation-engine/).
|
|
43
50
|
|
|
44
51
|
Once a dischema file has been defined, you are ready to use the DVE. The DVE is typically orchestrated based on four key "services". These are...
|
|
45
52
|
|
|
@@ -50,7 +57,7 @@ Once a dischema file has been defined, you are ready to use the DVE. The DVE is
|
|
|
50
57
|
| 3. | Business Rules | The business rules service will perform more complex validations such as comparisons between fields and tables, aggregations, filters etc to generate new entities. |
|
|
51
58
|
| 4. | Error Reports | The error reports service will take all the errors raised in previous services and surface them in a readable format for downstream users/services. Currently, this is implemented as an Excel spreadsheet but could be reconfigured to meet other requirements/use cases. |
|
|
52
59
|
|
|
53
|
-
If you'd like more detailed documentation around these services the please read the extended documentation [here](
|
|
60
|
+
If you'd like more detailed documentation around these services then please read the extended documentation [here](https://nhsdigital.github.io/data-validation-engine/).
|
|
54
61
|
|
|
55
62
|
The DVE has been designed in a way that's modular and can support users who just want to utilise specific "services" from the DVE (i.e. just the file transformation + data contract). Additionally, the DVE is designed to support different backend implementations. As part of the base installation of DVE, you will find backend support for `Spark` and `DuckDB`. So, if you need a `MySQL` backend implementation, you can implement this yourself. Given our organisation's requirements, it is unlikely that we will add any more specific backend implementations into the base package beyond Spark and DuckDB. So, if you are unable to implement this yourself, I would recommend reading the guidance on [requesting new features and raising bug reports here](#requesting-new-features-and-raising-bug-reports).
|
|
56
63
|
|
|
@@ -72,7 +79,7 @@ pip install data-validation-engine
|
|
|
72
79
|
|
|
73
80
|
*Note - Only versions >=0.6.2 are available on PyPi. For older versions please install directly from the git repo or build from source.*
|
|
74
81
|
|
|
75
|
-
Once you have installed the DVE you are ready to use it. For guidance on how to create your dischema JSON document (configuration), please read the [documentation](
|
|
82
|
+
Once you have installed the DVE you are ready to use it. For guidance on how to create your dischema JSON document (configuration), please read the [documentation](https://nhsdigital.github.io/data-validation-engine/).
|
|
76
83
|
|
|
77
84
|
Version 0.0.1 does support a working Python 3.7 installation. However, we will not be supporting any issues with that version of the DVE if you choose to use it. __Use at your own risk__.
|
|
78
85
|
|
|
@@ -85,17 +92,20 @@ If you have feature request then please follow the same process whilst using the
|
|
|
85
92
|
|
|
86
93
|
## Upcoming features
|
|
87
94
|
Below is a list of features that we would like to implement or have been requested.
|
|
88
|
-
| Feature
|
|
89
|
-
|
|
|
90
|
-
| Open source release
|
|
91
|
-
| Uplift to Python 3.11
|
|
92
|
-
|
|
|
93
|
-
|
|
|
94
|
-
|
|
95
|
-
|
|
95
|
+
| Feature | Release Version | Released? |
|
|
96
|
+
| ------------------------------------------------------------------------------- | ----------------- | --------- |
|
|
97
|
+
| Open source release | 0.1.0 | Yes |
|
|
98
|
+
| Uplift to Python 3.11 | 0.2.0 | Yes |
|
|
99
|
+
| Uplift Pyspark to 3.5 | TBA | No |
|
|
100
|
+
| Allow DVE to run on Python 3.12+ | TBA | No |
|
|
101
|
+
| Upgrade to Pydantic 2.0 | TBA | No |
|
|
102
|
+
| Uplift Pyspark to 4.0+ | TBA | No |
|
|
103
|
+
| Create a more user friendly interface for building and modifying dischema files | Not yet confirmed | No |
|
|
104
|
+
|
|
105
|
+
Beyond the Python and Pydantic upgrade, we cannot confirm the other features will be made available anytime soon. Therefore, if you have the interest and desire to make these features available, then please read the [Contributing](#Contributing) section and get involved.
|
|
96
106
|
|
|
97
107
|
## Contributing
|
|
98
|
-
Please see guidance [here](
|
|
108
|
+
Please see guidance [here](https://github.com/NHSDigital/data-validation-engine/blob/main/CONTRIBUTE.md).
|
|
99
109
|
|
|
100
110
|
## Legal
|
|
101
111
|
This codebase is released under the MIT License. This covers both the codebase and any sample code in the documentation.
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
|
|
11
11
|
The Data Validation Engine (DVE) is a configuration driven data validation library built and utilised by NHS England. Currently the package has been reverted from v1.0.0 release to a 0.x as we feel the package is not yet mature enough to be considered a 1.0.0 release. So please bear this in mind if reading through the commits and references to a v1+ release when on v0.x.
|
|
12
12
|
|
|
13
|
-
As mentioned above, the DVE is "configuration driven" which means the majority of development for you as a user will be building a JSON document to describe how the data will be validated. The JSON document is known as a `dischema` file and example files can be accessed [here](
|
|
13
|
+
As mentioned above, the DVE is "configuration driven" which means the majority of development for you as a user will be building a JSON document to describe how the data will be validated. The JSON document is known as a `dischema` file and example files can be accessed [here](https://github.com/NHSDigital/data-validation-engine/tree/main/tests/testdata). If you'd like to learn more about the JSON document and how to build one from scratch, then please read the documentation [here](https://nhsdigital.github.io/data-validation-engine/).
|
|
14
14
|
|
|
15
15
|
Once a dischema file has been defined, you are ready to use the DVE. The DVE is typically orchestrated based on four key "services". These are...
|
|
16
16
|
|
|
@@ -21,7 +21,7 @@ Once a dischema file has been defined, you are ready to use the DVE. The DVE is
|
|
|
21
21
|
| 3. | Business Rules | The business rules service will perform more complex validations such as comparisons between fields and tables, aggregations, filters etc to generate new entities. |
|
|
22
22
|
| 4. | Error Reports | The error reports service will take all the errors raised in previous services and surface them in a readable format for downstream users/services. Currently, this is implemented as an Excel spreadsheet but could be reconfigured to meet other requirements/use cases. |
|
|
23
23
|
|
|
24
|
-
If you'd like more detailed documentation around these services the please read the extended documentation [here](
|
|
24
|
+
If you'd like more detailed documentation around these services then please read the extended documentation [here](https://nhsdigital.github.io/data-validation-engine/).
|
|
25
25
|
|
|
26
26
|
The DVE has been designed in a way that's modular and can support users who just want to utilise specific "services" from the DVE (i.e. just the file transformation + data contract). Additionally, the DVE is designed to support different backend implementations. As part of the base installation of DVE, you will find backend support for `Spark` and `DuckDB`. So, if you need a `MySQL` backend implementation, you can implement this yourself. Given our organisation's requirements, it is unlikely that we will add any more specific backend implementations into the base package beyond Spark and DuckDB. So, if you are unable to implement this yourself, I would recommend reading the guidance on [requesting new features and raising bug reports here](#requesting-new-features-and-raising-bug-reports).
|
|
27
27
|
|
|
@@ -43,7 +43,7 @@ pip install data-validation-engine
|
|
|
43
43
|
|
|
44
44
|
*Note - Only versions >=0.6.2 are available on PyPi. For older versions please install directly from the git repo or build from source.*
|
|
45
45
|
|
|
46
|
-
Once you have installed the DVE you are ready to use it. For guidance on how to create your dischema JSON document (configuration), please read the [documentation](
|
|
46
|
+
Once you have installed the DVE you are ready to use it. For guidance on how to create your dischema JSON document (configuration), please read the [documentation](https://nhsdigital.github.io/data-validation-engine/).
|
|
47
47
|
|
|
48
48
|
Version 0.0.1 does support a working Python 3.7 installation. However, we will not be supporting any issues with that version of the DVE if you choose to use it. __Use at your own risk__.
|
|
49
49
|
|
|
@@ -56,17 +56,20 @@ If you have feature request then please follow the same process whilst using the
|
|
|
56
56
|
|
|
57
57
|
## Upcoming features
|
|
58
58
|
Below is a list of features that we would like to implement or have been requested.
|
|
59
|
-
| Feature
|
|
60
|
-
|
|
|
61
|
-
| Open source release
|
|
62
|
-
| Uplift to Python 3.11
|
|
63
|
-
|
|
|
64
|
-
|
|
|
65
|
-
|
|
66
|
-
|
|
59
|
+
| Feature | Release Version | Released? |
|
|
60
|
+
| ------------------------------------------------------------------------------- | ----------------- | --------- |
|
|
61
|
+
| Open source release | 0.1.0 | Yes |
|
|
62
|
+
| Uplift to Python 3.11 | 0.2.0 | Yes |
|
|
63
|
+
| Uplift Pyspark to 3.5 | TBA | No |
|
|
64
|
+
| Allow DVE to run on Python 3.12+ | TBA | No |
|
|
65
|
+
| Upgrade to Pydantic 2.0 | TBA | No |
|
|
66
|
+
| Uplift Pyspark to 4.0+ | TBA | No |
|
|
67
|
+
| Create a more user friendly interface for building and modifying dischema files | Not yet confirmed | No |
|
|
68
|
+
|
|
69
|
+
Beyond the Python and Pydantic upgrade, we cannot confirm the other features will be made available anytime soon. Therefore, if you have the interest and desire to make these features available, then please read the [Contributing](#Contributing) section and get involved.
|
|
67
70
|
|
|
68
71
|
## Contributing
|
|
69
|
-
Please see guidance [here](
|
|
72
|
+
Please see guidance [here](https://github.com/NHSDigital/data-validation-engine/blob/main/CONTRIBUTE.md).
|
|
70
73
|
|
|
71
74
|
## Legal
|
|
72
75
|
This codebase is released under the MIT License. This covers both the codebase and any sample code in the documentation.
|
|
@@ -1,12 +1,11 @@
|
|
|
1
|
-
[
|
|
1
|
+
[project]
|
|
2
2
|
name = "data-validation-engine"
|
|
3
|
-
|
|
3
|
+
dynamic = [ "version" ]
|
|
4
4
|
description = "`nhs data validation engine` is a framework used to validate data"
|
|
5
|
-
authors = [
|
|
6
|
-
|
|
7
|
-
packages = [
|
|
8
|
-
{ include = "dve", from = "src" },
|
|
5
|
+
authors = [
|
|
6
|
+
{ name = "NHS England", email = "england.contactus@nhs.net" }
|
|
9
7
|
]
|
|
8
|
+
readme = "README.md"
|
|
10
9
|
classifiers = [
|
|
11
10
|
"Programming Language :: Python :: 3",
|
|
12
11
|
"Programming Language :: Python :: 3.10",
|
|
@@ -15,6 +14,20 @@ classifiers = [
|
|
|
15
14
|
"Topic :: Software Development :: Libraries",
|
|
16
15
|
"Typing :: Typed",
|
|
17
16
|
]
|
|
17
|
+
license = "MIT"
|
|
18
|
+
|
|
19
|
+
[project.urls]
|
|
20
|
+
Homepage = "https://github.com/NHSDigital/data-validation-engine"
|
|
21
|
+
Documentation = "https://nhsdigital.github.io/data-validation-engine/"
|
|
22
|
+
Repository = "https://github.com/NHSDigital/data-validation-engine.git"
|
|
23
|
+
Issues = "https://github.com/NHSDigital/data-validation-engine/issues"
|
|
24
|
+
Changelog = "https://github.com/NHSDigital/data-validation-engine/blob/main/CHANGELOG.md"
|
|
25
|
+
|
|
26
|
+
[tool.poetry]
|
|
27
|
+
version = "0.7.3"
|
|
28
|
+
packages = [
|
|
29
|
+
{ include = "dve", from = "src" },
|
|
30
|
+
]
|
|
18
31
|
|
|
19
32
|
[tool.poetry.dependencies]
|
|
20
33
|
python = ">=3.10,<3.12"
|
|
@@ -24,11 +37,12 @@ delta-spark = "2.4.*"
|
|
|
24
37
|
duckdb = "1.1.*" # breaking changes beyond 1.1
|
|
25
38
|
Jinja2 = "3.1.*"
|
|
26
39
|
lxml = "^4.9.1"
|
|
40
|
+
numpy = "1.26.4"
|
|
27
41
|
openpyxl = "^3.1"
|
|
28
42
|
pandas = "^2.2.2"
|
|
29
43
|
polars = "0.20.*"
|
|
30
44
|
pyarrow = "^17.0.0"
|
|
31
|
-
pydantic = "1.10.
|
|
45
|
+
pydantic = "1.10.16"
|
|
32
46
|
pyspark = "3.4.*"
|
|
33
47
|
typing_extensions = "^4.6.2"
|
|
34
48
|
|
|
@@ -42,6 +56,9 @@ include-groups = [
|
|
|
42
56
|
[tool.poetry.group.dev.dependencies]
|
|
43
57
|
commitizen = "4.9.1"
|
|
44
58
|
pre-commit = "4.3.0"
|
|
59
|
+
charset-normalizer = "3.4.6"
|
|
60
|
+
python-discovery = "1.2.0"
|
|
61
|
+
requests = "2.33.0"
|
|
45
62
|
|
|
46
63
|
[tool.poetry.group.test]
|
|
47
64
|
optional = true
|
|
@@ -78,6 +95,17 @@ types-setuptools = "68.2.0.0"
|
|
|
78
95
|
types-urllib3 = "1.26.25.14"
|
|
79
96
|
types-xmltodict = "0.13.0.3"
|
|
80
97
|
|
|
98
|
+
[tool.poetry.group.docs]
|
|
99
|
+
optional = true
|
|
100
|
+
|
|
101
|
+
[tool.poetry.group.docs.dependencies]
|
|
102
|
+
click = "8.2.1"
|
|
103
|
+
mkdocs = "^1.6.1"
|
|
104
|
+
mkdocstrings = { version = "1.0.3", extras = ["python"] }
|
|
105
|
+
griffelib = "2.0.1"
|
|
106
|
+
pymdown-extensions = "10.21.2"
|
|
107
|
+
zensical = "0.0.31"
|
|
108
|
+
|
|
81
109
|
[tool.ruff]
|
|
82
110
|
line-length = 100
|
|
83
111
|
|
|
@@ -31,6 +31,7 @@ from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import (
|
|
|
31
31
|
duckdb_read_parquet,
|
|
32
32
|
duckdb_record_index,
|
|
33
33
|
duckdb_write_parquet,
|
|
34
|
+
get_duckdb_cast_statement_from_annotation,
|
|
34
35
|
get_duckdb_type_from_annotation,
|
|
35
36
|
relation_is_empty,
|
|
36
37
|
)
|
|
@@ -101,18 +102,7 @@ class DuckDBDataContract(BaseDataContract[DuckDBPyRelation]):
|
|
|
101
102
|
_lazy_df = pl.LazyFrame(records, polars_schema) # type: ignore # pylint: disable=unused-variable
|
|
102
103
|
return self._connection.sql("select * from _lazy_df")
|
|
103
104
|
|
|
104
|
-
|
|
105
|
-
def generate_ddb_cast_statement(
|
|
106
|
-
column_name: str, dtype: DuckDBPyType, null_flag: bool = False
|
|
107
|
-
) -> str:
|
|
108
|
-
"""Helper method to generate sql statements for casting datatypes (permissively).
|
|
109
|
-
Current duckdb python API doesn't play well with this currently.
|
|
110
|
-
"""
|
|
111
|
-
if not null_flag:
|
|
112
|
-
return f'try_cast("{column_name}" AS {dtype}) AS "{column_name}"'
|
|
113
|
-
return f'cast(NULL AS {dtype}) AS "{column_name}"'
|
|
114
|
-
|
|
115
|
-
# pylint: disable=R0914
|
|
105
|
+
# pylint: disable=R0914,R0915
|
|
116
106
|
def apply_data_contract(
|
|
117
107
|
self,
|
|
118
108
|
working_dir: URI,
|
|
@@ -180,12 +170,16 @@ class DuckDBDataContract(BaseDataContract[DuckDBPyRelation]):
|
|
|
180
170
|
|
|
181
171
|
casting_statements = [
|
|
182
172
|
(
|
|
183
|
-
|
|
173
|
+
get_duckdb_cast_statement_from_annotation(column, mdl_fld.annotation)
|
|
174
|
+
+ f""" AS "{column}" """
|
|
184
175
|
if column in relation.columns
|
|
185
|
-
else
|
|
176
|
+
else f"CAST(NULL AS {ddb_schema[column]}) AS {column}"
|
|
186
177
|
)
|
|
187
|
-
for column,
|
|
178
|
+
for column, mdl_fld in entity_fields.items()
|
|
188
179
|
]
|
|
180
|
+
casting_statements.append(
|
|
181
|
+
f"CAST({RECORD_INDEX_COLUMN_NAME} AS {get_duckdb_type_from_annotation(int)}) AS {RECORD_INDEX_COLUMN_NAME}" # pylint: disable=C0301
|
|
182
|
+
)
|
|
189
183
|
try:
|
|
190
184
|
relation = relation.project(", ".join(casting_statements))
|
|
191
185
|
except Exception as err: # pylint: disable=broad-except
|
|
@@ -313,3 +313,108 @@ def duckdb_record_index(cls):
|
|
|
313
313
|
setattr(cls, "add_record_index", _add_duckdb_record_index)
|
|
314
314
|
setattr(cls, "drop_record_index", _drop_duckdb_record_index)
|
|
315
315
|
return cls
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def _cast_as_ddb_type(field_expr: str, type_annotation: Any) -> str:
|
|
319
|
+
"""Cast to Duck DB type"""
|
|
320
|
+
return f"""try_cast({field_expr} as {get_duckdb_type_from_annotation(type_annotation)})"""
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def _ddb_safely_quote_name(field_name: str) -> str:
|
|
324
|
+
"""Quote field names in case reserved"""
|
|
325
|
+
try:
|
|
326
|
+
sep_idx = field_name.index(".")
|
|
327
|
+
return f'"{field_name[: sep_idx]}"' + field_name[sep_idx:]
|
|
328
|
+
except ValueError:
|
|
329
|
+
return f'"{field_name}"'
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
# pylint: disable=R0801,R0911,R0912
|
|
333
|
+
def get_duckdb_cast_statement_from_annotation(
    element_name: str,
    type_annotation: Any,
    parent_element: bool = True,
    date_regex: str = r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$",
    timestamp_regex: str = r"^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}((\+|\-)[0-9]{2}:[0-9]{2})?$",  # pylint: disable=C0301
    time_regex: str = r"^[0-9]{2}:[0-9]{2}:[0-9]{2}$",
) -> str:
    """Generate casting statements for duckdb relations from type annotations.

    The annotation is walked recursively and a SQL expression string is built:

    * `Union`/`Optional` and `Annotated` are unwrapped to their underlying type.
    * `List[...]` becomes a `list_transform` over the element expression.
    * `TypedDict`-like dict subclasses, dataclasses and pydantic models become
      a `struct_pack` of their per-field expressions.
    * `datetime`, `date` and `time` become a regex-guarded `TRY_CAST` that
      yields NULL when the trimmed value does not match the pattern.
    * Any other type with a DuckDB equivalent becomes `trim(<name>)`.

    Args:
        element_name: Column name or dotted field path to cast.
        type_annotation: Python type annotation describing the target type.
        parent_element: When True, list, struct and scalar expressions are
            wrapped in a `try_cast` to the annotation's DuckDB type. Nested
            calls pass False so only the outermost expression is wrapped.
        date_regex: Pattern a trimmed value must match to be cast to DATE.
        timestamp_regex: Pattern a trimmed value must match to be cast to
            TIMESTAMP.
        time_regex: Pattern a trimmed value must match to be cast to TIME.

    Returns:
        The SQL expression as a string (without a trailing alias).

    Raises:
        ValueError: If the annotation is not a concrete type, is a bare `list`
            or `dict`, is a struct-like type without annotations, or has no
            DuckDB equivalent.
    """
    type_origin = get_origin(type_annotation)

    quoted_name = _ddb_safely_quote_name(element_name)

    # An `Optional` or `Union` type: collapse it to its single underlying type.
    if type_origin is Union:
        python_type = _get_non_heterogenous_type(get_args(type_annotation))
        # All three patterns are forwarded so caller overrides survive recursion.
        return get_duckdb_cast_statement_from_annotation(
            element_name, python_type, parent_element, date_regex, timestamp_regex, time_regex
        )

    # Type hint is e.g. `List[str]`: cast each element through a lambda.
    if type_origin is list or (isinstance(type_origin, type) and issubclass(type_origin, list)):
        element_type = _get_non_heterogenous_type(get_args(type_annotation))
        element_stmt = get_duckdb_cast_statement_from_annotation(
            "x", element_type, False, date_regex, timestamp_regex, time_regex
        )
        stmt = f"list_transform({quoted_name}, x -> {element_stmt})"
        return stmt if not parent_element else _cast_as_ddb_type(stmt, type_annotation)

    if type_origin is Annotated:
        # Only the underlying type is used; the extra metadata is ignored here.
        python_type, *_ = get_args(type_annotation)
        return get_duckdb_cast_statement_from_annotation(
            element_name, python_type, parent_element, date_regex, timestamp_regex, time_regex
        )  # add other expected params here
    # Ensure that we have a concrete type at this point.
    if not isinstance(type_annotation, type):
        raise ValueError(f"Unsupported type annotation {type_annotation!r}")

    if (
        # Type hint is a dict subclass, but not dict. Possibly a `TypedDict`.
        (issubclass(type_annotation, dict) and type_annotation is not dict)
        # Type hint is a dataclass.
        or is_dataclass(type_annotation)
        # Type hint is a `pydantic` model.
        or (type_origin is None and issubclass(type_annotation, BaseModel))
    ):
        fields: dict[str, str] = {}
        for field_name, field_annotation in get_type_hints(type_annotation).items():
            # Technically non-string keys are disallowed, but people are bad.
            if not isinstance(field_name, str):
                raise ValueError(
                    f"Dictionary/Dataclass keys must be strings, got {type_annotation!r}"
                )  # pragma: no cover
            if get_origin(field_annotation) is ClassVar:
                continue

            fields[field_name] = get_duckdb_cast_statement_from_annotation(
                f"{element_name}.{field_name}",
                field_annotation,
                False,
                date_regex,
                timestamp_regex,
                time_regex,
            )

        if not fields:
            raise ValueError(
                f"No type annotations in dict/dataclass type (got {type_annotation!r})"
            )
        cast_exprs = ",".join([f'"{nme}":= {stmt}' for nme, stmt in fields.items()])
        stmt = f"struct_pack({cast_exprs})"
        return stmt if not parent_element else _cast_as_ddb_type(stmt, type_annotation)

    if type_annotation is list:
        raise ValueError(
            f"List must have type annotation (e.g. `List[str]`), got {type_annotation!r}"
        )
    if type_annotation is dict or type_origin is dict:
        raise ValueError(f"dict must be `typing.TypedDict` subclass, got {type_annotation!r}")

    for type_ in type_annotation.mro():
        # datetime is subclass of date, so needs to be handled first
        if issubclass(type_, datetime):
            stmt = rf"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{timestamp_regex}') THEN TRY_CAST(TRIM({quoted_name}) as TIMESTAMP) ELSE NULL END"  # pylint: disable=C0301
            return stmt
        if issubclass(type_, date):
            stmt = rf"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{date_regex}') THEN TRY_CAST(TRIM({quoted_name}) as DATE) ELSE NULL END"  # pylint: disable=C0301
            return stmt
        if issubclass(type_, time):
            stmt = rf"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{time_regex}') THEN TRY_CAST(TRIM({quoted_name}) as TIME) ELSE NULL END"  # pylint: disable=C0301
            return stmt
        duck_type = get_duckdb_type_from_annotation(type_)
        if duck_type:
            stmt = f"trim({quoted_name})"
            return _cast_as_ddb_type(stmt, type_) if parent_element else stmt
    raise ValueError(f"No equivalent DuckDB type for {type_annotation!r}")
|
|
@@ -82,11 +82,12 @@ class DuckDBCSVReader(BaseFileReader):
|
|
|
82
82
|
messages=[
|
|
83
83
|
FeedbackMessage(
|
|
84
84
|
entity=entity_name,
|
|
85
|
-
record=
|
|
85
|
+
record={"missing_fields": missing},
|
|
86
86
|
failure_type="submission",
|
|
87
87
|
error_location="Whole File",
|
|
88
|
+
reporting_field="missing_fields",
|
|
88
89
|
error_code=self.field_check_error_code,
|
|
89
|
-
error_message=f"{self.field_check_error_message}
|
|
90
|
+
error_message=f"{self.field_check_error_message}", # pylint: disable=line-too-long
|
|
90
91
|
)
|
|
91
92
|
],
|
|
92
93
|
)
|
|
@@ -202,6 +203,7 @@ class DuckDBCSVRepeatingHeaderReader(PolarsToDuckDBCSVReader):
|
|
|
202
203
|
`NonDistinctHeaderError`.
|
|
203
204
|
|
|
204
205
|
So using the example above, the expected entity would look like this...
|
|
206
|
+
|
|
205
207
|
| headerCol1 | headerCol2 | headerCol3 |
|
|
206
208
|
| ---------- | ---------- | ---------- |
|
|
207
209
|
| shop1 | clothes | 2025-01-01 |
|
|
@@ -439,3 +439,103 @@ def spark_record_index(cls):
|
|
|
439
439
|
setattr(cls, "add_record_index", _add_spark_record_index)
|
|
440
440
|
setattr(cls, "drop_record_index", _drop_spark_record_index)
|
|
441
441
|
return cls
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def _cast_as_spark_type(field_expr: str, field_type: Any) -> Column:
    """Turn a SQL expression string into a Column cast to a Spark type.

    The target type is whatever `get_type_from_annotation` returns for
    `field_type`.
    """
    expression = sf.expr(field_expr)
    target_type = get_type_from_annotation(field_type)
    return expression.cast(target_type)
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
def _spark_safely_quote_name(field_name: str) -> str:
|
|
450
|
+
"""Quote field names in case reserved"""
|
|
451
|
+
try:
|
|
452
|
+
sep_idx = field_name.index(".")
|
|
453
|
+
return f"`{field_name[: sep_idx]}`" + field_name[sep_idx:]
|
|
454
|
+
except ValueError:
|
|
455
|
+
return f"`{field_name}`"
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
# pylint: disable=R0801
|
|
459
|
+
def get_spark_cast_statement_from_annotation(
    element_name: str,
    type_annotation: Any,
    parent_element: bool = True,
    date_regex: str = r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$",
    timestamp_regex: str = r"^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}((\\+|\\-)[0-9]{2}:[0-9]{2})?$",  # pylint: disable=C0301
):
    """Build a Spark casting expression for a field from its type annotation.

    The annotation is walked recursively. `Union`/`Optional` and `Annotated`
    are unwrapped, `List[...]` becomes a `transform` over the element
    expression, and dict subclasses, dataclasses and pydantic models become a
    `struct` of their per-field expressions. `datetime` and `date` values are
    only kept when the trimmed text matches the relevant regex; every other
    supported type is simply trimmed.

    When `parent_element` is True the finished expression is passed through
    `_cast_as_spark_type`; nested calls pass False and get the raw SQL string
    back instead.

    Raises:
        ValueError: If the annotation is not a concrete type, is a bare `list`
            or `dict`, is a struct-like type without annotations, or has no
            Spark equivalent.
    """
    origin = get_origin(type_annotation)

    quoted_name = _spark_safely_quote_name(element_name)

    # `Optional`/`Union`: recurse on the single underlying type.
    if origin is Union:
        inner_type = _get_non_heterogenous_type(get_args(type_annotation))
        return get_spark_cast_statement_from_annotation(
            element_name, inner_type, parent_element, date_regex, timestamp_regex
        )

    # Parametrised list such as `List[str]`: cast each element via a lambda.
    origin_is_list_type = isinstance(origin, type) and issubclass(origin, list)
    if origin is list or origin_is_list_type:
        item_type = _get_non_heterogenous_type(get_args(type_annotation))
        item_expression = get_spark_cast_statement_from_annotation(
            "x", item_type, False, date_regex, timestamp_regex
        )
        list_expression = f"transform({quoted_name}, x -> {item_expression})"
        if parent_element:
            return _cast_as_spark_type(list_expression, type_annotation)
        return list_expression

    # `Annotated[T, ...]`: only the underlying type T is used.
    if origin is Annotated:
        inner_type = get_args(type_annotation)[0]
        return get_spark_cast_statement_from_annotation(
            element_name, inner_type, parent_element, date_regex, timestamp_regex
        )  # add other expected params here

    # Everything below requires a concrete class.
    if not isinstance(type_annotation, type):
        raise ValueError(f"Unsupported type annotation {type_annotation!r}")

    is_struct_like = (
        # A dict subclass other than dict itself, possibly a `TypedDict`.
        (type_annotation is not dict and issubclass(type_annotation, dict))
        # A dataclass.
        or is_dataclass(type_annotation)
        # A `pydantic` model.
        or (origin is None and issubclass(type_annotation, BaseModel))
    )
    if is_struct_like:
        member_expressions = []
        for member_name, member_annotation in get_type_hints(type_annotation).items():
            # Technically non-string keys are disallowed, but people are bad.
            if not isinstance(member_name, str):
                raise ValueError(
                    f"Dictionary/Dataclass keys must be strings, got {type_annotation!r}"
                )  # pragma: no cover
            if get_origin(member_annotation) is ClassVar:
                continue

            member_expression = get_spark_cast_statement_from_annotation(
                f"{element_name}.{member_name}",
                member_annotation,
                False,
                date_regex,
                timestamp_regex,
            )
            member_expressions.append(f"{member_expression} AS `{member_name}`")

        if not member_expressions:
            raise ValueError(
                f"No type annotations in dict/dataclass type (got {type_annotation!r})"
            )
        struct_expression = f"struct({','.join(member_expressions)})"
        if parent_element:
            return _cast_as_spark_type(struct_expression, type_annotation)
        return struct_expression

    if type_annotation is list:
        raise ValueError(
            f"List must have type annotation (e.g. `List[str]`), got {type_annotation!r}"
        )
    if type_annotation is dict or origin is dict:
        raise ValueError(f"dict must be `typing.TypedDict` subclass, got {type_annotation!r}")

    for candidate in type_annotation.mro():
        if issubclass(candidate, dt.date):
            # datetime is a subclass of date, so it selects the timestamp pattern.
            pattern = timestamp_regex if issubclass(candidate, dt.datetime) else date_regex
            expression = rf"CASE WHEN REGEXP(TRIM({quoted_name}), '{pattern}') THEN TRIM({quoted_name}) ELSE NULL END"  # pylint: disable=C0301
        elif get_type_from_annotation(candidate):
            expression = f"trim({quoted_name})"
        else:
            # No Spark equivalent for this class; try the next one in the MRO.
            continue
        if parent_element:
            return _cast_as_spark_type(expression, candidate)
        return expression
    raise ValueError(f"No equivalent Spark type for {type_annotation!r}")
|
|
@@ -519,7 +519,8 @@ class FormattedTime(dt.time):
|
|
|
519
519
|
raise ValueError("Provided time has timezone, but this is forbidden for this field")
|
|
520
520
|
if cls.TIMEZONE_TREATMENT == "require" and not new_time.tzinfo:
|
|
521
521
|
raise ValueError("Provided time missing timezone, but this is required for this field")
|
|
522
|
-
|
|
522
|
+
if isinstance(value, str) and cls.TIME_FORMAT and value != str(new_time):
|
|
523
|
+
raise ValueError("Provided time is not matching expected time format supplied.")
|
|
523
524
|
return new_time
|
|
524
525
|
|
|
525
526
|
@classmethod
|
|
@@ -42,11 +42,15 @@ class FoundryDDBPipeline(DDBDVEPipeline):
|
|
|
42
42
|
write_to.parent.mkdir(parents=True, exist_ok=True)
|
|
43
43
|
write_to = write_to.as_posix()
|
|
44
44
|
self.write_parquet( # type: ignore # pylint: disable=E1101
|
|
45
|
-
self._audit_tables._processing_status.get_relation()
|
|
45
|
+
self._audit_tables._processing_status.get_relation().filter( # pylint: disable=W0212
|
|
46
|
+
f"submission_id = '{submission_info.submission_id}'"
|
|
47
|
+
),
|
|
46
48
|
fh.joinuri(write_to, "processing_status.parquet"),
|
|
47
49
|
)
|
|
48
50
|
self.write_parquet( # type: ignore # pylint: disable=E1101
|
|
49
|
-
self._audit_tables._submission_statistics.get_relation()
|
|
51
|
+
self._audit_tables._submission_statistics.get_relation().filter( # pylint: disable=W0212
|
|
52
|
+
f"submission_id = '{submission_info.submission_id}'"
|
|
53
|
+
),
|
|
50
54
|
fh.joinuri(write_to, "submission_statistics.parquet"),
|
|
51
55
|
)
|
|
52
56
|
return write_to
|
{data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/reporting/excel_report.py
RENAMED
|
@@ -135,8 +135,11 @@ class SummaryItems:
|
|
|
135
135
|
if "Status" not in self.summary_dict:
|
|
136
136
|
summary.append(["", "Status", status])
|
|
137
137
|
|
|
138
|
+
_key_renames = {
|
|
139
|
+
"File Size": "File Size (Bytes)",
|
|
140
|
+
}
|
|
138
141
|
for key, value in self.summary_dict.items():
|
|
139
|
-
summary.append(["", key, str(value)])
|
|
142
|
+
summary.append(["", _key_renames.get(key, key), str(value)])
|
|
140
143
|
|
|
141
144
|
summary.append(["", ""])
|
|
142
145
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/backends/types.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/constants.py
RENAMED
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/exceptions.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/loggers.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/message.py
RENAMED
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/templating.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/type_hints.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/core_engine/validation.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/metadata_parser/__init__.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/metadata_parser/exc.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/metadata_parser/models.py
RENAMED
|
File without changes
|
{data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/metadata_parser/utilities.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/pipeline/duckdb_pipeline.py
RENAMED
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/pipeline/spark_pipeline.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_validation_engine-0.7.1 → data_validation_engine-0.7.3}/src/dve/reporting/error_report.py
RENAMED
|
File without changes
|