data-validation-engine 0.7.2__tar.gz → 0.7.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/PKG-INFO +17 -13
  2. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/README.md +15 -12
  3. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/pyproject.toml +9 -3
  4. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/contract.py +9 -15
  5. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +105 -0
  6. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/spark_helpers.py +100 -0
  7. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/metadata_parser/domain_types.py +2 -1
  8. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/pipeline/foundry_ddb_pipeline.py +5 -2
  9. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/pipeline/pipeline.py +12 -4
  10. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/LICENSE +0 -0
  11. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/__init__.py +0 -0
  12. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/common/__init__.py +0 -0
  13. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/common/error_utils.py +0 -0
  14. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/__init__.py +0 -0
  15. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/__init__.py +0 -0
  16. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/base/__init__.py +0 -0
  17. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/base/auditing.py +0 -0
  18. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/base/backend.py +0 -0
  19. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/base/contract.py +0 -0
  20. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/base/core.py +0 -0
  21. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/base/reader.py +0 -0
  22. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/base/reference_data.py +0 -0
  23. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/base/rules.py +0 -0
  24. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/base/utilities.py +0 -0
  25. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/exceptions.py +0 -0
  26. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/__init__.py +0 -0
  27. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/__init__.py +0 -0
  28. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/auditing.py +0 -0
  29. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/readers/__init__.py +0 -0
  30. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/readers/csv.py +0 -0
  31. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/readers/json.py +0 -0
  32. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/readers/xml.py +0 -0
  33. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/reference_data.py +0 -0
  34. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/rules.py +0 -0
  35. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/types.py +0 -0
  36. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/duckdb/utilities.py +0 -0
  37. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/__init__.py +0 -0
  38. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/auditing.py +0 -0
  39. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/backend.py +0 -0
  40. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/contract.py +0 -0
  41. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/readers/__init__.py +0 -0
  42. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/readers/csv.py +0 -0
  43. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/readers/json.py +0 -0
  44. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/readers/xml.py +0 -0
  45. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/reference_data.py +0 -0
  46. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/rules.py +0 -0
  47. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/types.py +0 -0
  48. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/implementations/spark/utilities.py +0 -0
  49. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/metadata/__init__.py +0 -0
  50. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/metadata/contract.py +0 -0
  51. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/metadata/reporting.py +0 -0
  52. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/metadata/rules.py +0 -0
  53. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/readers/__init__.py +0 -0
  54. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/readers/csv.py +0 -0
  55. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/readers/utilities.py +0 -0
  56. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/readers/xml.py +0 -0
  57. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/readers/xml_linting.py +0 -0
  58. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/types.py +0 -0
  59. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/backends/utilities.py +0 -0
  60. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/configuration/__init__.py +0 -0
  61. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/configuration/base.py +0 -0
  62. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/configuration/v1/__init__.py +0 -0
  63. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/configuration/v1/filters.py +0 -0
  64. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/configuration/v1/rule_stores/__init__.py +0 -0
  65. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/configuration/v1/rule_stores/models.py +0 -0
  66. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/configuration/v1/steps.py +0 -0
  67. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/constants.py +0 -0
  68. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/engine.py +0 -0
  69. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/exceptions.py +0 -0
  70. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/functions/__init__.py +0 -0
  71. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/functions/implementations.py +0 -0
  72. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/loggers.py +0 -0
  73. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/message.py +0 -0
  74. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/models.py +0 -0
  75. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/templating.py +0 -0
  76. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/type_hints.py +0 -0
  77. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/core_engine/validation.py +0 -0
  78. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/metadata_parser/__init__.py +0 -0
  79. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/metadata_parser/exc.py +0 -0
  80. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/metadata_parser/function_library.py +0 -0
  81. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/metadata_parser/function_wrapper.py +0 -0
  82. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/metadata_parser/model_generator.py +0 -0
  83. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/metadata_parser/models.py +0 -0
  84. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/metadata_parser/utilities.py +0 -0
  85. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/__init__.py +0 -0
  86. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/exceptions.py +0 -0
  87. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/file_handling/__init__.py +0 -0
  88. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/file_handling/helpers.py +0 -0
  89. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/file_handling/implementations/__init__.py +0 -0
  90. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/file_handling/implementations/base.py +0 -0
  91. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/file_handling/implementations/dbfs.py +0 -0
  92. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/file_handling/implementations/file.py +0 -0
  93. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/file_handling/implementations/s3.py +0 -0
  94. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/file_handling/log_handler.py +0 -0
  95. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/file_handling/service.py +0 -0
  96. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/file_handling/utilities.py +0 -0
  97. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/type_hints.py +0 -0
  98. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/parser/utilities.py +0 -0
  99. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/pipeline/__init__.py +0 -0
  100. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/pipeline/duckdb_pipeline.py +0 -0
  101. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/pipeline/spark_pipeline.py +0 -0
  102. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/pipeline/utils.py +0 -0
  103. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/reporting/__init__.py +0 -0
  104. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/reporting/error_report.py +0 -0
  105. {data_validation_engine-0.7.2 → data_validation_engine-0.7.4}/src/dve/reporting/excel_report.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-validation-engine
3
- Version: 0.7.2
3
+ Version: 0.7.4
4
4
  Summary: `nhs data validation engine` is a framework used to validate data
5
5
  License-Expression: MIT
6
6
  License-File: LICENSE
@@ -19,6 +19,7 @@ Requires-Dist: botocore (>=1.34.162,<1.36)
19
19
  Requires-Dist: delta-spark (==2.4.*)
20
20
  Requires-Dist: duckdb (==1.1.*)
21
21
  Requires-Dist: lxml (>=4.9.1,<5.0.0)
22
+ Requires-Dist: numpy (==1.26.4)
22
23
  Requires-Dist: openpyxl (>=3.1,<4.0)
23
24
  Requires-Dist: pandas (>=2.2.2,<3.0.0)
24
25
  Requires-Dist: polars (==0.20.*)
@@ -45,7 +46,7 @@ Description-Content-Type: text/markdown
45
46
 
46
47
  The Data Validation Engine (DVE) is a configuration driven data validation library built and utilised by NHS England. Currently the package has been reverted from v1.0.0 release to a 0.x as we feel the package is not yet mature enough to be considered a 1.0.0 release. So please bear this in mind if reading through the commits and references to a v1+ release when on v0.x.
47
48
 
48
- As mentioned above, the DVE is "configuration driven" which means the majority of development for you as a user will be building a JSON document to describe how the data will be validated. The JSON document is known as a `dischema` file and example files can be accessed [here](./tests/testdata/). If you'd like to learn more about JSON document and how to build one from scratch, then please read the documentation [here](./docs/).
49
+ As mentioned above, the DVE is "configuration driven" which means the majority of development for you as a user will be building a JSON document to describe how the data will be validated. The JSON document is known as a `dischema` file and example files can be accessed [here](https://github.com/NHSDigital/data-validation-engine/tree/main/tests/testdata). If you'd like to learn more about the JSON document and how to build one from scratch, then please read the documentation [here](https://nhsdigital.github.io/data-validation-engine/).
49
50
 
50
51
  Once a dischema file has been defined, you are ready to use the DVE. The DVE is typically orchestrated based on four key "services". These are...
51
52
 
@@ -56,7 +57,7 @@ Once a dischema file has been defined, you are ready to use the DVE. The DVE is
56
57
  | 3. | Business Rules | The business rules service will perform more complex validations such as comparisons between fields and tables, aggregations, filters etc to generate new entities. |
57
58
  | 4. | Error Reports | The error reports service will take all the errors raised in previous services and surface them in a readable format for downstream users/services. Currently, this is implemented as an Excel spreadsheet but could be reconfigured to meet other requirements/use cases. |
58
59
 
59
- If you'd like more detailed documentation around these services the please read the extended documentation [here](./docs/).
60
+ If you'd like more detailed documentation around these services then please read the extended documentation [here](https://nhsdigital.github.io/data-validation-engine/).
60
61
 
61
62
  The DVE has been designed in a way that's modular and can support users who just want to utilise specific "services" from the DVE (i.e. just the file transformation + data contract). Additionally, the DVE is designed to support different backend implementations. As part of the base installation of DVE, you will find backend support for `Spark` and `DuckDB`. So, if you need a `MySQL` backend implementation, you can implement this yourself. Given our organisation's requirements, it is unlikely that we will add any more specific backend implementations to the base package beyond Spark and DuckDB. So, if you are unable to implement this yourself, we would recommend reading the guidance on [requesting new features and raising bug reports here](#requesting-new-features-and-raising-bug-reports).
62
63
 
@@ -78,7 +79,7 @@ pip install data-validation-engine
78
79
 
79
80
  *Note - Only versions >=0.6.2 are available on PyPI. For older versions please install directly from the git repo or build from source.*
80
81
 
81
- Once you have installed the DVE you are ready to use it. For guidance on how to create your dischema JSON document (configuration), please read the [documentation](./docs/).
82
+ Once you have installed the DVE you are ready to use it. For guidance on how to create your dischema JSON document (configuration), please read the [documentation](https://nhsdigital.github.io/data-validation-engine/).
82
83
 
83
84
  Version 0.0.1 does support a working Python 3.7 installation. However, we will not be supporting any issues with that version of the DVE if you choose to use it. __Use at your own risk__.
84
85
 
@@ -91,17 +92,20 @@ If you have feature request then please follow the same process whilst using the
91
92
 
92
93
  ## Upcoming features
93
94
  Below is a list of features that we would like to implement or have been requested.
94
- | Feature | Release Version | Released? |
95
- | ------- | --------------- | --------- |
96
- | Open source release | 0.1.0 | Yes |
97
- | Uplift to Python 3.11 | 0.2.0 | Yes |
98
- | Upgrade to Pydantic 2.0 | Before 1.0 release | No |
99
- | Create a more user friendly interface for building and modifying dischema files | Not yet confirmed | No |
100
-
101
- Beyond the Python and Pydantic upgrade, we cannot confirm the other features will be made available anytime soon. Therefore, if you have the interest and desire to make these features available, then please read the [Contributing](#contributing) section and get involved.
95
+ | Feature | Release Version | Released? |
96
+ | ------------------------------------------------------------------------------- | ----------------- | --------- |
97
+ | Open source release | 0.1.0 | Yes |
98
+ | Uplift to Python 3.11 | 0.2.0 | Yes |
99
+ | Uplift Pyspark to 3.5 | TBA | No |
100
+ | Allow DVE to run on Python 3.12+ | TBA | No |
101
+ | Upgrade to Pydantic 2.0 | TBA | No |
102
+ | Uplift Pyspark to 4.0+ | TBA | No |
103
+ | Create a more user friendly interface for building and modifying dischema files | Not yet confirmed | No |
104
+
105
+ Beyond the Python and Pydantic upgrade, we cannot confirm the other features will be made available anytime soon. Therefore, if you have the interest and desire to make these features available, then please read the [Contributing](#contributing) section and get involved.
102
106
 
103
107
  ## Contributing
104
- Please see guidance [here](./CONTRIBUTE.md).
108
+ Please see guidance [here](https://github.com/NHSDigital/data-validation-engine/blob/main/CONTRIBUTE.md).
105
109
 
106
110
  ## Legal
107
111
  This codebase is released under the MIT License. This covers both the codebase and any sample code in the documentation.
@@ -10,7 +10,7 @@
10
10
 
11
11
  The Data Validation Engine (DVE) is a configuration driven data validation library built and utilised by NHS England. Currently the package has been reverted from v1.0.0 release to a 0.x as we feel the package is not yet mature enough to be considered a 1.0.0 release. So please bear this in mind if reading through the commits and references to a v1+ release when on v0.x.
12
12
 
13
- As mentioned above, the DVE is "configuration driven" which means the majority of development for you as a user will be building a JSON document to describe how the data will be validated. The JSON document is known as a `dischema` file and example files can be accessed [here](./tests/testdata/). If you'd like to learn more about JSON document and how to build one from scratch, then please read the documentation [here](./docs/).
13
+ As mentioned above, the DVE is "configuration driven" which means the majority of development for you as a user will be building a JSON document to describe how the data will be validated. The JSON document is known as a `dischema` file and example files can be accessed [here](https://github.com/NHSDigital/data-validation-engine/tree/main/tests/testdata). If you'd like to learn more about the JSON document and how to build one from scratch, then please read the documentation [here](https://nhsdigital.github.io/data-validation-engine/).
14
14
 
15
15
  Once a dischema file has been defined, you are ready to use the DVE. The DVE is typically orchestrated based on four key "services". These are...
16
16
 
@@ -21,7 +21,7 @@ Once a dischema file has been defined, you are ready to use the DVE. The DVE is
21
21
  | 3. | Business Rules | The business rules service will perform more complex validations such as comparisons between fields and tables, aggregations, filters etc to generate new entities. |
22
22
  | 4. | Error Reports | The error reports service will take all the errors raised in previous services and surface them in a readable format for downstream users/services. Currently, this is implemented as an Excel spreadsheet but could be reconfigured to meet other requirements/use cases. |
23
23
 
24
- If you'd like more detailed documentation around these services the please read the extended documentation [here](./docs/).
24
+ If you'd like more detailed documentation around these services then please read the extended documentation [here](https://nhsdigital.github.io/data-validation-engine/).
25
25
 
26
26
  The DVE has been designed in a way that's modular and can support users who just want to utilise specific "services" from the DVE (i.e. just the file transformation + data contract). Additionally, the DVE is designed to support different backend implementations. As part of the base installation of DVE, you will find backend support for `Spark` and `DuckDB`. So, if you need a `MySQL` backend implementation, you can implement this yourself. Given our organisation's requirements, it is unlikely that we will add any more specific backend implementations to the base package beyond Spark and DuckDB. So, if you are unable to implement this yourself, we would recommend reading the guidance on [requesting new features and raising bug reports here](#requesting-new-features-and-raising-bug-reports).
27
27
 
@@ -43,7 +43,7 @@ pip install data-validation-engine
43
43
 
44
44
  *Note - Only versions >=0.6.2 are available on PyPI. For older versions please install directly from the git repo or build from source.*
45
45
 
46
- Once you have installed the DVE you are ready to use it. For guidance on how to create your dischema JSON document (configuration), please read the [documentation](./docs/).
46
+ Once you have installed the DVE you are ready to use it. For guidance on how to create your dischema JSON document (configuration), please read the [documentation](https://nhsdigital.github.io/data-validation-engine/).
47
47
 
48
48
  Version 0.0.1 does support a working Python 3.7 installation. However, we will not be supporting any issues with that version of the DVE if you choose to use it. __Use at your own risk__.
49
49
 
@@ -56,17 +56,20 @@ If you have feature request then please follow the same process whilst using the
56
56
 
57
57
  ## Upcoming features
58
58
  Below is a list of features that we would like to implement or have been requested.
59
- | Feature | Release Version | Released? |
60
- | ------- | --------------- | --------- |
61
- | Open source release | 0.1.0 | Yes |
62
- | Uplift to Python 3.11 | 0.2.0 | Yes |
63
- | Upgrade to Pydantic 2.0 | Before 1.0 release | No |
64
- | Create a more user friendly interface for building and modifying dischema files | Not yet confirmed | No |
65
-
66
- Beyond the Python and Pydantic upgrade, we cannot confirm the other features will be made available anytime soon. Therefore, if you have the interest and desire to make these features available, then please read the [Contributing](#contributing) section and get involved.
59
+ | Feature | Release Version | Released? |
60
+ | ------------------------------------------------------------------------------- | ----------------- | --------- |
61
+ | Open source release | 0.1.0 | Yes |
62
+ | Uplift to Python 3.11 | 0.2.0 | Yes |
63
+ | Uplift Pyspark to 3.5 | TBA | No |
64
+ | Allow DVE to run on Python 3.12+ | TBA | No |
65
+ | Upgrade to Pydantic 2.0 | TBA | No |
66
+ | Uplift Pyspark to 4.0+ | TBA | No |
67
+ | Create a more user friendly interface for building and modifying dischema files | Not yet confirmed | No |
68
+
69
+ Beyond the Python and Pydantic upgrade, we cannot confirm the other features will be made available anytime soon. Therefore, if you have the interest and desire to make these features available, then please read the [Contributing](#contributing) section and get involved.
67
70
 
68
71
  ## Contributing
69
- Please see guidance [here](./CONTRIBUTE.md).
72
+ Please see guidance [here](https://github.com/NHSDigital/data-validation-engine/blob/main/CONTRIBUTE.md).
70
73
 
71
74
  ## Legal
72
75
  This codebase is released under the MIT License. This covers both the codebase and any sample code in the documentation.
@@ -24,7 +24,7 @@ Issues = "https://github.com/NHSDigital/data-validation-engine/issues"
24
24
  Changelog = "https://github.com/NHSDigital/data-validation-engine/blob/main/CHANGELOG.md"
25
25
 
26
26
  [tool.poetry]
27
- version = "0.7.2"
27
+ version = "0.7.4"
28
28
  packages = [
29
29
  { include = "dve", from = "src" },
30
30
  ]
@@ -37,6 +37,7 @@ delta-spark = "2.4.*"
37
37
  duckdb = "1.1.*" # breaking changes beyond 1.1
38
38
  Jinja2 = "3.1.*"
39
39
  lxml = "^4.9.1"
40
+ numpy = "1.26.4"
40
41
  openpyxl = "^3.1"
41
42
  pandas = "^2.2.2"
42
43
  polars = "0.20.*"
@@ -55,6 +56,9 @@ include-groups = [
55
56
  [tool.poetry.group.dev.dependencies]
56
57
  commitizen = "4.9.1"
57
58
  pre-commit = "4.3.0"
59
+ charset-normalizer = "3.4.6"
60
+ python-discovery = "1.2.0"
61
+ requests = "2.33.0"
58
62
 
59
63
  [tool.poetry.group.test]
60
64
  optional = true
@@ -97,8 +101,10 @@ optional = true
97
101
  [tool.poetry.group.docs.dependencies]
98
102
  click = "8.2.1"
99
103
  mkdocs = "^1.6.1"
100
- mkdocstrings = { version = "^1.0.3", extras = ["python"] }
101
- zensical = "~=0.0.23"
104
+ mkdocstrings = { version = "1.0.3", extras = ["python"] }
105
+ griffelib = "2.0.1"
106
+ pymdown-extensions = "10.21.2"
107
+ zensical = "0.0.31"
102
108
 
103
109
  [tool.ruff]
104
110
  line-length = 100
@@ -31,6 +31,7 @@ from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import (
31
31
  duckdb_read_parquet,
32
32
  duckdb_record_index,
33
33
  duckdb_write_parquet,
34
+ get_duckdb_cast_statement_from_annotation,
34
35
  get_duckdb_type_from_annotation,
35
36
  relation_is_empty,
36
37
  )
@@ -101,18 +102,7 @@ class DuckDBDataContract(BaseDataContract[DuckDBPyRelation]):
101
102
  _lazy_df = pl.LazyFrame(records, polars_schema) # type: ignore # pylint: disable=unused-variable
102
103
  return self._connection.sql("select * from _lazy_df")
103
104
 
104
- @staticmethod
105
- def generate_ddb_cast_statement(
106
- column_name: str, dtype: DuckDBPyType, null_flag: bool = False
107
- ) -> str:
108
- """Helper method to generate sql statements for casting datatypes (permissively).
109
- Current duckdb python API doesn't play well with this currently.
110
- """
111
- if not null_flag:
112
- return f'try_cast("{column_name}" AS {dtype}) AS "{column_name}"'
113
- return f'cast(NULL AS {dtype}) AS "{column_name}"'
114
-
115
- # pylint: disable=R0914
105
+ # pylint: disable=R0914,R0915
116
106
  def apply_data_contract(
117
107
  self,
118
108
  working_dir: URI,
@@ -180,12 +170,16 @@ class DuckDBDataContract(BaseDataContract[DuckDBPyRelation]):
180
170
 
181
171
  casting_statements = [
182
172
  (
183
- self.generate_ddb_cast_statement(column, dtype)
173
+ get_duckdb_cast_statement_from_annotation(column, mdl_fld.annotation)
174
+ + f""" AS "{column}" """
184
175
  if column in relation.columns
185
- else self.generate_ddb_cast_statement(column, dtype, null_flag=True)
176
+ else f"CAST(NULL AS {ddb_schema[column]}) AS {column}"
186
177
  )
187
- for column, dtype in ddb_schema.items()
178
+ for column, mdl_fld in entity_fields.items()
188
179
  ]
180
+ casting_statements.append(
181
+ f"CAST({RECORD_INDEX_COLUMN_NAME} AS {get_duckdb_type_from_annotation(int)}) AS {RECORD_INDEX_COLUMN_NAME}" # pylint: disable=C0301
182
+ )
189
183
  try:
190
184
  relation = relation.project(", ".join(casting_statements))
191
185
  except Exception as err: # pylint: disable=broad-except
@@ -313,3 +313,108 @@ def duckdb_record_index(cls):
313
313
  setattr(cls, "add_record_index", _add_duckdb_record_index)
314
314
  setattr(cls, "drop_record_index", _drop_duckdb_record_index)
315
315
  return cls
316
+
317
+
318
def _cast_as_ddb_type(field_expr: str, type_annotation: Any) -> str:
    """Wrap a SQL expression in a permissive DuckDB cast.

    The target type is looked up from `type_annotation` via
    `get_duckdb_type_from_annotation`; the expression is wrapped in
    `try_cast(...)` rather than `cast(...)`.
    """
    target_type = get_duckdb_type_from_annotation(type_annotation)
    return f"try_cast({field_expr} as {target_type})"
321
+
322
+
323
+ def _ddb_safely_quote_name(field_name: str) -> str:
324
+ """Quote field names in case reserved"""
325
+ try:
326
+ sep_idx = field_name.index(".")
327
+ return f'"{field_name[: sep_idx]}"' + field_name[sep_idx:]
328
+ except ValueError:
329
+ return f'"{field_name}"'
330
+
331
+
332
# pylint: disable=R0801,R0911,R0912
def get_duckdb_cast_statement_from_annotation(
    element_name: str,
    type_annotation: Any,
    parent_element: bool = True,
    date_regex: str = r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$",
    timestamp_regex: str = r"^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}((\+|\-)[0-9]{2}:[0-9]{2})?$",  # pylint: disable=C0301
    time_regex: str = r"^[0-9]{2}:[0-9]{2}:[0-9]{2}$",
) -> str:
    """Generate a casting statement for a duckdb relation from a type annotation.

    The annotation is unwrapped recursively:

    * `Union`/`Optional` and `Annotated` are reduced to their underlying type.
    * `List[...]` becomes a `list_transform` over the element cast.
    * `TypedDict`-like dict subclasses, dataclasses and pydantic models become a
      `struct_pack` of the per-field casts.
    * `datetime`, `date` and `time` become a `CASE` expression which only
      attempts the cast when the trimmed value matches the relevant regex,
      and yields NULL otherwise.
    * Any other type with a DuckDB equivalent becomes `trim(<name>)`.

    Args:
        element_name: Column (or dotted struct path) to cast. Only the leading
            component is quoted (see `_ddb_safely_quote_name`).
        type_annotation: The Python type annotation describing the column.
        parent_element: When True the list/struct/scalar expression is wrapped
            in `try_cast` to the full DuckDB type; nested calls pass False so
            that only the outermost expression is cast.
        date_regex: Pattern a value must match to be cast to DATE.
        timestamp_regex: Pattern a value must match to be cast to TIMESTAMP.
        time_regex: Pattern a value must match to be cast to TIME.

    Returns:
        The SQL expression, without a trailing `AS <alias>`.

    Raises:
        ValueError: If the annotation is not a concrete type, is a bare `list`
            or `dict`, is a struct-like type with no annotated fields, or has
            no DuckDB equivalent.
    """
    # Forwarded by keyword on every recursive call so that caller-supplied
    # patterns also reach optional and nested fields. Previously `time_regex`
    # was dropped on recursion, so e.g. `Optional[time]` always used the default.
    regexes = {
        "date_regex": date_regex,
        "timestamp_regex": timestamp_regex,
        "time_regex": time_regex,
    }
    type_origin = get_origin(type_annotation)

    quoted_name = _ddb_safely_quote_name(element_name)

    # An `Optional` or `Union` type, check to ensure non-heterogenity.
    if type_origin is Union:
        python_type = _get_non_heterogenous_type(get_args(type_annotation))
        return get_duckdb_cast_statement_from_annotation(
            element_name, python_type, parent_element, **regexes
        )

    # Type hint is e.g. `List[str]`, check to ensure non-heterogenity.
    if type_origin is list or (isinstance(type_origin, type) and issubclass(type_origin, list)):
        element_type = _get_non_heterogenous_type(get_args(type_annotation))
        # Elements are cast inside the lambda without their own try_cast; the
        # list as a whole is cast once below when this is the parent element.
        element_stmt = get_duckdb_cast_statement_from_annotation(
            "x", element_type, False, **regexes
        )
        stmt = f"list_transform({quoted_name}, x -> {element_stmt})"
        return stmt if not parent_element else _cast_as_ddb_type(stmt, type_annotation)

    if type_origin is Annotated:
        # Annotated metadata is currently ignored; only the underlying type
        # drives the generated cast.
        python_type, *_ = get_args(type_annotation)
        return get_duckdb_cast_statement_from_annotation(
            element_name, python_type, parent_element, **regexes
        )

    # Ensure that we have a concrete type at this point.
    if not isinstance(type_annotation, type):
        raise ValueError(f"Unsupported type annotation {type_annotation!r}")

    if (
        # Type hint is a dict subclass, but not dict. Possibly a `TypedDict`.
        (issubclass(type_annotation, dict) and type_annotation is not dict)
        # Type hint is a dataclass.
        or is_dataclass(type_annotation)
        # Type hint is a `pydantic` model.
        or (type_origin is None and issubclass(type_annotation, BaseModel))
    ):
        fields: dict[str, str] = {}
        for field_name, field_annotation in get_type_hints(type_annotation).items():
            # Technically non-string keys are disallowed, but people are bad.
            if not isinstance(field_name, str):
                raise ValueError(
                    f"Dictionary/Dataclass keys must be strings, got {type_annotation!r}"
                )  # pragma: no cover
            if get_origin(field_annotation) is ClassVar:
                continue

            fields[field_name] = get_duckdb_cast_statement_from_annotation(
                f"{element_name}.{field_name}", field_annotation, False, **regexes
            )

        if not fields:
            raise ValueError(
                f"No type annotations in dict/dataclass type (got {type_annotation!r})"
            )
        cast_exprs = ",".join([f'"{nme}":= {stmt}' for nme, stmt in fields.items()])
        stmt = f"struct_pack({cast_exprs})"
        return stmt if not parent_element else _cast_as_ddb_type(stmt, type_annotation)

    if type_annotation is list:
        raise ValueError(
            f"List must have type annotation (e.g. `List[str]`), got {type_annotation!r}"
        )
    if type_annotation is dict or type_origin is dict:
        raise ValueError(f"dict must be `typing.TypedDict` subclass, got {type_annotation!r}")

    # datetime is a subclass of date, so it needs to be checked first.
    temporal_casts = (
        (datetime, timestamp_regex, "TIMESTAMP"),
        (date, date_regex, "DATE"),
        (time, time_regex, "TIME"),
    )
    trimmed = f"TRIM({quoted_name})"
    for type_ in type_annotation.mro():
        for temporal_type, regex, ddb_type in temporal_casts:
            if issubclass(type_, temporal_type):
                # Only attempt the cast for values matching the expected format.
                return (
                    f"CASE WHEN REGEXP_MATCHES({trimmed}, '{regex}') "
                    f"THEN TRY_CAST({trimmed} as {ddb_type}) ELSE NULL END"
                )
        duck_type = get_duckdb_type_from_annotation(type_)
        if duck_type:
            stmt = f"trim({quoted_name})"
            return _cast_as_ddb_type(stmt, type_) if parent_element else stmt
    raise ValueError(f"No equivalent DuckDB type for {type_annotation!r}")
@@ -439,3 +439,103 @@ def spark_record_index(cls):
439
439
  setattr(cls, "add_record_index", _add_spark_record_index)
440
440
  setattr(cls, "drop_record_index", _drop_spark_record_index)
441
441
  return cls
442
+
443
+
444
def _cast_as_spark_type(field_expr: str, field_type: Any) -> Column:
    """Build a Spark column from a SQL expression and cast it.

    The expression string is parsed with `sf.expr`, and the resulting column
    is cast to the type that `get_type_from_annotation` gives for `field_type`.
    """
    expression = sf.expr(field_expr)
    target_type = get_type_from_annotation(field_type)
    return expression.cast(target_type)
447
+
448
+
449
+ def _spark_safely_quote_name(field_name: str) -> str:
450
+ """Quote field names in case reserved"""
451
+ try:
452
+ sep_idx = field_name.index(".")
453
+ return f"`{field_name[: sep_idx]}`" + field_name[sep_idx:]
454
+ except ValueError:
455
+ return f"`{field_name}`"
456
+
457
+
458
# pylint: disable=R0801
def get_spark_cast_statement_from_annotation(
    element_name: str,
    type_annotation: Any,
    parent_element: bool = True,
    date_regex: str = r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$",
    timestamp_regex: str = r"^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}((\\+|\\-)[0-9]{2}:[0-9]{2})?$",  # pylint: disable=C0301
):
    """Build the Spark casting expression for a column from its type annotation.

    `Union`/`Optional` and `Annotated` annotations are unwrapped to their
    underlying type, `List[...]` maps to a `transform` over the element
    expression, and dict subclasses / dataclasses / pydantic models map to a
    `struct` of per-field expressions. `datetime` and `date` values are passed
    through only when the trimmed value matches the corresponding regex (NULL
    otherwise); other supported types are trimmed.

    When `parent_element` is True the list/struct/scalar expression is cast via
    `_cast_as_spark_type`; nested calls pass False and get the raw SQL string.

    Raises `ValueError` for non-concrete annotations, bare `list`/`dict`,
    struct-like types without annotated fields, and types with no Spark
    equivalent.
    """
    origin = get_origin(type_annotation)

    column_ref = _spark_safely_quote_name(element_name)

    def _finish(expression: str, target_type: Any):
        """Cast at the top level only; nested expressions stay as SQL text."""
        if parent_element:
            return _cast_as_spark_type(expression, target_type)
        return expression

    # `Optional`/`Union`: collapse to the single underlying type and recurse.
    if origin is Union:
        underlying = _get_non_heterogenous_type(get_args(type_annotation))
        return get_spark_cast_statement_from_annotation(
            element_name, underlying, parent_element, date_regex, timestamp_regex
        )

    # Parametrised list, e.g. `List[str]`: transform each element.
    if origin is list or (isinstance(origin, type) and issubclass(origin, list)):
        item_type = _get_non_heterogenous_type(get_args(type_annotation))
        item_expr = get_spark_cast_statement_from_annotation(
            "x", item_type, False, date_regex, timestamp_regex
        )
        return _finish(f"transform({column_ref}, x -> {item_expr})", type_annotation)

    # `Annotated[T, ...]`: only the underlying type is used.
    if origin is Annotated:
        underlying, *_metadata = get_args(type_annotation)
        return get_spark_cast_statement_from_annotation(
            element_name, underlying, parent_element, date_regex, timestamp_regex
        )  # add other expected params here

    # Anything left must be a concrete class.
    if not isinstance(type_annotation, type):
        raise ValueError(f"Unsupported type annotation {type_annotation!r}")

    is_struct_like = (
        # A dict subclass other than dict itself, possibly a `TypedDict`.
        (issubclass(type_annotation, dict) and type_annotation is not dict)
        # A dataclass.
        or is_dataclass(type_annotation)
        # A `pydantic` model.
        or (origin is None and issubclass(type_annotation, BaseModel))
    )
    if is_struct_like:
        struct_members = []
        for member_name, member_annotation in get_type_hints(type_annotation).items():
            # Non-string keys should not occur, but are rejected explicitly.
            if not isinstance(member_name, str):
                raise ValueError(
                    f"Dictionary/Dataclass keys must be strings, got {type_annotation!r}"
                )  # pragma: no cover
            if get_origin(member_annotation) is not ClassVar:
                member_expr = get_spark_cast_statement_from_annotation(
                    f"{element_name}.{member_name}",
                    member_annotation,
                    False,
                    date_regex,
                    timestamp_regex,
                )
                struct_members.append(f"{member_expr} AS `{member_name}`")

        if not struct_members:
            raise ValueError(
                f"No type annotations in dict/dataclass type (got {type_annotation!r})"
            )
        return _finish(f"struct({','.join(struct_members)})", type_annotation)

    if type_annotation is list:
        raise ValueError(
            f"List must have type annotation (e.g. `List[str]`), got {type_annotation!r}"
        )
    if type_annotation is dict or origin is dict:
        raise ValueError(f"dict must be `typing.TypedDict` subclass, got {type_annotation!r}")

    for candidate in type_annotation.mro():
        # datetime subclasses date, so it has to be tested before date.
        if issubclass(candidate, dt.datetime):
            pattern = timestamp_regex
        elif issubclass(candidate, dt.date):
            pattern = date_regex
        else:
            pattern = None

        if pattern is not None:
            trimmed = f"TRIM({column_ref})"
            expression = f"CASE WHEN REGEXP({trimmed}, '{pattern}') THEN {trimmed} ELSE NULL END"
        elif get_type_from_annotation(candidate):
            expression = f"trim({column_ref})"
        else:
            continue
        return _finish(expression, candidate)
    raise ValueError(f"No equivalent Spark type for {type_annotation!r}")
@@ -519,7 +519,8 @@ class FormattedTime(dt.time):
519
519
  raise ValueError("Provided time has timezone, but this is forbidden for this field")
520
520
  if cls.TIMEZONE_TREATMENT == "require" and not new_time.tzinfo:
521
521
  raise ValueError("Provided time missing timezone, but this is required for this field")
522
-
522
+ if isinstance(value, str) and cls.TIME_FORMAT and value != str(new_time):
523
+ raise ValueError("Provided time is not matching expected time format supplied.")
523
524
  return new_time
524
525
 
525
526
  @classmethod
@@ -42,13 +42,13 @@ class FoundryDDBPipeline(DDBDVEPipeline):
42
42
  write_to.parent.mkdir(parents=True, exist_ok=True)
43
43
  write_to = write_to.as_posix()
44
44
  self.write_parquet( # type: ignore # pylint: disable=E1101
45
- self._audit_tables._processing_status.get_relation().filter( # pylint: disable=W0212
45
+ self._audit_tables._processing_status.get_relation().filter( # pylint: disable=W0212
46
46
  f"submission_id = '{submission_info.submission_id}'"
47
47
  ),
48
48
  fh.joinuri(write_to, "processing_status.parquet"),
49
49
  )
50
50
  self.write_parquet( # type: ignore # pylint: disable=E1101
51
- self._audit_tables._submission_statistics.get_relation().filter( # pylint: disable=W0212
51
+ self._audit_tables._submission_statistics.get_relation().filter( # pylint: disable=W0212
52
52
  f"submission_id = '{submission_info.submission_id}'"
53
53
  ),
54
54
  fh.joinuri(write_to, "submission_statistics.parquet"),
@@ -152,6 +152,9 @@ class FoundryDDBPipeline(DDBDVEPipeline):
152
152
  )
153
153
  if sub_stats:
154
154
  self._audit_tables.add_submission_statistics_records(sub_stats=[sub_stats])
155
+ else:
156
+ self._audit_tables.mark_failed(submissions=[sub_id])
157
+
155
158
  except Exception as err: # pylint: disable=W0718
156
159
  self._logger.exception(
157
160
  f"During processing of submission_id: {sub_id}, this exception was raised:"
@@ -527,7 +527,7 @@ class BaseDVEPipeline:
527
527
 
528
528
  return processed_files, failed_processing
529
529
 
530
- def apply_business_rules(
530
+ def apply_business_rules( # pylint: disable=R0914
531
531
  self, submission_info: SubmissionInfo, submission_status: Optional[SubmissionStatus] = None
532
532
  ) -> tuple[SubmissionInfo, SubmissionStatus]:
533
533
  """Apply the business rules to a given submission, the submission may have failed at the
@@ -581,15 +581,23 @@ class BaseDVEPipeline:
581
581
 
582
582
  key_fields = {model: conf.reporting_fields for model, conf in model_config.items()}
583
583
 
584
- self.step_implementations.apply_rules(working_directory, entity_manager, rules, key_fields) # type: ignore
584
+ _errors_uri, rules_success = self.step_implementations.apply_rules( # type: ignore
585
+ working_directory,
586
+ entity_manager,
587
+ rules,
588
+ key_fields
589
+ )
585
590
 
586
591
  rule_messages = load_feedback_messages(
587
592
  get_feedback_errors_uri(working_directory, "business_rules")
588
593
  )
589
- submission_status.validation_failed = (
594
+ if (
590
595
  any(not rule_message.is_informational for rule_message in rule_messages)
591
596
  or submission_status.validation_failed
592
- )
597
+ ):
598
+ submission_status.validation_failed = True
599
+ elif not rules_success:
600
+ submission_status.processing_failed = True
593
601
 
594
602
  for entity_name, entity in entity_manager.entities.items():
595
603
  projected = self._step_implementations.write_parquet( # type: ignore