data-validation-engine 0.6.2__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/PKG-INFO +9 -9
  2. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/README.md +8 -8
  3. data_validation_engine-0.7.0/pyproject.toml +272 -0
  4. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/base/backend.py +1 -4
  5. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/base/contract.py +11 -3
  6. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/base/reader.py +8 -0
  7. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/base/rules.py +6 -8
  8. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/duckdb/contract.py +8 -0
  9. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +28 -1
  10. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/duckdb/readers/csv.py +42 -7
  11. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/duckdb/readers/json.py +5 -1
  12. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/duckdb/readers/xml.py +10 -3
  13. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/duckdb/rules.py +2 -15
  14. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/spark/backend.py +2 -2
  15. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/spark/contract.py +9 -4
  16. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/spark/readers/csv.py +13 -1
  17. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/spark/readers/json.py +3 -1
  18. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/spark/readers/xml.py +9 -4
  19. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/spark/rules.py +2 -13
  20. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/spark/spark_helpers.py +30 -1
  21. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/readers/csv.py +5 -1
  22. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/readers/xml.py +5 -1
  23. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/utilities.py +22 -0
  24. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/constants.py +2 -2
  25. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/engine.py +2 -2
  26. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/message.py +5 -10
  27. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/type_hints.py +3 -0
  28. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/metadata_parser/domain_types.py +6 -0
  29. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/metadata_parser/model_generator.py +1 -1
  30. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/metadata_parser/models.py +1 -0
  31. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/pipeline/pipeline.py +7 -3
  32. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/reporting/error_report.py +1 -0
  33. data_validation_engine-0.6.2/pyproject.toml +0 -685
  34. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/LICENSE +0 -0
  35. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/__init__.py +0 -0
  36. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/common/__init__.py +0 -0
  37. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/common/error_utils.py +0 -0
  38. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/__init__.py +0 -0
  39. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/__init__.py +0 -0
  40. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/base/__init__.py +0 -0
  41. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/base/auditing.py +0 -0
  42. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/base/core.py +0 -0
  43. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/base/reference_data.py +0 -0
  44. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/base/utilities.py +0 -0
  45. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/exceptions.py +0 -0
  46. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/__init__.py +0 -0
  47. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/duckdb/__init__.py +0 -0
  48. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/duckdb/auditing.py +0 -0
  49. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/duckdb/readers/__init__.py +0 -0
  50. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/duckdb/reference_data.py +0 -0
  51. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/duckdb/types.py +0 -0
  52. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/duckdb/utilities.py +0 -0
  53. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/spark/__init__.py +0 -0
  54. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/spark/auditing.py +0 -0
  55. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/spark/readers/__init__.py +0 -0
  56. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/spark/reference_data.py +0 -0
  57. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/spark/types.py +0 -0
  58. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/implementations/spark/utilities.py +0 -0
  59. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/metadata/__init__.py +0 -0
  60. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/metadata/contract.py +0 -0
  61. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/metadata/reporting.py +0 -0
  62. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/metadata/rules.py +0 -0
  63. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/readers/__init__.py +0 -0
  64. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/readers/utilities.py +0 -0
  65. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/readers/xml_linting.py +0 -0
  66. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/backends/types.py +0 -0
  67. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/configuration/__init__.py +0 -0
  68. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/configuration/base.py +0 -0
  69. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/configuration/v1/__init__.py +0 -0
  70. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/configuration/v1/filters.py +0 -0
  71. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/configuration/v1/rule_stores/__init__.py +0 -0
  72. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/configuration/v1/rule_stores/models.py +0 -0
  73. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/configuration/v1/steps.py +0 -0
  74. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/exceptions.py +0 -0
  75. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/functions/__init__.py +0 -0
  76. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/functions/implementations.py +0 -0
  77. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/loggers.py +0 -0
  78. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/models.py +0 -0
  79. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/templating.py +0 -0
  80. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/core_engine/validation.py +0 -0
  81. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/metadata_parser/__init__.py +0 -0
  82. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/metadata_parser/exc.py +0 -0
  83. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/metadata_parser/function_library.py +0 -0
  84. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/metadata_parser/function_wrapper.py +0 -0
  85. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/metadata_parser/utilities.py +0 -0
  86. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/parser/__init__.py +0 -0
  87. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/parser/exceptions.py +0 -0
  88. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/parser/file_handling/__init__.py +0 -0
  89. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/parser/file_handling/helpers.py +0 -0
  90. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/parser/file_handling/implementations/__init__.py +0 -0
  91. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/parser/file_handling/implementations/base.py +0 -0
  92. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/parser/file_handling/implementations/dbfs.py +0 -0
  93. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/parser/file_handling/implementations/file.py +0 -0
  94. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/parser/file_handling/implementations/s3.py +0 -0
  95. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/parser/file_handling/log_handler.py +0 -0
  96. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/parser/file_handling/service.py +0 -0
  97. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/parser/file_handling/utilities.py +0 -0
  98. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/parser/type_hints.py +0 -0
  99. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/parser/utilities.py +0 -0
  100. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/pipeline/__init__.py +0 -0
  101. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/pipeline/duckdb_pipeline.py +0 -0
  102. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/pipeline/foundry_ddb_pipeline.py +0 -0
  103. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/pipeline/spark_pipeline.py +0 -0
  104. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/pipeline/utils.py +0 -0
  105. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/reporting/__init__.py +0 -0
  106. {data_validation_engine-0.6.2 → data_validation_engine-0.7.0}/src/dve/reporting/excel_report.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-validation-engine
3
- Version: 0.6.2
3
+ Version: 0.7.0
4
4
  Summary: `nhs data validation engine` is a framework used to validate data
5
5
  License-File: LICENSE
6
6
  Author: NHS England
@@ -28,7 +28,7 @@ Requires-Dist: typing_extensions (>=4.6.2,<5.0.0)
28
28
  Description-Content-Type: text/markdown
29
29
 
30
30
  <h1 style="display: flex; align-items: center; gap: 10px;">
31
- <img src="overrides/.icons/nhseng.svg" alt="NHS Logo" width="5%" height="100%" align="left">
31
+ <img src="https://github.com/NHSDigital/data-validation-engine/blob/616b55890306db4546177f7effac48ca241857ec/overrides/.icons/nhseng.svg" alt="" width="5%" height="100%" align="left">
32
32
  Data Validation Engine
33
33
  </h1>
34
34
 
@@ -54,11 +54,11 @@ If you'd like more detailed documentation around these services the please read
54
54
 
55
55
  The DVE has been designed in a way that's modular and can support users who just want to utilise specific "services" from the DVE (i.e. just the file transformation + data contract). Additionally, the DVE is designed to support different backend implementations. As part of the base installation of DVE, you will find backend support for `Spark` and `DuckDB`. So, if you need a `MySQL` backend implementation, you can implement this yourself. Given our organisation's requirements, it is unlikely that we will add any more specific backend implementations into the base package beyond Spark and DuckDB. So, if you are unable to implement this yourself, I would recommend reading the guidance on [requesting new features and raising bug reports here](#requesting-new-features-and-raising-bug-reports).
56
56
 
57
- Additionally, if you'd like to contribute a new backend implementation into the base DVE package, then please look at the [Contributing][#Contributing] section.
57
+ Additionally, if you'd like to contribute a new backend implementation into the base DVE package, then please look at the [Contributing](#Contributing) section.
58
58
 
59
59
  ## Installation and usage
60
60
 
61
- The DVE is a Python package and can be installed using `pip`. As of release v0.6.1 we currently support Python 3.10 & 3.11, with Spark version 3.4 and DuckDB version of 1.1. In the future we will be looking to upgrade the DVE to working on a higher versions of Python, DuckDB and Spark.
61
+ The DVE is a Python package and can be installed using package managers such as [pip](https://pypi.org/project/pip/). As of the latest release we support Python 3.10 & 3.11, with Spark v3.4 and DuckDB v1.1. In the future we will be looking to upgrade the DVE to work on higher versions of Python, DuckDB and Spark.
62
62
 
63
63
  If you're planning to use the Spark backend implementation, you will also need OpenJDK 11 installed.
64
64
 
@@ -67,12 +67,12 @@ Python dependencies are listed in `pyproject.toml`.
67
67
  To install the DVE package you can simply install using a package manager such as [pip](https://pypi.org/project/pip/).
68
68
 
69
69
  ```
70
- pip install git+https://github.com/NHSDigital/data-validation-engine.git@v0.6.1
70
+ pip install data-validation-engine
71
71
  ```
72
72
 
73
- Once you have installed the DVE you are ready to use it. For guidance on how to create your dischema JSON document (configuration), please read the [documentation](./docs/).
73
+ *Note - Only versions >=0.6.2 are available on PyPI. For older versions please install directly from the git repo or build from source.*
74
74
 
75
- Please note - The long term aim is to make the DVE available via PyPi and Conda but we are not quite there yet. Once available this documentation will be updated to contain the new installation options.
75
+ Once you have installed the DVE you are ready to use it. For guidance on how to create your dischema JSON document (configuration), please read the [documentation](./docs/).
76
76
 
77
77
  Version 0.0.1 does support a working Python 3.7 installation. However, we will not be supporting any issues with that version of the DVE if you choose to use it. __Use at your own risk__.
78
78
 
@@ -89,10 +89,10 @@ Below is a list of features that we would like to implement or have been request
89
89
  | ------- | --------------- | --------- |
90
90
  | Open source release | 0.1.0 | Yes |
91
91
  | Uplift to Python 3.11 | 0.2.0 | Yes |
92
- | Upgrade to Pydantic 2.0 | Not yet confirmed | No |
92
+ | Upgrade to Pydantic 2.0 | Before 1.0 release | No |
93
93
  | Create a more user friendly interface for building and modifying dischema files | Not yet confirmed | No |
94
94
 
95
- Beyond the Python upgrade, we cannot confirm the other features will be made available anytime soon. Therefore, if you have the interest and desire to make these features available, then please read the [Contributing](#contributing) section and get involved.
95
+ Beyond the Python and Pydantic upgrades, we cannot confirm the other features will be made available anytime soon. Therefore, if you have the interest and desire to make these features available, then please read the [Contributing](#contributing) section and get involved.
96
96
 
97
97
  ## Contributing
98
98
  Please see guidance [here](./CONTRIBUTE.md).
@@ -1,5 +1,5 @@
1
1
  <h1 style="display: flex; align-items: center; gap: 10px;">
2
- <img src="overrides/.icons/nhseng.svg" alt="NHS Logo" width="5%" height="100%" align="left">
2
+ <img src="https://github.com/NHSDigital/data-validation-engine/blob/616b55890306db4546177f7effac48ca241857ec/overrides/.icons/nhseng.svg" alt="" width="5%" height="100%" align="left">
3
3
  Data Validation Engine
4
4
  </h1>
5
5
 
@@ -25,11 +25,11 @@ If you'd like more detailed documentation around these services the please read
25
25
 
26
26
  The DVE has been designed in a way that's modular and can support users who just want to utilise specific "services" from the DVE (i.e. just the file transformation + data contract). Additionally, the DVE is designed to support different backend implementations. As part of the base installation of DVE, you will find backend support for `Spark` and `DuckDB`. So, if you need a `MySQL` backend implementation, you can implement this yourself. Given our organisation's requirements, it is unlikely that we will add any more specific backend implementations into the base package beyond Spark and DuckDB. So, if you are unable to implement this yourself, I would recommend reading the guidance on [requesting new features and raising bug reports here](#requesting-new-features-and-raising-bug-reports).
27
27
 
28
- Additionally, if you'd like to contribute a new backend implementation into the base DVE package, then please look at the [Contributing][#Contributing] section.
28
+ Additionally, if you'd like to contribute a new backend implementation into the base DVE package, then please look at the [Contributing](#Contributing) section.
29
29
 
30
30
  ## Installation and usage
31
31
 
32
- The DVE is a Python package and can be installed using `pip`. As of release v0.6.1 we currently support Python 3.10 & 3.11, with Spark version 3.4 and DuckDB version of 1.1. In the future we will be looking to upgrade the DVE to working on a higher versions of Python, DuckDB and Spark.
32
+ The DVE is a Python package and can be installed using package managers such as [pip](https://pypi.org/project/pip/). As of the latest release we support Python 3.10 & 3.11, with Spark v3.4 and DuckDB v1.1. In the future we will be looking to upgrade the DVE to work on higher versions of Python, DuckDB and Spark.
33
33
 
34
34
  If you're planning to use the Spark backend implementation, you will also need OpenJDK 11 installed.
35
35
 
@@ -38,12 +38,12 @@ Python dependencies are listed in `pyproject.toml`.
38
38
  To install the DVE package you can simply install using a package manager such as [pip](https://pypi.org/project/pip/).
39
39
 
40
40
  ```
41
- pip install git+https://github.com/NHSDigital/data-validation-engine.git@v0.6.1
41
+ pip install data-validation-engine
42
42
  ```
43
43
 
44
- Once you have installed the DVE you are ready to use it. For guidance on how to create your dischema JSON document (configuration), please read the [documentation](./docs/).
44
+ *Note - Only versions >=0.6.2 are available on PyPI. For older versions please install directly from the git repo or build from source.*
45
45
 
46
- Please note - The long term aim is to make the DVE available via PyPi and Conda but we are not quite there yet. Once available this documentation will be updated to contain the new installation options.
46
+ Once you have installed the DVE you are ready to use it. For guidance on how to create your dischema JSON document (configuration), please read the [documentation](./docs/).
47
47
 
48
48
  Version 0.0.1 does support a working Python 3.7 installation. However, we will not be supporting any issues with that version of the DVE if you choose to use it. __Use at your own risk__.
49
49
 
@@ -60,10 +60,10 @@ Below is a list of features that we would like to implement or have been request
60
60
  | ------- | --------------- | --------- |
61
61
  | Open source release | 0.1.0 | Yes |
62
62
  | Uplift to Python 3.11 | 0.2.0 | Yes |
63
- | Upgrade to Pydantic 2.0 | Not yet confirmed | No |
63
+ | Upgrade to Pydantic 2.0 | Before 1.0 release | No |
64
64
  | Create a more user friendly interface for building and modifying dischema files | Not yet confirmed | No |
65
65
 
66
- Beyond the Python upgrade, we cannot confirm the other features will be made available anytime soon. Therefore, if you have the interest and desire to make these features available, then please read the [Contributing](#contributing) section and get involved.
66
+ Beyond the Python and Pydantic upgrades, we cannot confirm the other features will be made available anytime soon. Therefore, if you have the interest and desire to make these features available, then please read the [Contributing](#contributing) section and get involved.
67
67
 
68
68
  ## Contributing
69
69
  Please see guidance [here](./CONTRIBUTE.md).
@@ -0,0 +1,272 @@
1
+ [tool.poetry]
2
+ name = "data-validation-engine"
3
+ version = "0.7.0"
4
+ description = "`nhs data validation engine` is a framework used to validate data"
5
+ authors = ["NHS England <england.contactus@nhs.net>"]
6
+ readme = "README.md"
7
+ packages = [
8
+ { include = "dve", from = "src" },
9
+ ]
10
+ classifiers = [
11
+ "Programming Language :: Python :: 3",
12
+ "Programming Language :: Python :: 3.10",
13
+ "Programming Language :: Python :: 3.11",
14
+ "Operating System :: OS Independent",
15
+ "Topic :: Software Development :: Libraries",
16
+ "Typing :: Typed",
17
+ ]
18
+
19
+ [tool.poetry.dependencies]
20
+ python = ">=3.10,<3.12"
21
+ boto3 = ">=1.34.162,<1.36" # breaking change beyond 1.36
22
+ botocore = ">=1.34.162,<1.36" # breaking change beyond 1.36
23
+ delta-spark = "2.4.*"
24
+ duckdb = "1.1.*" # breaking changes beyond 1.1
25
+ Jinja2 = "3.1.*"
26
+ lxml = "^4.9.1"
27
+ openpyxl = "^3.1"
28
+ pandas = "^2.2.2"
29
+ polars = "0.20.*"
30
+ pyarrow = "^17.0.0"
31
+ pydantic = "1.10.15"
32
+ pyspark = "3.4.*"
33
+ typing_extensions = "^4.6.2"
34
+
35
+ [tool.poetry.group.dev]
36
+ optional = true
37
+ include-groups = [
38
+ "test",
39
+ "lint"
40
+ ]
41
+
42
+ [tool.poetry.group.dev.dependencies]
43
+ commitizen = "4.9.1"
44
+ pre-commit = "4.3.0"
45
+
46
+ [tool.poetry.group.test]
47
+ optional = true
48
+
49
+ [tool.poetry.group.test.dependencies]
50
+ faker = "18.11.1"
51
+ behave = "1.3.3"
52
+ coverage = "7.11.0"
53
+ moto = {extras = ["s3"], version = "4.0.13"}
54
+ Werkzeug = "3.1.6"
55
+ pytest = "8.4.2"
56
+ pytest-lazy-fixtures = "1.4.0" # switched from https://github.com/TvoroG/pytest-lazy-fixture as it's no longer supported
57
+ xlsx2csv = "0.8.2"
58
+
59
+ [tool.poetry.group.lint]
60
+ optional = true
61
+
62
+ [tool.poetry.group.lint.dependencies]
63
+ black = "24.3.0"
64
+ astroid = "2.14.2"
65
+ isort = "5.11.5"
66
+ pylint = "2.16.4"
67
+ mypy = "0.991"
68
+ boto3-stubs = {extras = ["essential"], version = "1.26.72"}
69
+ botocore-stubs = "1.29.72"
70
+ pandas-stubs = "1.2.0.62"
71
+ types-awscrt = "0.19.1"
72
+ types-openpyxl = "3.1.0.19"
73
+ types-pytz = "2023.3.0.1"
74
+ types-PyYAML = "6.0.12.1"
75
+ types-requests = "2.31.0.2"
76
+ types-s3transfer = "0.6.2"
77
+ types-setuptools = "68.2.0.0"
78
+ types-urllib3 = "1.26.25.14"
79
+ types-xmltodict = "0.13.0.3"
80
+
81
+ [tool.ruff]
82
+ line-length = 100
83
+
84
+ [tool.ruff.lint]
85
+ select = ["E4", "E7", "E9", "F", "B", "D1", "D2", "I"]
86
+ ignore = ["B028", "D213", "D203", "D205", "D107", "D105"]
87
+
88
+ [tool.mypy]
89
+ plugins = ["pydantic.mypy"]
90
+
91
+ [[tool.mypy.overrides]]
92
+ module = "polars.*"
93
+ follow_imports = "skip"
94
+ # ^language server knows what's going on, but mypy can't find attributes on Self? type
95
+
96
+ [tool.black]
97
+ line-length = 100
98
+
99
+ [tool.isort]
100
+ profile = "black"
101
+ line_length = 100
102
+
103
+ [tool.pytest]
104
+ testpaths = ["tests"]
105
+
106
+ [tool.coverage]
107
+ [tool.coverage.run]
108
+ command_line = "-m pytest"
109
+ concurrency = ["multiprocessing"]
110
+ source_pkgs = [
111
+ "dve",
112
+ ]
113
+
114
+ [tool.coverage.report]
115
+ show_missing = true
116
+
117
+ [tool.pylint]
118
+ init-hook = "import sys; sys.path.append('./pylint_checkers')"
119
+ load-plugins = "check_typing_imports"
120
+
121
+ [tool.pylint.main]
122
+ extension-pkg-allow-list = ["pyspark", "lxml", "pydantic"]
123
+ fail-under = 10.0
124
+ ignore = ["CVS"]
125
+ ignore-patterns = ["^\\.#"]
126
+ jobs = 1
127
+ limit-inference-results = 100
128
+ persistent = true
129
+ py-version = "3.10"
130
+ suggestion-mode = true
131
+
132
+ [tool.pylint.basic]
133
+ argument-naming-style = "snake_case"
134
+ attr-naming-style = "snake_case"
135
+ bad-names = ["foo", "bar", "baz", "toto", "tutu", "tata"]
136
+ class-attribute-naming-style = "any"
137
+ class-const-naming-style = "UPPER_CASE"
138
+ class-naming-style = "PascalCase"
139
+ const-naming-style = "UPPER_CASE"
140
+ docstring-min-length = -1
141
+ function-naming-style = "snake_case"
142
+ good-names = ["i", "j", "k", "ex", "Run", "df", "_", "f"]
143
+ inlinevar-naming-style = "any"
144
+ method-naming-style = "snake_case"
145
+ module-naming-style = "snake_case"
146
+ no-docstring-rgx = "^_"
147
+ property-classes = ["abc.abstractproperty"]
148
+ variable-naming-style = "snake_case"
149
+
150
+ [tool.pylint.classes]
151
+ defining-attr-methods = ["__init__", "__new__", "setUp", "__post_init__"]
152
+ exclude-protected = ["_asdict", "_fields", "_replace", "_source", "_make"]
153
+ valid-classmethod-first-arg = ["cls"]
154
+ valid-metaclass-classmethod-first-arg = ["cls"]
155
+
156
+ [tool.pylint.design]
157
+ max-args = 8
158
+ max-attributes = 10
159
+ max-bool-expr = 5
160
+ max-branches = 15
161
+ max-locals = 20
162
+ max-parents = 7
163
+ max-public-methods = 20
164
+ max-returns = 6
165
+ max-statements = 50
166
+ min-public-methods = 2
167
+
168
+ [tool.pylint.exceptions]
169
+ overgeneral-exceptions = ["BaseException", "Exception"]
170
+
171
+ [tool.pylint.format]
172
+ ignore-long-lines = "^\\s*(# )?<?https?://\\S+>?$"
173
+ indent-after-paren = 4
174
+ indent-string = " "
175
+ max-line-length = 100
176
+ max-module-lines = 1000
177
+
178
+ [tool.pylint.imports]
179
+ known-third-party = ["enchant"]
180
+
181
+ [tool.pylint.logging]
182
+ logging-format-style = "old"
183
+ logging-modules = ["logging"]
184
+
185
+ [tool.pylint."messages control"]
186
+ confidence = [
187
+ "HIGH",
188
+ "CONTROL_FLOW",
189
+ "INFERENCE",
190
+ "INFERENCE_FAILURE",
191
+ "UNDEFINED",
192
+ ]
193
+ disable = [
194
+ "raw-checker-failed",
195
+ "bad-inline-option",
196
+ "locally-disabled",
197
+ "file-ignored",
198
+ "suppressed-message",
199
+ "useless-suppression",
200
+ "deprecated-pragma",
201
+ "use-symbolic-message-instead",
202
+ "logging-fstring-interpolation",
203
+ "fixme",
204
+ ]
205
+ enable = ["c-extension-no-member"]
206
+
207
+ [tool.pylint.miscellaneous]
208
+ notes = ["FIXME", "XXX", "TODO"]
209
+
210
+ [tool.pylint.refactoring]
211
+ max-nested-blocks = 5
212
+ never-returning-functions = ["sys.exit", "argparse.parse_error"]
213
+
214
+ [tool.pylint.reports]
215
+ evaluation = "max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10))"
216
+ score = true
217
+
218
+ [tool.pylint.similarities]
219
+ ignore-comments = true
220
+ ignore-docstrings = true
221
+ ignore-imports = true
222
+ ignore-signatures = true
223
+ min-similarity-lines = 10
224
+
225
+ [tool.pylint.spelling]
226
+ max-spelling-suggestions = 4
227
+ spelling-ignore-comment-directives = "fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:"
228
+
229
+ [tool.pylint.typecheck]
230
+ contextmanager-decorators = ["contextlib.contextmanager"]
231
+ ignore-none = true
232
+ ignore-on-opaque-inference = true
233
+ ignored-checks-for-mixins = [
234
+ "no-member",
235
+ "not-async-context-manager",
236
+ "not-context-manager",
237
+ "attribute-defined-outside-init",
238
+ ]
239
+ ignored-classes = [
240
+ "optparse.Values",
241
+ "thread._local",
242
+ "_thread._local",
243
+ "argparse.Namespace",
244
+ ]
245
+ missing-member-hint = true
246
+ missing-member-hint-distance = 1
247
+ missing-member-max-choices = 1
248
+ mixin-class-rgx = ".*[Mm]ixin"
249
+
250
+ [tool.pylint.variables]
251
+ allow-global-unused-variables = true
252
+ callbacks = ["cb_", "_cb"]
253
+ dummy-variables-rgx = "_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_"
254
+ ignored-argument-names = "_.*|^ignored_|^unused_"
255
+ redefining-builtins-modules = [
256
+ "six.moves",
257
+ "past.builtins",
258
+ "future.builtins",
259
+ "builtins",
260
+ "io",
261
+ ]
262
+
263
+ [tool.commitizen]
264
+ name = "cz_conventional_commits"
265
+ tag_format = "v$major.$minor.$patch"
266
+ version_scheme = "pep440"
267
+ version_provider = "poetry"
268
+ update_changelog_on_bump = true
269
+
270
+ [build-system]
271
+ requires = ["poetry-core"]
272
+ build-backend = "poetry.core.masonry.api"
@@ -163,7 +163,7 @@ class BaseBackend(Generic[EntityType], ABC):
163
163
  return entities, get_parent(processing_errors_uri), successful
164
164
 
165
165
  for entity_name, entity in entities.items():
166
- entities[entity_name] = self.step_implementations.add_row_id(entity)
166
+ entities[entity_name] = self.step_implementations.add_record_index(entity)
167
167
 
168
168
  # TODO: Handle entity manager creation errors.
169
169
  entity_manager = EntityManager(entities, reference_data)
@@ -172,9 +172,6 @@ class BaseBackend(Generic[EntityType], ABC):
172
172
  # TODO: and return uri to errors
173
173
  _ = self.step_implementations.apply_rules(working_dir, entity_manager, rule_metadata)
174
174
 
175
- for entity_name, entity in entity_manager.entities.items():
176
- entity_manager.entities[entity_name] = self.step_implementations.drop_row_id(entity)
177
-
178
175
  return entity_manager.entities, get_parent(dc_feedback_errors_uri), True
179
176
 
180
177
  def process(
@@ -337,9 +337,9 @@ class BaseDataContract(Generic[EntityType], ABC):
337
337
  successful = True
338
338
  for entity_name, resource in entity_locations.items():
339
339
  reader_metadata = contract_metadata.reader_metadata[entity_name]
340
- extension = "." + (
341
- get_file_suffix(resource) or ""
342
- ).lower() # Already checked that extension supported.
340
+ extension = (
341
+ "." + (get_file_suffix(resource) or "").lower()
342
+ ) # Already checked that extension supported.
343
343
 
344
344
  reader_config = reader_metadata[extension]
345
345
  reader_type = get_reader(reader_config.reader)
@@ -369,6 +369,14 @@ class BaseDataContract(Generic[EntityType], ABC):
369
369
 
370
370
  return entities, dedup_messages(messages), successful
371
371
 
372
+ def add_record_index(self, entity: EntityType, **kwargs) -> EntityType:
373
+ """Add a record index to the entity"""
374
+ raise NotImplementedError(f"add_record_index not implemented in {self.__class__}")
375
+
376
+ def drop_record_index(self, entity: EntityType, **kwargs) -> EntityType:
377
+ """Drop a record index from the entity"""
378
+ raise NotImplementedError(f"drop_record_index not implemented in {self.__class__}")
379
+
372
380
  @abstractmethod
373
381
  def apply_data_contract(
374
382
  self,
@@ -127,6 +127,14 @@ class BaseFileReader(ABC):
127
127
 
128
128
  return reader_func(self, resource, entity_name, schema)
129
129
 
130
+ def add_record_index(self, entity: EntityType, **kwargs) -> EntityType:
131
+ """Add a record index to the entity"""
132
+ raise NotImplementedError(f"add_record_index not implemented in {self.__class__}")
133
+
134
+ def drop_record_index(self, entity: EntityType, **kwargs) -> EntityType:
135
+ """Drop a record index from the entity"""
136
+ raise NotImplementedError(f"drop_record_index not implemented in {self.__class__}")
137
+
130
138
  def write_parquet(
131
139
  self,
132
140
  entity: EntityType,
@@ -135,15 +135,13 @@ class BaseStepImplementations(Generic[EntityType], ABC): # pylint: disable=too-
135
135
  """Method to register all custom dve functions for use during business rules application"""
136
136
  raise NotImplementedError()
137
137
 
138
- @staticmethod
139
- def add_row_id(entity: EntityType) -> EntityType:
140
- """Add a unique row id field to an entity"""
141
- raise NotImplementedError()
138
+ def add_record_index(self, entity: EntityType, **kwargs) -> EntityType:
139
+ """Add a record index to the entity"""
140
+ raise NotImplementedError(f"add_record_index not implemented in {self.__class__}")
142
141
 
143
- @staticmethod
144
- def drop_row_id(entity: EntityType) -> EntityType:
145
- """Add a unique row id field to an entity"""
146
- raise NotImplementedError()
142
+ def drop_record_index(self, entity: EntityType) -> EntityType:
143
+ """Drop a record index from the entity"""
144
+ raise NotImplementedError(f"drop_record_index not implemented in {self.__class__}")
147
145
 
148
146
  @classmethod
149
147
  def _raise_notimplemented_error(
@@ -29,6 +29,7 @@ from dve.core_engine.backends.base.utilities import (
29
29
  )
30
30
  from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import (
31
31
  duckdb_read_parquet,
32
+ duckdb_record_index,
32
33
  duckdb_write_parquet,
33
34
  get_duckdb_type_from_annotation,
34
35
  relation_is_empty,
@@ -37,6 +38,7 @@ from dve.core_engine.backends.implementations.duckdb.types import DuckDBEntities
37
38
  from dve.core_engine.backends.metadata.contract import DataContractMetadata
38
39
  from dve.core_engine.backends.types import StageSuccessful
39
40
  from dve.core_engine.backends.utilities import get_polars_type_from_annotation, stringify_model
41
+ from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME
40
42
  from dve.core_engine.message import FeedbackMessage
41
43
  from dve.core_engine.type_hints import URI, EntityLocations
42
44
  from dve.core_engine.validation import RowValidator, apply_row_validator_helper
@@ -54,6 +56,7 @@ class PandasApplyHelper:
54
56
  return row # no op
55
57
 
56
58
 
59
+ @duckdb_record_index
57
60
  @duckdb_write_parquet
58
61
  @duckdb_read_parquet
59
62
  class DuckDBDataContract(BaseDataContract[DuckDBPyRelation]):
@@ -144,10 +147,12 @@ class DuckDBDataContract(BaseDataContract[DuckDBPyRelation]):
144
147
  fld.name: get_duckdb_type_from_annotation(fld.annotation)
145
148
  for fld in entity_fields.values()
146
149
  }
150
+ ddb_schema[RECORD_INDEX_COLUMN_NAME] = get_duckdb_type_from_annotation(int)
147
151
  polars_schema: dict[str, PolarsType] = {
148
152
  fld.name: get_polars_type_from_annotation(fld.annotation)
149
153
  for fld in entity_fields.values()
150
154
  }
155
+ polars_schema[RECORD_INDEX_COLUMN_NAME] = get_polars_type_from_annotation(int)
151
156
  if relation_is_empty(relation):
152
157
  self.logger.warning(f"+ Empty relation for {entity_name}")
153
158
  empty_df = pl.DataFrame([], schema=polars_schema) # type: ignore # pylint: disable=W0612
@@ -170,6 +175,9 @@ class DuckDBDataContract(BaseDataContract[DuckDBPyRelation]):
170
175
 
171
176
  self.logger.info(f"Data contract found {msg_count} issues in {entity_name}")
172
177
 
178
+ if RECORD_INDEX_COLUMN_NAME not in relation.columns:
179
+ relation = self.add_record_index(relation)
180
+
173
181
  casting_statements = [
174
182
  (
175
183
  self.generate_ddb_cast_statement(column, dtype)
@@ -12,13 +12,14 @@ from urllib.parse import urlparse
12
12
 
13
13
  import duckdb.typing as ddbtyp
14
14
  import numpy as np
15
- from duckdb import DuckDBPyConnection, DuckDBPyRelation
15
+ from duckdb import DuckDBPyConnection, DuckDBPyRelation, StarExpression
16
16
  from duckdb.typing import DuckDBPyType
17
17
  from pandas import DataFrame
18
18
  from pydantic import BaseModel
19
19
  from typing_extensions import Annotated, get_args, get_origin, get_type_hints
20
20
 
21
21
  from dve.core_engine.backends.base.utilities import _get_non_heterogenous_type
22
+ from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME
22
23
  from dve.core_engine.type_hints import URI
23
24
  from dve.parser.file_handling.service import LocalFilesystemImplementation, _get_implementation
24
25
 
@@ -286,3 +287,29 @@ def duckdb_rel_to_dictionaries(
286
287
  cols: tuple[str] = tuple(entity.columns) # type: ignore
287
288
  while rows := entity.fetchmany(batch_size):
288
289
  yield from (dict(zip(cols, rw)) for rw in rows)
290
+
291
+
292
+ def _add_duckdb_record_index(
293
+ self, entity: DuckDBPyRelation # pylint: disable=W0613
294
+ ) -> DuckDBPyRelation:
295
+ """Add record index to duckdb relation"""
296
+ if RECORD_INDEX_COLUMN_NAME in entity.columns:
297
+ return entity
298
+
299
+ return entity.select(f"*, row_number() OVER () as {RECORD_INDEX_COLUMN_NAME}")
300
+
301
+
302
+ def _drop_duckdb_record_index(
303
+ self, entity: DuckDBPyRelation # pylint: disable=W0613
304
+ ) -> DuckDBPyRelation:
305
+ """Drop record index from duckdb relation"""
306
+ if RECORD_INDEX_COLUMN_NAME not in entity.columns:
307
+ return entity
308
+ return entity.select(StarExpression(exclude=[RECORD_INDEX_COLUMN_NAME]))
309
+
310
+
311
+ def duckdb_record_index(cls):
312
+ """Class decorator to add record index methods for duckdb implementations"""
313
+ setattr(cls, "add_record_index", _add_duckdb_record_index)
314
+ setattr(cls, "drop_record_index", _drop_duckdb_record_index)
315
+ return cls