data-validation-engine 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. data_validation_engine-0.6.2.dist-info/METADATA +104 -0
  2. data_validation_engine-0.6.2.dist-info/RECORD +105 -0
  3. data_validation_engine-0.6.2.dist-info/WHEEL +4 -0
  4. data_validation_engine-0.6.2.dist-info/licenses/LICENSE +21 -0
  5. dve/__init__.py +0 -0
  6. dve/common/__init__.py +0 -0
  7. dve/common/error_utils.py +189 -0
  8. dve/core_engine/__init__.py +0 -0
  9. dve/core_engine/backends/__init__.py +1 -0
  10. dve/core_engine/backends/base/__init__.py +1 -0
  11. dve/core_engine/backends/base/auditing.py +618 -0
  12. dve/core_engine/backends/base/backend.py +240 -0
  13. dve/core_engine/backends/base/contract.py +454 -0
  14. dve/core_engine/backends/base/core.py +124 -0
  15. dve/core_engine/backends/base/reader.py +176 -0
  16. dve/core_engine/backends/base/reference_data.py +217 -0
  17. dve/core_engine/backends/base/rules.py +685 -0
  18. dve/core_engine/backends/base/utilities.py +146 -0
  19. dve/core_engine/backends/exceptions.py +311 -0
  20. dve/core_engine/backends/implementations/__init__.py +1 -0
  21. dve/core_engine/backends/implementations/duckdb/__init__.py +26 -0
  22. dve/core_engine/backends/implementations/duckdb/auditing.py +234 -0
  23. dve/core_engine/backends/implementations/duckdb/contract.py +213 -0
  24. dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +288 -0
  25. dve/core_engine/backends/implementations/duckdb/readers/__init__.py +13 -0
  26. dve/core_engine/backends/implementations/duckdb/readers/csv.py +222 -0
  27. dve/core_engine/backends/implementations/duckdb/readers/json.py +50 -0
  28. dve/core_engine/backends/implementations/duckdb/readers/xml.py +45 -0
  29. dve/core_engine/backends/implementations/duckdb/reference_data.py +49 -0
  30. dve/core_engine/backends/implementations/duckdb/rules.py +534 -0
  31. dve/core_engine/backends/implementations/duckdb/types.py +47 -0
  32. dve/core_engine/backends/implementations/duckdb/utilities.py +41 -0
  33. dve/core_engine/backends/implementations/spark/__init__.py +22 -0
  34. dve/core_engine/backends/implementations/spark/auditing.py +230 -0
  35. dve/core_engine/backends/implementations/spark/backend.py +78 -0
  36. dve/core_engine/backends/implementations/spark/contract.py +241 -0
  37. dve/core_engine/backends/implementations/spark/readers/__init__.py +15 -0
  38. dve/core_engine/backends/implementations/spark/readers/csv.py +77 -0
  39. dve/core_engine/backends/implementations/spark/readers/json.py +66 -0
  40. dve/core_engine/backends/implementations/spark/readers/xml.py +202 -0
  41. dve/core_engine/backends/implementations/spark/reference_data.py +42 -0
  42. dve/core_engine/backends/implementations/spark/rules.py +430 -0
  43. dve/core_engine/backends/implementations/spark/spark_helpers.py +412 -0
  44. dve/core_engine/backends/implementations/spark/types.py +21 -0
  45. dve/core_engine/backends/implementations/spark/utilities.py +144 -0
  46. dve/core_engine/backends/metadata/__init__.py +47 -0
  47. dve/core_engine/backends/metadata/contract.py +80 -0
  48. dve/core_engine/backends/metadata/reporting.py +374 -0
  49. dve/core_engine/backends/metadata/rules.py +737 -0
  50. dve/core_engine/backends/readers/__init__.py +41 -0
  51. dve/core_engine/backends/readers/csv.py +232 -0
  52. dve/core_engine/backends/readers/utilities.py +21 -0
  53. dve/core_engine/backends/readers/xml.py +432 -0
  54. dve/core_engine/backends/readers/xml_linting.py +142 -0
  55. dve/core_engine/backends/types.py +26 -0
  56. dve/core_engine/backends/utilities.py +177 -0
  57. dve/core_engine/configuration/__init__.py +1 -0
  58. dve/core_engine/configuration/base.py +56 -0
  59. dve/core_engine/configuration/v1/__init__.py +351 -0
  60. dve/core_engine/configuration/v1/filters.py +60 -0
  61. dve/core_engine/configuration/v1/rule_stores/__init__.py +1 -0
  62. dve/core_engine/configuration/v1/rule_stores/models.py +57 -0
  63. dve/core_engine/configuration/v1/steps.py +365 -0
  64. dve/core_engine/constants.py +8 -0
  65. dve/core_engine/engine.py +265 -0
  66. dve/core_engine/exceptions.py +29 -0
  67. dve/core_engine/functions/__init__.py +6 -0
  68. dve/core_engine/functions/implementations.py +200 -0
  69. dve/core_engine/loggers.py +57 -0
  70. dve/core_engine/message.py +512 -0
  71. dve/core_engine/models.py +196 -0
  72. dve/core_engine/templating.py +114 -0
  73. dve/core_engine/type_hints.py +255 -0
  74. dve/core_engine/validation.py +160 -0
  75. dve/metadata_parser/__init__.py +2 -0
  76. dve/metadata_parser/domain_types.py +682 -0
  77. dve/metadata_parser/exc.py +44 -0
  78. dve/metadata_parser/function_library.py +64 -0
  79. dve/metadata_parser/function_wrapper.py +201 -0
  80. dve/metadata_parser/model_generator.py +119 -0
  81. dve/metadata_parser/models.py +410 -0
  82. dve/metadata_parser/utilities.py +54 -0
  83. dve/parser/__init__.py +1 -0
  84. dve/parser/exceptions.py +50 -0
  85. dve/parser/file_handling/__init__.py +31 -0
  86. dve/parser/file_handling/helpers.py +29 -0
  87. dve/parser/file_handling/implementations/__init__.py +7 -0
  88. dve/parser/file_handling/implementations/base.py +97 -0
  89. dve/parser/file_handling/implementations/dbfs.py +81 -0
  90. dve/parser/file_handling/implementations/file.py +203 -0
  91. dve/parser/file_handling/implementations/s3.py +371 -0
  92. dve/parser/file_handling/log_handler.py +215 -0
  93. dve/parser/file_handling/service.py +441 -0
  94. dve/parser/file_handling/utilities.py +53 -0
  95. dve/parser/type_hints.py +46 -0
  96. dve/parser/utilities.py +113 -0
  97. dve/pipeline/__init__.py +0 -0
  98. dve/pipeline/duckdb_pipeline.py +56 -0
  99. dve/pipeline/foundry_ddb_pipeline.py +171 -0
  100. dve/pipeline/pipeline.py +935 -0
  101. dve/pipeline/spark_pipeline.py +69 -0
  102. dve/pipeline/utils.py +96 -0
  103. dve/reporting/__init__.py +1 -0
  104. dve/reporting/error_report.py +153 -0
  105. dve/reporting/excel_report.py +319 -0
@@ -0,0 +1,104 @@
1
+ Metadata-Version: 2.4
2
+ Name: data-validation-engine
3
+ Version: 0.6.2
4
+ Summary: `nhs data validation engine` is a framework used to validate data
5
+ License-File: LICENSE
6
+ Author: NHS England
7
+ Author-email: england.contactus@nhs.net
8
+ Requires-Python: >=3.10,<3.12
9
+ Classifier: Operating System :: OS Independent
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Topic :: Software Development :: Libraries
14
+ Classifier: Typing :: Typed
15
+ Requires-Dist: Jinja2 (==3.1.*)
16
+ Requires-Dist: boto3 (>=1.34.162,<1.36)
17
+ Requires-Dist: botocore (>=1.34.162,<1.36)
18
+ Requires-Dist: delta-spark (==2.4.*)
19
+ Requires-Dist: duckdb (==1.1.*)
20
+ Requires-Dist: lxml (>=4.9.1,<5.0.0)
21
+ Requires-Dist: openpyxl (>=3.1,<4.0)
22
+ Requires-Dist: pandas (>=2.2.2,<3.0.0)
23
+ Requires-Dist: polars (==0.20.*)
24
+ Requires-Dist: pyarrow (>=17.0.0,<18.0.0)
25
+ Requires-Dist: pydantic (==1.10.15)
26
+ Requires-Dist: pyspark (==3.4.*)
27
+ Requires-Dist: typing_extensions (>=4.6.2,<5.0.0)
28
+ Description-Content-Type: text/markdown
29
+
30
+ <h1 style="display: flex; align-items: center; gap: 10px;">
31
+ <img src="overrides/.icons/nhseng.svg" alt="NHS Logo" width="5%" height="100%" align="left">
32
+ Data Validation Engine
33
+ </h1>
34
+
35
+ ![License](https://img.shields.io/github/license/NHSDigital/data-validation-engine)
36
+ ![Version](https://img.shields.io/github/v/release/NHSDigital/data-validation-engine)
37
+ [![CI Unit Tests](https://github.com/NHSDigital/data-validation-engine/actions/workflows/ci_testing.yml/badge.svg)](https://github.com/NHSDigital/data-validation-engine/actions/workflows/ci_testing.yml)
38
+ [![CI Formatting & Linting](https://github.com/NHSDigital/data-validation-engine/actions/workflows/ci_linting.yml/badge.svg)](https://github.com/NHSDigital/data-validation-engine/actions/workflows/ci_linting.yml)
39
+
40
+ The Data Validation Engine (DVE) is a configuration driven data validation library built and utilised by NHS England. Currently the package has been reverted from v1.0.0 release to a 0.x as we feel the package is not yet mature enough to be considered a 1.0.0 release. So please bear this in mind if reading through the commits and references to a v1+ release when on v0.x.
41
+
42
+ As mentioned above, the DVE is "configuration driven" which means the majority of development for you as a user will be building a JSON document to describe how the data will be validated. The JSON document is known as a `dischema` file and example files can be accessed [here](./tests/testdata/). If you'd like to learn more about the JSON document and how to build one from scratch, then please read the documentation [here](./docs/).
43
+
44
+ Once a dischema file has been defined, you are ready to use the DVE. The DVE is typically orchestrated based on four key "services". These are...
45
+
46
+ | | Service | Purpose |
47
+ | -- | ------- | ------- |
48
+ | 1. | File Transformation | This service will take submitted files and turn them into stringified parquet file(s) to ensure that a consistent data structure can be passed through the other services. |
49
+ | 2. | Data Contract | This service will validate and perform type casting against a stringified parquet file using [pydantic models](https://docs.pydantic.dev/1.10/). |
50
+ | 3. | Business Rules | The business rules service will perform more complex validations such as comparisons between fields and tables, aggregations, filters etc to generate new entities. |
51
+ | 4. | Error Reports | The error reports service will take all the errors raised in previous services and surface them in a readable format for downstream users/services. Currently, this is implemented as an Excel spreadsheet but could be reconfigured to meet other requirements/use cases. |
52
+
53
+ If you'd like more detailed documentation around these services then please read the extended documentation [here](./docs/).
54
+
55
+ The DVE has been designed in a way that's modular and can support users who just want to utilise specific "services" from the DVE (i.e. just the file transformation + data contract). Additionally, the DVE is designed to support different backend implementations. As part of the base installation of DVE, you will find backend support for `Spark` and `DuckDB`. So, if you need a `MySQL` backend implementation, you can implement this yourself. Given our organisation's requirements, it is unlikely that we will add any more specific backend implementations into the base package beyond Spark and DuckDB. So, if you are unable to implement this yourself, I would recommend reading the guidance on [requesting new features and raising bug reports here](#requesting-new-features-and-raising-bug-reports).
56
+
57
+ Additionally, if you'd like to contribute a new backend implementation into the base DVE package, then please look at the [Contributing](#contributing) section.
58
+
59
+ ## Installation and usage
60
+
61
+ The DVE is a Python package and can be installed using `pip`. As of release v0.6.1 we currently support Python 3.10 & 3.11, with Spark version 3.4 and DuckDB version 1.1. In the future we will be looking to upgrade the DVE to work on higher versions of Python, DuckDB and Spark.
62
+
63
+ If you're planning to use the Spark backend implementation, you will also need OpenJDK 11 installed.
64
+
65
+ Python dependencies are listed in `pyproject.toml`.
66
+
67
+ To install the DVE package you can simply install using a package manager such as [pip](https://pypi.org/project/pip/).
68
+
69
+ ```
70
+ pip install git+https://github.com/NHSDigital/data-validation-engine.git@v0.6.1
71
+ ```
72
+
73
+ Once you have installed the DVE you are ready to use it. For guidance on how to create your dischema JSON document (configuration), please read the [documentation](./docs/).
74
+
75
+ Please note - The long term aim is to make the DVE available via PyPi and Conda but we are not quite there yet. Once available this documentation will be updated to contain the new installation options.
76
+
77
+ Version 0.0.1 does support a working Python 3.7 installation. However, we will not be supporting any issues with that version of the DVE if you choose to use it. __Use at your own risk__.
78
+
79
+ ## Requesting new features and raising bug reports
80
+ **Before creating new issues, please check to see if the same bug/feature has been created already. Where a duplicate is created, the ticket will be closed and referenced to an existing issue.**
81
+
82
+ If you have spotted a bug with the DVE then please raise an issue [here](https://github.com/nhsengland/Data-Validation-Engine/issues) using the "bug template".
83
+
84
+ If you have a feature request then please follow the same process whilst using the "Feature request template".
85
+
86
+ ## Upcoming features
87
+ Below is a list of features that we would like to implement or have been requested.
88
+ | Feature | Release Version | Released? |
89
+ | ------- | --------------- | --------- |
90
+ | Open source release | 0.1.0 | Yes |
91
+ | Uplift to Python 3.11 | 0.2.0 | Yes |
92
+ | Upgrade to Pydantic 2.0 | Not yet confirmed | No |
93
+ | Create a more user friendly interface for building and modifying dischema files | Not yet confirmed | No |
94
+
95
+ Beyond the Python upgrade, we cannot confirm the other features will be made available anytime soon. Therefore, if you have the interest and desire to make these features available, then please read the [Contributing](#contributing) section and get involved.
96
+
97
+ ## Contributing
98
+ Please see guidance [here](./CONTRIBUTE.md).
99
+
100
+ ## Legal
101
+ This codebase is released under the MIT License. This covers both the codebase and any sample code in the documentation.
102
+
103
+ Any HTML or Markdown documentation is [© Crown copyright](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/) and available under the terms of the [Open Government 3.0 licence](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/).
104
+
@@ -0,0 +1,105 @@
1
+ dve/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ dve/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ dve/common/error_utils.py,sha256=uy4kl3U9ZxturTXDkZ0gegSQIf36AgwQsgY4oe14fP8,6545
4
+ dve/core_engine/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ dve/core_engine/backends/__init__.py,sha256=Mb0VB9fDUw2wkncbHUwAegwyAEh1XI30SA2sJkz4oHk,51
6
+ dve/core_engine/backends/base/__init__.py,sha256=a7reDmK2Jwn57iW1tcUc6VWsCQWt2Q8QffNXVas9bc0,39
7
+ dve/core_engine/backends/base/auditing.py,sha256=k9bqgnYLH2_ReuRsBGomidaOBs3iBuC7Cp4lAx_SmPQ,22913
8
+ dve/core_engine/backends/base/backend.py,sha256=gZiyFCrOCy-GWmMyeMqHnwXJTO4su55wsQleUWmA9nM,9992
9
+ dve/core_engine/backends/base/contract.py,sha256=1lpBb335ABOSPrtUFPXo66r8_uMTzd-L2IZKjKvupqs,17842
10
+ dve/core_engine/backends/base/core.py,sha256=YKLkalATdJ_z1c5y2qmJjU-193Km09tjQDlv_3YDO00,4766
11
+ dve/core_engine/backends/base/reader.py,sha256=Iv3D8J_nXvGOquf0pJhiiOCcB1zUyvEcLL1tN4Y3m9A,6529
12
+ dve/core_engine/backends/base/reference_data.py,sha256=M3HnzbC0r3V6qVNUz94-5Rod3I6WmO-xlMsp0Q5heJQ,8102
13
+ dve/core_engine/backends/base/rules.py,sha256=3V3mxHAEj8A-LlS6_dpaRqgzPfBSubPw2oA5iyPV_5g,28734
14
+ dve/core_engine/backends/base/utilities.py,sha256=nY0z-UOFqyrAO7jwu0j2nOZyQ5kuMVGQtNCi2JjpvnI,5148
15
+ dve/core_engine/backends/exceptions.py,sha256=1YEzFaGMTkJTUwmoyzownByrgPCvBSQDG3DtYLswRJA,11143
16
+ dve/core_engine/backends/implementations/__init__.py,sha256=7dB6WqLni3JHk3zcQdlECZkzpFLfrzhhoiqiWn_ruAg,62
17
+ dve/core_engine/backends/implementations/duckdb/__init__.py,sha256=IjGIYTGOSSKqbz5C3yE2fvcaq7l5fjYu41fW8hIdp_M,761
18
+ dve/core_engine/backends/implementations/duckdb/auditing.py,sha256=-d9VaqDF9hcopRoD_UXzxkjQoNuPa9s3H4vt1Oa8-4M,8684
19
+ dve/core_engine/backends/implementations/duckdb/contract.py,sha256=UeJ4xdNwE1EmomzfRtF5oQvPsPkMoQ1app_4nW64_pY,9063
20
+ dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py,sha256=1H7MyiRI_RIYSfL5I0gJ_37ugx3Va5CYyS1xn84oKdc,11013
21
+ dve/core_engine/backends/implementations/duckdb/readers/__init__.py,sha256=qyyFQX64LQGlpR25Ofud1YvADP3VWRVz8oS_P40jjTQ,367
22
+ dve/core_engine/backends/implementations/duckdb/readers/csv.py,sha256=0yPO-Xb-c5gMHz6pZqoIXyCycyTh-eOoCYAuo-pkuzk,9451
23
+ dve/core_engine/backends/implementations/duckdb/readers/json.py,sha256=lC7xEprsoKOGxP4IgGR9jQqeOUfu5URP7lj_uGzrwCg,1719
24
+ dve/core_engine/backends/implementations/duckdb/readers/xml.py,sha256=HXpY59oNLgJEC-hKudqbvL-CjzbnBRgP9td9eHh_hMM,1869
25
+ dve/core_engine/backends/implementations/duckdb/reference_data.py,sha256=r6pXB4Q0ti2NRLCxbiVEUu53FnSfwR7PAVX5aiYfF60,1774
26
+ dve/core_engine/backends/implementations/duckdb/rules.py,sha256=s9W_bxzRekUIXyQEmZaqOB3Q_g4IEf-ksNj7b41-Znk,23168
27
+ dve/core_engine/backends/implementations/duckdb/types.py,sha256=EP-CioAT5fEPCKOjIFA8mHbWZlD27RHtz_NkDS_3Kqo,1061
28
+ dve/core_engine/backends/implementations/duckdb/utilities.py,sha256=0ZybsiYUu0tOqvzUHxebGZJ-GXf0pl48haLSEgMt85M,1365
29
+ dve/core_engine/backends/implementations/spark/__init__.py,sha256=o5GWNbAXQ3PWryLaBpRBrhyBf9qkXypayEQAzFBp53g,624
30
+ dve/core_engine/backends/implementations/spark/auditing.py,sha256=9lc7CvPi146eqGBZ7wlcOIQLgJbSBeukW2M_FHA3eDQ,8753
31
+ dve/core_engine/backends/implementations/spark/backend.py,sha256=V5qL6T0FDl6n-VMQIJtY9qwg-kzxqPYTbzMOfw_H1Hc,3471
32
+ dve/core_engine/backends/implementations/spark/contract.py,sha256=TB66HKHdZaL-vxiarf7N56XUMEoYeBdjhsdzYiW3gNg,9565
33
+ dve/core_engine/backends/implementations/spark/readers/__init__.py,sha256=bQYNIPd1OSlvCp3bW3bS_ren3qFxiafM4JBRrmwk_Lg,436
34
+ dve/core_engine/backends/implementations/spark/readers/csv.py,sha256=rA5dhqLy6J3zs0Y7p0mw2YefGJsgz7e_zk8UKdfHqNs,2585
35
+ dve/core_engine/backends/implementations/spark/readers/json.py,sha256=zhVAYRovLJLEMIWoMQeH0pMFjW0Uu1-dPT8iN4XKJb4,2233
36
+ dve/core_engine/backends/implementations/spark/readers/xml.py,sha256=WnSY6Qet6Rn4HzdAJK3HMinEDYK943V3WikbkVU3Z2E,7808
37
+ dve/core_engine/backends/implementations/spark/reference_data.py,sha256=rD6PHckGGNw9-3FIQKueep-ID82am4Q28zLEoOfjiv8,1392
38
+ dve/core_engine/backends/implementations/spark/rules.py,sha256=NjY3CjDwb-FjBbEpY0w0ecLDJlHojz_D9bhjeyN3MFc,18336
39
+ dve/core_engine/backends/implementations/spark/spark_helpers.py,sha256=OoddXPTGEb0UHLkT4V_tDxMAIad_gQ3aBbxPyZfl5DQ,15216
40
+ dve/core_engine/backends/implementations/spark/types.py,sha256=IkJd6-wjFiaUdcOyQUM7XFS9olWJ3YYqLlY9yzy6u58,623
41
+ dve/core_engine/backends/implementations/spark/utilities.py,sha256=DRdDaQvIB5lgzCJLvxrOuLU7bkmqXk4gQGOMyf8FgVQ,4505
42
+ dve/core_engine/backends/metadata/__init__.py,sha256=LrJQinyvt3QK8o-NpMmY7nV0E6ncaUQNd9ibqyFj1Y4,1171
43
+ dve/core_engine/backends/metadata/contract.py,sha256=Pk53xxrMXNbTSHrGRDvJqpwFOYnz5MYtPEnKHWNQT7g,2859
44
+ dve/core_engine/backends/metadata/reporting.py,sha256=N3O0Ve9I1NKnn40wwl1dS58C5OjanqLo2V9Fc8WLtyU,14143
45
+ dve/core_engine/backends/metadata/rules.py,sha256=b1N5m2kNiUoXNTxb6GwU-M4jIaudhMPOKSh2l0K9W3g,25463
46
+ dve/core_engine/backends/readers/__init__.py,sha256=Gpq3xAiXOk8KRHmOt5pgu6P-v08LflaJiPW51h8wrEo,1366
47
+ dve/core_engine/backends/readers/csv.py,sha256=wKZbJxXbwuc_Xdj1PDDhX_qxekZiXsjFj4B7iahLN3o,8901
48
+ dve/core_engine/backends/readers/utilities.py,sha256=yUfSMeRfaBcYQph1RF0GqNrCF_3C6Ffy8w4NHIeUfpk,677
49
+ dve/core_engine/backends/readers/xml.py,sha256=d5-YQDKaFPTIh3_h_K1ogJTS0stmmm7cwaeUAR5XeDA,17160
50
+ dve/core_engine/backends/readers/xml_linting.py,sha256=a5c4FFjFn3rowjCg3V793zLOaOgmp1W3qPUnIknQhug,5422
51
+ dve/core_engine/backends/types.py,sha256=jn1knqFrxMMzchjpkDyDI3w2tR39N6XFk4oJS-LeQeQ,680
52
+ dve/core_engine/backends/utilities.py,sha256=mJspa4p2fJXcR8WPe24AsC_DWG9ZOqTe6Wwf1LChsQQ,7515
53
+ dve/core_engine/configuration/__init__.py,sha256=CLlpunGvqJcavVYRU-4KL7whI4_rbFlWfIbKNLmf7MQ,35
54
+ dve/core_engine/configuration/base.py,sha256=YUqag6m-xl8BVVRKXjwz0phznnPt5lvLGZX6Fn7ZnCE,1881
55
+ dve/core_engine/configuration/v1/__init__.py,sha256=qMFNKX_uTfLh5SFKcyyeZA0OS0Jz7jOdfF3XXYqxP2E,14088
56
+ dve/core_engine/configuration/v1/filters.py,sha256=ULPvGHjBvIqlUivbW1AoNRjINFE8WjlwXroIqcQVp-0,2076
57
+ dve/core_engine/configuration/v1/rule_stores/__init__.py,sha256=z6j1fyYvRr55uX5AyL4Hdl1FRfFmHM9PDHyECyR7bMo,45
58
+ dve/core_engine/configuration/v1/rule_stores/models.py,sha256=WDszhc9b2BGOfkusZ9XLF9ICkwCNeLhPZPmO4xQ5IRM,1855
59
+ dve/core_engine/configuration/v1/steps.py,sha256=grvlmdfZWtOpoki2ocySxkq4iCfbFem6nVgI82VLDag,10273
60
+ dve/core_engine/constants.py,sha256=_rOMpoo2eeL95DgaCijUKqK-BTazsGpLRt4MCZyaR98,323
61
+ dve/core_engine/engine.py,sha256=TiXyxqkB87dA88xye_cPKwNzBnueEc2V2euil2Qsax0,10428
62
+ dve/core_engine/exceptions.py,sha256=P9ISgn4TULQa_lAjmZjneCPkHh8L_FNduHOCR4BPR7Q,954
63
+ dve/core_engine/functions/__init__.py,sha256=CORD5jh6SQ-VOLQg8KveY7x0CDQ6ExXcN_dKqqk6kjw,168
64
+ dve/core_engine/functions/implementations.py,sha256=tafTfCvbrQAiiEFoUSvuPFosaq8rXU_7GAQV03l714U,4893
65
+ dve/core_engine/loggers.py,sha256=Kfn66ZRrnSOGCHtIgtFG3n4NzEjx_SZCa3eUkZzh1rc,1743
66
+ dve/core_engine/message.py,sha256=YKhKsn7-9-nUw5el9lIDxDGk_HY7BfRQsp8HQ4AX__Y,17677
67
+ dve/core_engine/models.py,sha256=xS0Py7BcoNei0aReiPEprZdOEyhhO7maiBILPld6Gc4,7573
68
+ dve/core_engine/templating.py,sha256=WZJxxD9FgtbkFHXVvqgEI1mNT092v_Ql_HHWHGlaTZM,3854
69
+ dve/core_engine/type_hints.py,sha256=2ciOVkYS10n-BfjziOf0Ulrp7v53EcSgufhh1Fk1j2Y,8422
70
+ dve/core_engine/validation.py,sha256=T0OSoc8lY-ibqxQ7ubBUs0Pb3eMsy0Rn_9CRA_RX_vY,6127
71
+ dve/metadata_parser/__init__.py,sha256=aW5V8AaufpFiB_UVrazDBDb3DI4ac8_LJz82EnV2hd4,107
72
+ dve/metadata_parser/domain_types.py,sha256=xZ-aWVEdv3a8ZUWY2IR82SeQufLvqoWaPtMbC0-l-Rk,24194
73
+ dve/metadata_parser/exc.py,sha256=ngP2JVRPEfuZ8Pk7q4LPuEd6AbtY9LNzIV8VponWitA,1108
74
+ dve/metadata_parser/function_library.py,sha256=h7WQjH3fV17IoPwvDnvCtRCH1_xIcnXThqgNdrb91yk,1640
75
+ dve/metadata_parser/function_wrapper.py,sha256=G4kY-AMJzrcS6yvI2PmPngUWCFEuj1aGeSI4CaBbtY4,6666
76
+ dve/metadata_parser/model_generator.py,sha256=wErxpnpTPgFRwZIAve8-MDU3hqRS5XdoKd-uzjqhujg,4142
77
+ dve/metadata_parser/models.py,sha256=ylqct8_PKHztd7rcueXrfjooGYRd_qScNytsBD-WAkc,15930
78
+ dve/metadata_parser/utilities.py,sha256=BtpbuVfgty7dcGFZggKIqj0XCBQV6a9bK0CA0BeB4-E,1588
79
+ dve/parser/__init__.py,sha256=8uVxu0C5drBMQI3ssFgq4-KpdUCr2bK5Wph7XFs1JgU,45
80
+ dve/parser/exceptions.py,sha256=zCe634MpBlBq-bTX8F2s98tnnoXbsAuFc_eR7B_zj6o,1319
81
+ dve/parser/file_handling/__init__.py,sha256=57VgwNocd3r5CVBlvNFJO5yKdH_Nv3ZjX-vz0kdQSc8,788
82
+ dve/parser/file_handling/helpers.py,sha256=_vTDA5jQvfRs66FNcZncBc-lDUhTVPYeB470V_sPnNc,868
83
+ dve/parser/file_handling/implementations/__init__.py,sha256=xI43lMBceGVjK7VhEosblO5iooquT3zoztUXT0C9CRI,261
84
+ dve/parser/file_handling/implementations/base.py,sha256=IvXnqKWb6te2ERY-_-W-QOS3BA-ZfS34SitEPDQQN2o,3525
85
+ dve/parser/file_handling/implementations/dbfs.py,sha256=lF-C1zjw_irAiXHKc5RFRwayAZwApijBLM-wS-qgBog,2967
86
+ dve/parser/file_handling/implementations/file.py,sha256=Up32oANER-nNmFhIiF22f9PZ2NlaFbCHWU0LiFXAe4w,8150
87
+ dve/parser/file_handling/implementations/s3.py,sha256=AAV02n2uBddIqvk8_C7b1gcWC6L1XSlqlH7dNwOj8cs,15084
88
+ dve/parser/file_handling/log_handler.py,sha256=wZcs0OFxWarYyB9aPPSLtZ4RiLf2AN9yoRxm69ya9u4,7492
89
+ dve/parser/file_handling/service.py,sha256=AdIniT1TYaPzM5mYW-_lxrfK7SGvf1KFcY34V9_7i8c,15942
90
+ dve/parser/file_handling/utilities.py,sha256=5BrQ8-ip9nZvAym1fSNvDZT5rcJ7l8dZcHrG_ny2XxI,1775
91
+ dve/parser/type_hints.py,sha256=S82emxVeOrZ4K9ei4A2bEHu2OmjjVCNQ5_HCZR80P4Y,1477
92
+ dve/parser/utilities.py,sha256=QoffR54nCdu_S5Na_Feq5tpFWAIt1xogPzlr4B8Bopo,3809
93
+ dve/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
94
+ dve/pipeline/duckdb_pipeline.py,sha256=bl33gWglqqP_YaZV7gOQcfta6aJRPZSNj4FIWBPfILM,2073
95
+ dve/pipeline/foundry_ddb_pipeline.py,sha256=2JWsEnWZbBrqsObvcrbGnCiV1pZblTT6WRV7GJnu-fs,8395
96
+ dve/pipeline/pipeline.py,sha256=OCZHkkBrg0uoKdx_mqYgiXcA0wt5938rN2Ab-89Y8vk,40637
97
+ dve/pipeline/spark_pipeline.py,sha256=jgDW8FcLuihQocnOVIzSWKM63EIlUL-C8ez3QTcbjVw,2573
98
+ dve/pipeline/utils.py,sha256=KvRr43OEku9Kl1oJIc94JEsDgly7DoK_i68HLZ5LDuk,3410
99
+ dve/reporting/__init__.py,sha256=mWauBREIYl8OBUsmrxIpLqEuDTBWyRSimB8p7_GYZE0,28
100
+ dve/reporting/error_report.py,sha256=kxTcSNbmgDCBCVUGVcrdn10xjZq77zN7cu7uf1i0kDw,5365
101
+ dve/reporting/excel_report.py,sha256=Oipi7gmd0ijmX0VTsfdEfzfpus_a1NpIlNQ87kKRoeU,12062
102
+ data_validation_engine-0.6.2.dist-info/METADATA,sha256=DSBlLMjhDV6o6606T2T24Ga8FVj9ngHPUtfptoMdYZY,7798
103
+ data_validation_engine-0.6.2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
104
+ data_validation_engine-0.6.2.dist-info/licenses/LICENSE,sha256=weoppAS71QMtLuFT0rITA0GAQPos-b7-_PGlL0Bsqvc,1085
105
+ data_validation_engine-0.6.2.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 2.2.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Crown Copyright NHS England.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
dve/__init__.py ADDED
File without changes
dve/common/__init__.py ADDED
File without changes
@@ -0,0 +1,189 @@
1
+ """Utilities to support reporting"""
2
+
3
+ import datetime as dt
4
+ import json
5
+ import logging
6
+ from collections.abc import Iterable
7
+ from itertools import chain
8
+ from multiprocessing import Queue
9
+ from threading import Thread
10
+ from typing import Optional, Union
11
+
12
+ import dve.parser.file_handling as fh
13
+ from dve.core_engine.exceptions import CriticalProcessingError
14
+ from dve.core_engine.loggers import get_logger
15
+ from dve.core_engine.message import UserMessage
16
+ from dve.core_engine.type_hints import URI, DVEStageName, Messages
17
+
18
+
19
def get_feedback_errors_uri(working_folder: URI, step_name: DVEStageName) -> URI:
    """Build the URI of the json lines file holding all errors generated in a step.

    The result is `working_folder` joined (via `fh.joinuri`) with the `errors`
    segment and a file named `<step_name>_errors.jsonl`.
    """
    errors_file_name = f"{step_name}_errors.jsonl"
    return fh.joinuri(working_folder, "errors", errors_file_name)
22
+
23
+
24
def get_processing_errors_uri(working_folder: URI) -> URI:
    """Build the URI of the json lines file holding all processing errors
    generated from a DVE run.

    The result is `working_folder` joined (via `fh.joinuri`) with the
    `processing_errors` segment and the file name `processing_errors.jsonl`.
    """
    path_parts = ("processing_errors", "processing_errors.jsonl")
    return fh.joinuri(working_folder, *path_parts)
28
+
29
+
30
def dump_feedback_errors(
    working_folder: URI,
    step_name: DVEStageName,
    messages: Messages,
    key_fields: Optional[dict[str, list[str]]] = None,
) -> URI:
    """Append captured feedback error messages to the step's json lines file.

    Args:
        working_folder: Base location passed to `get_feedback_errors_uri`.
        step_name: Name of the step the messages were raised in.
        messages: Messages to serialise; each is converted with `to_dict`.
        key_fields: Optional mapping of entity name to the key field names
            used to build each record's "Key" value.

    Returns:
        The URI of the error file that was appended to.

    Raises:
        AttributeError: If `working_folder` is falsy.
    """
    if not working_folder:
        raise AttributeError("processed files path not passed")

    if not key_fields:
        key_fields = {}

    error_file = get_feedback_errors_uri(working_folder, step_name)
    processed = []

    for message in messages:
        # Look up key fields by the original entity first, then by the entity.
        if message.original_entity is not None:
            primary_keys = key_fields.get(message.original_entity, [])
        elif message.entity is not None:
            primary_keys = key_fields.get(message.entity, [])
        else:
            primary_keys = []

        error = message.to_dict(
            key_field=primary_keys,
            value_separator=" -- ",
            max_number_of_values=10,
            record_converter=None,
        )
        error["Key"] = conditional_cast(error["Key"], primary_keys, value_separator=" -- ")
        processed.append(error)

    with fh.open_stream(error_file, "a") as f:
        # Terminate every record with its own newline instead of joining and
        # appending one: an empty batch then writes nothing, where it used to
        # write a lone blank line that is not valid JSON when read back.
        f.write("".join(json.dumps(rec, default=str) + "\n" for rec in processed))
    return error_file
66
+
67
+
68
def dump_processing_errors(
    working_folder: URI, step_name: str, errors: list[CriticalProcessingError]
):
    """Append critical processing errors to the processing errors json lines file.

    Every error becomes one JSON record carrying the step name, a fixed
    location ("processing") and level ("integrity"), the error message and the
    error's `messages` attribute. Returns the URI of the file written to.

    Raises AttributeError when any of the three arguments is falsy.
    """
    if not working_folder:
        raise AttributeError("processed files path not passed")
    if not step_name:
        raise AttributeError("step name not passed")
    if not errors:
        raise AttributeError("errors list not passed")

    target: URI = get_processing_errors_uri(working_folder)

    records = [
        {
            "step_name": step_name,
            "error_location": "processing",
            "error_level": "integrity",
            "error_message": failure.error_message,
            "error_traceback": failure.messages,
        }
        for failure in errors
    ]

    with fh.open_stream(target, "a") as stream:
        serialised = [json.dumps(record, default=str) for record in records]
        stream.write("\n".join(serialised) + "\n")

    return target
97
+
98
+
99
def load_feedback_messages(feedback_messages_uri: URI) -> Iterable[UserMessage]:
    """Lazily load user messages from a jsonl file.

    Yields one `UserMessage` per non-blank line. Yields nothing when the
    resource does not exist.
    """
    if not fh.get_resource_exists(feedback_messages_uri):
        return
    with fh.open_stream(feedback_messages_uri) as errs:
        # Iterate the stream line by line rather than calling readlines(), so
        # the whole file is not held in memory at once.
        for line in errs:
            # Skip blank lines: they are not valid JSON and would otherwise
            # abort the whole load with a JSONDecodeError.
            if not line.strip():
                continue
            yield UserMessage(**json.loads(line))
105
+
106
+
107
def load_all_error_messages(error_directory_uri: URI) -> Iterable[UserMessage]:
    """Load user messages from every jsonl file found under a directory URI.

    Entries returned by `fh.iter_prefix` whose name does not end in `.jsonl`
    are ignored; the messages of the remaining files are chained together.
    """
    loaders = []
    for err_file, _ in fh.iter_prefix(error_directory_uri):
        if err_file.endswith(".jsonl"):
            loaders.append(load_feedback_messages(err_file))
    return chain.from_iterable(loaders)
116
+
117
+
118
class BackgroundMessageWriter:
    """Controls batch writes to error jsonl files.

    Used as a context manager: entering starts a background thread that takes
    batches of messages from `write_queue` and appends them to the stage's
    error file with `dump_feedback_errors`. Exiting puts `None` on the queue
    to stop the thread and then joins it.
    """

    def __init__(
        self,
        working_directory: URI,
        dve_stage: DVEStageName,
        key_fields: Optional[dict[str, list[str]]] = None,
        logger: Optional[logging.Logger] = None,
    ):
        self._working_directory = working_directory
        self._dve_stage = dve_stage
        self._feedback_message_uri = get_feedback_errors_uri(
            self._working_directory, self._dve_stage
        )
        self._key_fields = key_fields
        self.logger = logger or get_logger(type(self).__name__)
        # Created lazily by the `write_thread` property.
        self._write_thread: Optional[Thread] = None
        self._queue: Queue = Queue()

    @property
    def write_queue(self) -> Queue:  # type: ignore
        """Queue for storing batches of messages to be written"""
        return self._queue

    @property
    def write_thread(self) -> Thread:  # type: ignore
        """Thread to write batches of messages to jsonl file"""
        if not self._write_thread:
            self._write_thread = Thread(target=self._write_process_wrapper)
        return self._write_thread

    def _write_process_wrapper(self):
        """Write queued batches until the `None` sentinel is received."""
        while True:
            # get() blocks until an item is available on the queue
            msgs = self.write_queue.get()
            # Only the explicit None sentinel stops the writer. An empty batch
            # must not: stopping on any falsy item would end the thread early
            # and silently drop every batch queued afterwards.
            if msgs is None:
                break
            if msgs:
                dump_feedback_errors(
                    self._working_directory, self._dve_stage, msgs, self._key_fields
                )

    def __enter__(self) -> "BackgroundMessageWriter":
        self.write_thread.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if exc_type:
            self.logger.exception(
                "Issue occured during background write process:",
                exc_info=(exc_type, exc_value, traceback),
            )
        # None value in queue will trigger break in target
        self.write_queue.put(None)
        self.write_thread.join()
174
+
175
+
176
def conditional_cast(value, primary_keys: list[str], value_separator: str) -> Union[list[str], str]:
    """Convert a value coming back from the error list into its string form.

    - dict: an empty string.
    - date (including datetime): its ISO 8601 representation.
    - list: every element is converted recursively, paired positionally with
      `primary_keys` and rendered as "<key>: <value>" (or an empty string when
      the key name is empty); the pieces are joined with `value_separator`.
      Elements without a matching key name are dropped by the pairing.
    - anything else: `str(value)`.
    """
    if isinstance(value, dict):
        return ""
    if isinstance(value, dt.date):
        return value.isoformat()
    if not isinstance(value, list):
        return str(value)

    # Convert every element before pairing with the key names.
    converted = [
        conditional_cast(element, primary_keys, value_separator) for element in value
    ]  # type: ignore
    labelled = []
    for key_name, text in zip(primary_keys, converted):
        labelled.append(f"{key_name}: {text}" if key_name else "")
    return value_separator.join(labelled)
File without changes
@@ -0,0 +1 @@
1
+ """Backend implementations for the core engine."""
@@ -0,0 +1 @@
1
+ """The base backend implementation."""