gchq-data-quality 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. gchq_data_quality-1.0.0/PKG-INFO +278 -0
  2. gchq_data_quality-1.0.0/README.md +245 -0
  3. gchq_data_quality-1.0.0/pyproject.toml +87 -0
  4. gchq_data_quality-1.0.0/src/gchq_data_quality/__init__.py +71 -0
  5. gchq_data_quality-1.0.0/src/gchq_data_quality/config.py +531 -0
  6. gchq_data_quality-1.0.0/src/gchq_data_quality/errors.py +3 -0
  7. gchq_data_quality-1.0.0/src/gchq_data_quality/globals.py +38 -0
  8. gchq_data_quality-1.0.0/src/gchq_data_quality/models.py +188 -0
  9. gchq_data_quality-1.0.0/src/gchq_data_quality/results/__init__.py +1 -0
  10. gchq_data_quality-1.0.0/src/gchq_data_quality/results/models.py +446 -0
  11. gchq_data_quality-1.0.0/src/gchq_data_quality/results/utils.py +177 -0
  12. gchq_data_quality-1.0.0/src/gchq_data_quality/rules/__init__.py +69 -0
  13. gchq_data_quality-1.0.0/src/gchq_data_quality/rules/accuracy.py +78 -0
  14. gchq_data_quality-1.0.0/src/gchq_data_quality/rules/base.py +508 -0
  15. gchq_data_quality-1.0.0/src/gchq_data_quality/rules/completeness.py +58 -0
  16. gchq_data_quality-1.0.0/src/gchq_data_quality/rules/consistency.py +172 -0
  17. gchq_data_quality-1.0.0/src/gchq_data_quality/rules/timeliness.py +307 -0
  18. gchq_data_quality-1.0.0/src/gchq_data_quality/rules/uniqueness.py +167 -0
  19. gchq_data_quality-1.0.0/src/gchq_data_quality/rules/utils/__init__.py +1 -0
  20. gchq_data_quality-1.0.0/src/gchq_data_quality/rules/utils/datetime_utils.py +97 -0
  21. gchq_data_quality-1.0.0/src/gchq_data_quality/rules/utils/rules_utils.py +189 -0
  22. gchq_data_quality-1.0.0/src/gchq_data_quality/rules/validity.py +181 -0
  23. gchq_data_quality-1.0.0/src/gchq_data_quality/spark/__init__.py +1 -0
  24. gchq_data_quality-1.0.0/src/gchq_data_quality/spark/dataframe_operations.py +373 -0
  25. gchq_data_quality-1.0.0/src/gchq_data_quality/spark/models.py +47 -0
  26. gchq_data_quality-1.0.0/src/gchq_data_quality/spark/utils/__init__.py +1 -0
  27. gchq_data_quality-1.0.0/src/gchq_data_quality/spark/utils/results_utils.py +61 -0
  28. gchq_data_quality-1.0.0/src/gchq_data_quality/spark/utils/rules_utils.py +127 -0
@@ -0,0 +1,278 @@
1
+ Metadata-Version: 2.4
2
+ Name: gchq-data-quality
3
+ Version: 1.0.0
4
+ Summary: Tools for data quality measurement in Pandas and Spark
5
+ Keywords: data quality
6
+ Author: GCHQ
7
+ Author-email: GCHQ <oss@gchq.gov.uk>
8
+ License-Expression: Apache-2.0
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: License :: OSI Approved :: Apache Software License
11
+ Classifier: Natural Language :: English
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Programming Language :: Python :: 3.14
16
+ Classifier: Intended Audience :: Developers
17
+ Classifier: Topic :: Utilities
18
+ Requires-Dist: numpy>=2.4.1
19
+ Requires-Dist: pandas>=2.2.3,<3
20
+ Requires-Dist: pydantic>=2.9
21
+ Requires-Dist: pyyaml>=6.0.3
22
+ Requires-Dist: elasticsearch>=8.17 ; extra == 'elasticsearch'
23
+ Requires-Dist: pyspark>=3.5 ; extra == 'pyspark'
24
+ Requires-Dist: pyarrow>=23.0.0 ; extra == 'pyspark'
25
+ Requires-Python: >=3.11
26
+ Project-URL: Documentation, https://gchq.github.io/gchq-data-quality/
27
+ Project-URL: Homepage, https://github.com/gchq/gchq-data-quality
28
+ Project-URL: Issues, https://github.com/gchq/gchq-data-quality/issues
29
+ Project-URL: Repository, https://github.com/gchq/gchq-data-quality
30
+ Provides-Extra: elasticsearch
31
+ Provides-Extra: pyspark
32
+ Description-Content-Type: text/markdown
33
+
34
+ # GCHQ Data Quality Package
35
+
36
+ ## Why build our own Data Quality Package?
37
+ There are a number of existing commercial and opensource data quality (DQ) packages already available. We created our own for five reasons:
38
+ ### 1. Opensource under Apache 2.0
39
+
40
+ For permissive use.
41
+
42
+ ### 2. Simplicity.
43
+
44
+ We opted for a simple 'plug-in' approach to DQ measurement to speed up adoption by Engineering teams:
45
+
46
+ 1. Get your data into a dataframe (Pandas or Spark)
47
+ 2. Define some rules (using only 8 functions)
48
+ 3. It will output a dataframe of results
49
+
50
+ How you handle the workloads either side is up to you (connecting to data, scheduling, sampling, dashboarding, alerting etc.)
51
+
52
+ We deliberately ignore connections to SQL / MongoDB etc, as all of those can, via an SDK, get data into a dataframe.
53
+
54
+ ### 3. Handle Nested Data
55
+
56
+ A lot of our data is nested. No other open-source data quality package handles nested data (that we could find).
57
+
58
+ ### 4. Better Data Quality rules for comparing two values.
59
+
60
+ We wanted something that:
61
+
62
+ 1. Gave us granular comparisons around time (such as an event happening within a time window relative to another date - important for us)
63
+ 2. Used pandas.eval() syntax to provide a huge range of flexible rules with logical operators, summary statistics, string and datetime operations. We maximise the use of this powerful syntax, without complicating our code
64
+
65
+ ### 5. Designed for Insights
66
+
67
+ Other packages are designed with an Engineering mindset - it's about the number of rules that pass or fail. This isn't great for diagnosing the root cause of data quality issues. Our results format is a flat table, with enough metadata to make it easy to:
68
+
69
+ 1. Build an insightful dashboard
70
+ 2. Work out the cause of the DQ problem
71
+
72
+ ## Tutorials
73
+
74
+ There are tutorial notebooks to guide you through using the code. See the tutorials/ directory to find notebooks to download.
75
+
76
+ ## Orientation
77
+
78
+ ### Language
79
+ Consistent verbs / descriptions for both the code and the output are used.
80
+
81
+ A dataset is the data you are measuring, it may have a name and an ID.
82
+ Data Quality is formed from a set of Rules - these are **evaluated** and can be passed or failed by a record in your dataset.
83
+
84
+ Rules are combined into a data quality configuration, which is **executed** on a dataset, and explains when the measurement happened, at what stage of the lifecycle etc.
85
+
86
+ The measure we use is the pass rate (records passing / records evaluated). This makes it distinct from a score
87
+ which suggests weighting. Users are free to take the pass rate and create weighted scores if they wish, although we do not recommend doing this as it is more confusing to interpret.
88
+
89
+ # Acknowledgements
90
+ We are grateful to DAMA-UK (Data Management Association, UK Chapter) for granting us permission to reference and use their Data Quality Dimensions throughout the tutorials and code.
91
+ Source: DAMA International® (2017) DAMA-DMBOK®: Data Management Body of Knowledge.
92
+ ISBN 9781634622349. Copyright registered June 18, 2018 (TX0008608498).
93
+ Rights and permissions: DAMA International®, 2512 E. Evergreen Blvd, #1023,
94
+ Vancouver, WA 98661, United States. (973) 625-4347.
95
+
96
+
97
+ # Data Quality Tools — Python Quickstart
98
+
99
+ ## Basic Usage
100
+
101
+ ### 1. Import Rules
102
+
103
+ Import the rule classes and supporting components:
104
+
105
+ ```python
106
+ import pandas as pd
107
+ from gchq_data_quality import (
108
+ UniquenessRule,
109
+ CompletenessRule,
110
+ ValidityRegexRule,
111
+ DataQualityConfig,
112
+ )
113
+ ```
114
+
115
+ ### 2. Create Example Data
116
+
117
+ ```python
118
+ df = pd.DataFrame({
119
+ "id": [1, 2, 2, 3],
120
+ "email": ["a@example.com", "b@sample.com", "invalid@", None],
121
+ })
122
+ ```
123
+
124
+ ### 3. Define and Evaluate Quality Rules
125
+
126
+ Instantiate each rule for the field of interest, and evaluate against your DataFrame:
127
+
128
+ ```python
129
+ # Uniqueness
130
+ uniqueness_rule = UniquenessRule(field="id")
131
+ uniqueness_result = uniqueness_rule.evaluate(df)
132
+ print("Uniqueness:", uniqueness_result.pass_rate) # e.g. 0.75 if 3/4 unique
133
+
134
+ # Completeness
135
+ completeness_rule = CompletenessRule(field="email")
136
+ completeness_result = completeness_rule.evaluate(df)
137
+ print("Completeness:", completeness_result.pass_rate)
138
+
139
+ # Validity using regular expressions
140
+ validity_rule = ValidityRegexRule(
141
+ field="email", regex_pattern=r"^[^@\s]+@[^@\s]+\.[^@\s]+$"
142
+ )
143
+ validity_result = validity_rule.evaluate(df)
144
+ print("Email validity:", validity_result.pass_rate)
145
+ ```
146
+
147
+ ### 4. Running Multiple Rules via a Config Object
148
+
149
+ You can bundle your rules and dataset details in a `DataQualityConfig`, execute them all at once, and get a tabular report:
150
+
151
+ ```python
152
+ dq_config = DataQualityConfig(
153
+ dataset_name="My Example Dataset",
154
+ rules=[
155
+ uniqueness_rule,
156
+ completeness_rule,
157
+ validity_rule,
158
+ ],
159
+ )
160
+ dq_report = dq_config.execute(df)
161
+ print(dq_report.to_dataframe())
162
+ ```
163
+
164
+ ## More Rule Types
165
+
166
+ Other measurements are available (AccuracyRule, ValidityNumericalRangeRule, ConsistencyRule, TimelinessStaticRule, etc.) — see notebook examples and code documentation for details.
167
+
168
+ ## YAML Configurations
169
+
170
+ For defining rules in YAML and running configs across multiple datasets, see the advanced Python 2 tutorial.
171
+
172
+ ### Spark dataframes
173
+
174
+ The same approach is used in Spark dataframes, and with Spark you can also handle nested data.
175
+ ```python
176
+ from pyspark.sql import SparkSession
177
+ from pyspark.sql.types import ArrayType, IntegerType, StructField, StructType
178
+ from gchq_data_quality import DataQualityConfig
179
+ from gchq_data_quality.spark.dataframe_operations import flatten_spark
180
+
181
+ spark = SparkSession.builder.getOrCreate()
182
+
183
+ # Create nested example data: 2 parents, one with 1 child, one with 2
184
+ data = [
185
+ {
186
+ "parent": {
187
+ "age": 40,
188
+ "children": [{"age": 10}]
189
+ }
190
+ },
191
+ {
192
+ "parent": {
193
+ "age": 35,
194
+ "children": [{"age": 7}, {"age": 5}]
195
+ }
196
+ }
197
+ ]
198
+
199
+ schema = StructType([
200
+ StructField("parent", StructType([
201
+ StructField("age", IntegerType(), True),
202
+ StructField("children", ArrayType(StructType([
203
+ StructField("age", IntegerType(), True)
204
+ ])), True),
205
+ ]), True)
206
+ ])
207
+
208
+ spark = SparkSession.builder.getOrCreate()
209
+ spark_df = spark.createDataFrame(data, schema=schema)
210
+
211
+ # Optionally flatten nested columns to inspect structure - worth checking it's what you expected. Deeply nested data can get quite confusing.
212
+ df_flat = flatten_spark(
213
+ spark_df, flatten_cols=["parent.age", "parent.children[*].age"]
214
+ )
215
+ df_flat.show()
216
+
217
+ # Load DQ rules from YAML config (rules.yaml), e.g., we might run a consistency rule to check that `parent.age` > `parent.children[*].age`
218
+ # i.e. all parents are older than all of their children
219
+ # See the tutorial for examples
220
+
221
+ dq_config = DataQualityConfig.from_yaml("rules.yaml")
222
+
223
+ # Run all configured rules
224
+ dq_report = dq_config.execute(spark_df)
225
+ print(dq_report.to_dataframe())
226
+
227
+ spark.stop()
228
+ ```
229
+
230
+ # Data Quality Overview
231
+
232
+ A brief overview of what we mean when we say 'data quality'
233
+ ## What is Data Quality?
234
+
235
+ Data quality refers to how well data meets the expectations and needs of its consumers.
236
+ Data is considered **high quality** when it is **fit-for-purpose** – meaning it supports the intended use effectively and reliably.
237
+
238
+ High-quality data provides:
239
+ - Assurance of compliance (policy or law).
240
+ - Confidence that analysts are receiving accurate and usable information.
241
+ - Early detection of data processing issues, such as:
242
+ - Faulty data pipelines
243
+ - Upstream schema changes
244
+ - Poor data entry
245
+ - Time-based anomalies (e.g., reduced data quality at weekends)
246
+
247
+ That's all quite theoretical. But in a practical sense, we define it as the percentage of records that pass a specific data quality rule. It's a 'unit test' for data.
248
+
249
+ ## Key Dimensions of Data Quality
250
+
251
+ We measure data quality by the percentage of records that pass each defined rule.
252
+ Rules are grouped under the **DAMA Framework’s six dimensions**:
253
+
254
+ 1. **Uniqueness** – No duplicates (e.g., every ID should be unique).
255
+ 2. **Completeness** – Data fields are present and not empty (e.g., no NULL or ‘N/A’ values).
256
+ 3. **Accuracy** – Values correctly describe the real-world object or event, usually checked against an authoritative dataset. e.g. 'country' is an actual ISO country code.
257
+ 4. **Validity** – Values conform to the required format, type, or range (e.g., valid email address, age within a reasonable range).
258
+ 5. **Timeliness** – Data is up-to-date and time values make sense (e.g., birth dates are in the past).
259
+ 6. **Consistency** – Logical consistency within or across datasets (e.g., date of birth should be before date of death).
260
+
261
+ ## How We Use Data Quality Measures
262
+
263
+ - Each rule produces a score between `0` and `1` based on the proportion of records that pass.
264
+ - Scores can be monitored over time to detect trends and changes.
265
+ - Scores can be aggregated over the DAMA Dimensions, over fields or source data (makes it easy to dashboard)
266
+ - Drops or spikes in scores may signal problems in:
267
+ - Extraction and transformation pipelines
268
+ - Upstream schema changes
269
+ - The relative changes to score across the dimensions can indicate what type of issue it is. e.g. a drop in uniqueness but no other change, suggests being given duplicate data. A drop in completeness, but not in validity, suggests a problem at data entry, with fields not being populated.
270
+
271
+ ## Why Data Quality Matters
272
+
273
+ Good data quality supports:
274
+ - More confident decision-making (higher assurance of underlying data)
275
+ - Reduced confusion from duplicate or invalid records
276
+ - More efficient data engineering work (if data is high quality, then joining, transformations, enrichment become easier)
277
+
278
+
@@ -0,0 +1,245 @@
1
+ # GCHQ Data Quality Package
2
+
3
+ ## Why build our own Data Quality Package?
4
+ There are a number of existing commercial and opensource data quality (DQ) packages already available. We created our own for five reasons:
5
+ ### 1. Opensource under Apache 2.0
6
+
7
+ For permissive use.
8
+
9
+ ### 2. Simplicity.
10
+
11
+ We opted for a simple 'plug-in' approach to DQ measurement to speed up adoption by Engineering teams:
12
+
13
+ 1. Get your data into a dataframe (Pandas or Spark)
14
+ 2. Define some rules (using only 8 functions)
15
+ 3. It will output a dataframe of results
16
+
17
+ How you handle the workloads either side is up to you (connecting to data, scheduling, sampling, dashboarding, alerting etc.)
18
+
19
+ We deliberately ignore connections to SQL / MongoDB etc, as all of those can, via an SDK, get data into a dataframe.
20
+
21
+ ### 3. Handle Nested Data
22
+
23
+ A lot of our data is nested. No other open-source data quality package handles nested data (that we could find).
24
+
25
+ ### 4. Better Data Quality rules for comparing two values.
26
+
27
+ We wanted something that:
28
+
29
+ 1. Gave us granular comparisons around time (such as an event happening within a time window relative to another date - important for us)
30
+ 2. Used pandas.eval() syntax to provide a huge range of flexible rules with logical operators, summary statistics, string and datetime operations. We maximise the use of this powerful syntax, without complicating our code
31
+
32
+ ### 5. Designed for Insights
33
+
34
+ Other packages are designed with an Engineering mindset - it's about the number of rules that pass or fail. This isn't great for diagnosing the root cause of data quality issues. Our results format is a flat table, with enough metadata to make it easy to:
35
+
36
+ 1. Build an insightful dashboard
37
+ 2. Work out the cause of the DQ problem
38
+
39
+ ## Tutorials
40
+
41
+ There are tutorial notebooks to guide you through using the code. See the tutorials/ directory to find notebooks to download.
42
+
43
+ ## Orientation
44
+
45
+ ### Language
46
+ Consistent verbs / descriptions for both the code and the output are used.
47
+
48
+ A dataset is the data you are measuring, it may have a name and an ID.
49
+ Data Quality is formed from a set of Rules - these are **evaluated** and can be passed or failed by a record in your dataset.
50
+
51
+ Rules are combined into a data quality configuration, which is **executed** on a dataset, and explains when the measurement happened, at what stage of the lifecycle etc.
52
+
53
+ The measure we use is the pass rate (records passing / records evaluated). This makes it distinct from a score
54
+ which suggests weighting. Users are free to take the pass rate and create weighted scores if they wish, although we do not recommend doing this as it is more confusing to interpret.
55
+
56
+ # Acknowledgements
57
+ We are grateful to DAMA-UK (Data Management Association, UK Chapter) for granting us permission to reference and use their Data Quality Dimensions throughout the tutorials and code.
58
+ Source: DAMA International® (2017) DAMA-DMBOK®: Data Management Body of Knowledge.
59
+ ISBN 9781634622349. Copyright registered June 18, 2018 (TX0008608498).
60
+ Rights and permissions: DAMA International®, 2512 E. Evergreen Blvd, #1023,
61
+ Vancouver, WA 98661, United States. (973) 625-4347.
62
+
63
+
64
+ # Data Quality Tools — Python Quickstart
65
+
66
+ ## Basic Usage
67
+
68
+ ### 1. Import Rules
69
+
70
+ Import the rule classes and supporting components:
71
+
72
+ ```python
73
+ import pandas as pd
74
+ from gchq_data_quality import (
75
+ UniquenessRule,
76
+ CompletenessRule,
77
+ ValidityRegexRule,
78
+ DataQualityConfig,
79
+ )
80
+ ```
81
+
82
+ ### 2. Create Example Data
83
+
84
+ ```python
85
+ df = pd.DataFrame({
86
+ "id": [1, 2, 2, 3],
87
+ "email": ["a@example.com", "b@sample.com", "invalid@", None],
88
+ })
89
+ ```
90
+
91
+ ### 3. Define and Evaluate Quality Rules
92
+
93
+ Instantiate each rule for the field of interest, and evaluate against your DataFrame:
94
+
95
+ ```python
96
+ # Uniqueness
97
+ uniqueness_rule = UniquenessRule(field="id")
98
+ uniqueness_result = uniqueness_rule.evaluate(df)
99
+ print("Uniqueness:", uniqueness_result.pass_rate) # e.g. 0.75 if 3/4 unique
100
+
101
+ # Completeness
102
+ completeness_rule = CompletenessRule(field="email")
103
+ completeness_result = completeness_rule.evaluate(df)
104
+ print("Completeness:", completeness_result.pass_rate)
105
+
106
+ # Validity using regular expressions
107
+ validity_rule = ValidityRegexRule(
108
+ field="email", regex_pattern=r"^[^@\s]+@[^@\s]+\.[^@\s]+$"
109
+ )
110
+ validity_result = validity_rule.evaluate(df)
111
+ print("Email validity:", validity_result.pass_rate)
112
+ ```
113
+
114
+ ### 4. Running Multiple Rules via a Config Object
115
+
116
+ You can bundle your rules and dataset details in a `DataQualityConfig`, execute them all at once, and get a tabular report:
117
+
118
+ ```python
119
+ dq_config = DataQualityConfig(
120
+ dataset_name="My Example Dataset",
121
+ rules=[
122
+ uniqueness_rule,
123
+ completeness_rule,
124
+ validity_rule,
125
+ ],
126
+ )
127
+ dq_report = dq_config.execute(df)
128
+ print(dq_report.to_dataframe())
129
+ ```
130
+
131
+ ## More Rule Types
132
+
133
+ Other measurements are available (AccuracyRule, ValidityNumericalRangeRule, ConsistencyRule, TimelinessStaticRule, etc.) — see notebook examples and code documentation for details.
134
+
135
+ ## YAML Configurations
136
+
137
+ For defining rules in YAML and running configs across multiple datasets, see the advanced Python 2 tutorial.
138
+
139
+ ### Spark dataframes
140
+
141
+ The same approach is used in Spark dataframes, and with Spark you can also handle nested data.
142
+ ```python
143
+ from pyspark.sql import SparkSession
144
+ from pyspark.sql.types import ArrayType, IntegerType, StructField, StructType
145
+ from gchq_data_quality import DataQualityConfig
146
+ from gchq_data_quality.spark.dataframe_operations import flatten_spark
147
+
148
+ spark = SparkSession.builder.getOrCreate()
149
+
150
+ # Create nested example data: 2 parents, one with 1 child, one with 2
151
+ data = [
152
+ {
153
+ "parent": {
154
+ "age": 40,
155
+ "children": [{"age": 10}]
156
+ }
157
+ },
158
+ {
159
+ "parent": {
160
+ "age": 35,
161
+ "children": [{"age": 7}, {"age": 5}]
162
+ }
163
+ }
164
+ ]
165
+
166
+ schema = StructType([
167
+ StructField("parent", StructType([
168
+ StructField("age", IntegerType(), True),
169
+ StructField("children", ArrayType(StructType([
170
+ StructField("age", IntegerType(), True)
171
+ ])), True),
172
+ ]), True)
173
+ ])
174
+
175
+ spark = SparkSession.builder.getOrCreate()
176
+ spark_df = spark.createDataFrame(data, schema=schema)
177
+
178
+ # Optionally flatten nested columns to inspect structure - worth checking it's what you expected. Deeply nested data can get quite confusing.
179
+ df_flat = flatten_spark(
180
+ spark_df, flatten_cols=["parent.age", "parent.children[*].age"]
181
+ )
182
+ df_flat.show()
183
+
184
+ # Load DQ rules from YAML config (rules.yaml), e.g., we might run a consistency rule to check that `parent.age` > `parent.children[*].age`
185
+ # i.e. all parents are older than all of their children
186
+ # See the tutorial for examples
187
+
188
+ dq_config = DataQualityConfig.from_yaml("rules.yaml")
189
+
190
+ # Run all configured rules
191
+ dq_report = dq_config.execute(spark_df)
192
+ print(dq_report.to_dataframe())
193
+
194
+ spark.stop()
195
+ ```
196
+
197
+ # Data Quality Overview
198
+
199
+ A brief overview of what we mean when we say 'data quality'
200
+ ## What is Data Quality?
201
+
202
+ Data quality refers to how well data meets the expectations and needs of its consumers.
203
+ Data is considered **high quality** when it is **fit-for-purpose** – meaning it supports the intended use effectively and reliably.
204
+
205
+ High-quality data provides:
206
+ - Assurance of compliance (policy or law).
207
+ - Confidence that analysts are receiving accurate and usable information.
208
+ - Early detection of data processing issues, such as:
209
+ - Faulty data pipelines
210
+ - Upstream schema changes
211
+ - Poor data entry
212
+ - Time-based anomalies (e.g., reduced data quality at weekends)
213
+
214
+ That's all quite theoretical. But in a practical sense, we define it as the percentage of records that pass a specific data quality rule. It's a 'unit test' for data.
215
+
216
+ ## Key Dimensions of Data Quality
217
+
218
+ We measure data quality by the percentage of records that pass each defined rule.
219
+ Rules are grouped under the **DAMA Framework’s six dimensions**:
220
+
221
+ 1. **Uniqueness** – No duplicates (e.g., every ID should be unique).
222
+ 2. **Completeness** – Data fields are present and not empty (e.g., no NULL or ‘N/A’ values).
223
+ 3. **Accuracy** – Values correctly describe the real-world object or event, usually checked against an authoritative dataset. e.g. 'country' is an actual ISO country code.
224
+ 4. **Validity** – Values conform to the required format, type, or range (e.g., valid email address, age within a reasonable range).
225
+ 5. **Timeliness** – Data is up-to-date and time values make sense (e.g., birth dates are in the past).
226
+ 6. **Consistency** – Logical consistency within or across datasets (e.g., date of birth should be before date of death).
227
+
228
+ ## How We Use Data Quality Measures
229
+
230
+ - Each rule produces a score between `0` and `1` based on the proportion of records that pass.
231
+ - Scores can be monitored over time to detect trends and changes.
232
+ - Scores can be aggregated over the DAMA Dimensions, over fields or source data (makes it easy to dashboard)
233
+ - Drops or spikes in scores may signal problems in:
234
+ - Extraction and transformation pipelines
235
+ - Upstream schema changes
236
+ - The relative changes to score across the dimensions can indicate what type of issue it is. e.g. a drop in uniqueness but no other change, suggests being given duplicate data. A drop in completeness, but not in validity, suggests a problem at data entry, with fields not being populated.
237
+
238
+ ## Why Data Quality Matters
239
+
240
+ Good data quality supports:
241
+ - More confident decision-making (higher assurance of underlying data)
242
+ - Reduced confusion from duplicate or invalid records
243
+ - More efficient data engineering work (if data is high quality, then joining, transformations, enrichment become easier)
244
+
245
+
@@ -0,0 +1,87 @@
1
+ [build-system]
2
+ requires = ["uv_build>=0.7.11,<0.8.0"]
3
+ build-backend = "uv_build"
4
+
5
+ [project]
6
+ name = "gchq-data-quality"
7
+ version = "1.0.0"
8
+ description = "Tools for data quality measurement in Pandas and Spark"
9
+ requires-python = ">=3.11"
10
+ keywords = ["data quality"]
11
+ classifiers = [
12
+ "Development Status :: 4 - Beta",
13
+ "License :: OSI Approved :: Apache Software License",
14
+ "Natural Language :: English",
15
+ "Programming Language :: Python :: 3.11",
16
+ "Programming Language :: Python :: 3.12",
17
+ "Programming Language :: Python :: 3.13",
18
+ "Programming Language :: Python :: 3.14",
19
+ "Intended Audience :: Developers",
20
+ "Topic :: Utilities",
21
+ ]
22
+
23
+ dependencies = [
24
+ "numpy>=2.4.1",
25
+ "pandas>=2.2.3,<3",
26
+ "pydantic>=2.9",
27
+ "pyyaml>=6.0.3",
28
+ ]
29
+
30
+
31
+ authors = [{ name = "GCHQ", email = "oss@gchq.gov.uk" }]
32
+
33
+ readme = "README.md"
34
+ license = "Apache-2.0"
35
+
36
+ [project.urls]
37
+ Homepage = "https://github.com/gchq/gchq-data-quality"
38
+ Issues = "https://github.com/gchq/gchq-data-quality/issues"
39
+ Documentation = "https://gchq.github.io/gchq-data-quality/"
40
+ Repository = "https://github.com/gchq/gchq-data-quality"
41
+
42
+ [project.optional-dependencies]
43
+ pyspark = ["pyspark>=3.5", "pyarrow>=23.0.0"]
44
+
45
+ elasticsearch = ["elasticsearch>=8.17"]
46
+
47
+ [dependency-groups]
48
+ dev = [
49
+ "ruff>=0.9, <1",
50
+ "pre-commit>=3.0",
51
+ "nbconvert>=7.17.0,<8",
52
+ "ipykernel>=7.1.0",
53
+ { include-group = "docs" },
54
+ { include-group = "test" },
55
+ ]
56
+
57
+ test = ["pytest>=7.0", "coverage>=7.9.2,<8"]
58
+ docs = ["mkdocs", "mkdocs-material", "mkdocstrings[python]"]
59
+
60
+ [tool.pytest.ini_options]
61
+ testpaths = ["tests"]
62
+
63
+
64
+ [tool.ruff]
65
+ line-length = 88
66
+
67
+ [tool.ruff.lint]
68
+ select = ["F", "E", "W", "I", "N", "UP", "S", "B", "COM", "ANN"]
69
+ ignore = ["E501", "COM812", "N813", "N806"]
70
+ fixable = ["ALL"]
71
+
72
+ [tool.ruff.lint.isort]
73
+ combine-as-imports = true
74
+
75
+ [tool.ruff.lint.per-file-ignores]
76
+ "tests/*" = ["S101"]
77
+
78
+ [tool.ruff.format]
79
+ quote-style = "double"
80
+ indent-style = "space"
81
+ skip-magic-trailing-comma = false
82
+ line-ending = "auto"
83
+ docstring-code-format = false
84
+ docstring-code-line-length = "dynamic"
85
+
86
+ [tool.ruff.lint.flake8-bugbear]
87
+ extend-immutable-calls = ["aws_cdk.Duration.minutes"]
@@ -0,0 +1,71 @@
1
+ # (c) Crown Copyright GCHQ \n
2
+ """
3
+ Data quality rule definitions for the gchq_data_quality framework.
4
+
5
+ This module provides data quality rules for the 6 DAMA Dimensions of Data Quality:
6
+ - Uniqueness
7
+ - Accuracy
8
+ - Completeness
9
+ - Validity
10
+ - Consistency
11
+ - Timeliness
12
+
13
+ They inherit from a core BaseRule class. All data quality evaluation is built on a consistent method:
14
+ 1. Determine the records that are evaluated (as a boolean mask) - records_evaluated_mask
15
+ The total records evaluated here is then the sum of the mask.
16
+ 2. Determine the records that pass the rule (as a boolean mask) - records_passing_mask
17
+ The count of records_passing is the sum of records_passing_mask AND records_evaluated_mask
18
+ (for various reasons you can have records passing a rule that are not in the evaluation set, e.g. they are NULL)
19
+
20
+ The pass_rate is then records_passing / records_evaluated
21
+
22
+ You can see the mechanisms in each rule primarily by looking at the masks that are created. The metrics derived from these
23
+ masks are typically the same for every rule type and are specified in BaseRule.
24
+
25
+ Available rule classes:
26
+ - UniquenessRule
27
+ - AccuracyRule
28
+ - CompletenessRule
29
+ - ConsistencyRule
30
+ - TimelinessRelativeRule
31
+ - TimelinessStaticRule
32
+ - ValidityNumericalRangeRule
33
+ - ValidityRegexRule
34
+
35
+ Whilst the user can call these rules and evaluate them against a dataframe
36
+
37
+ UniquenessRule.evaluate(df)
38
+
39
+ The intention of the package is that multiple rules are wrapped up into a DataQualityConfig class
40
+ and executed together against a dataframe.
41
+ """
42
+
43
+ from gchq_data_quality.rules.uniqueness import UniquenessRule # noqa
44
+ from gchq_data_quality.rules.accuracy import AccuracyRule # noqa
45
+ from gchq_data_quality.rules.completeness import CompletenessRule # noqa
46
+ from gchq_data_quality.rules.consistency import ConsistencyRule # noqa
47
+ from gchq_data_quality.rules.timeliness import (
48
+ TimelinessRelativeRule,
49
+ TimelinessStaticRule,
50
+ ) # noqa
51
+ from gchq_data_quality.rules.validity import (
52
+ ValidityNumericalRangeRule,
53
+ ValidityRegexRule,
54
+ ) # noqa
55
+
56
+ from gchq_data_quality.config import DataQualityConfig
57
+
58
+ from gchq_data_quality.results.models import DataQualityReport
59
+
60
+ __all__ = [
61
+ "UniquenessRule",
62
+ "AccuracyRule",
63
+ "CompletenessRule",
64
+ "ConsistencyRule",
65
+ "TimelinessRelativeRule",
66
+ "TimelinessStaticRule",
67
+ "ValidityNumericalRangeRule",
68
+ "ValidityRegexRule",
69
+ "DataQualityConfig",
70
+ "DataQualityReport",
71
+ ]