gchq-data-quality 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gchq_data_quality-1.0.0/PKG-INFO +278 -0
- gchq_data_quality-1.0.0/README.md +245 -0
- gchq_data_quality-1.0.0/pyproject.toml +87 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/__init__.py +71 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/config.py +531 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/errors.py +3 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/globals.py +38 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/models.py +188 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/results/__init__.py +1 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/results/models.py +446 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/results/utils.py +177 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/rules/__init__.py +69 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/rules/accuracy.py +78 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/rules/base.py +508 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/rules/completeness.py +58 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/rules/consistency.py +172 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/rules/timeliness.py +307 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/rules/uniqueness.py +167 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/rules/utils/__init__.py +1 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/rules/utils/datetime_utils.py +97 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/rules/utils/rules_utils.py +189 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/rules/validity.py +181 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/spark/__init__.py +1 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/spark/dataframe_operations.py +373 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/spark/models.py +47 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/spark/utils/__init__.py +1 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/spark/utils/results_utils.py +61 -0
- gchq_data_quality-1.0.0/src/gchq_data_quality/spark/utils/rules_utils.py +127 -0
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gchq-data-quality
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Tools for data quality measurement in Pandas and Spark
|
|
5
|
+
Keywords: data quality
|
|
6
|
+
Author: GCHQ
|
|
7
|
+
Author-email: GCHQ <oss@gchq.gov.uk>
|
|
8
|
+
License-Expression: Apache-2.0
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
|
+
Classifier: Natural Language :: English
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
16
|
+
Classifier: Intended Audience :: Developers
|
|
17
|
+
Classifier: Topic :: Utilities
|
|
18
|
+
Requires-Dist: numpy>=2.4.1
|
|
19
|
+
Requires-Dist: pandas>=2.2.3,<3
|
|
20
|
+
Requires-Dist: pydantic>=2.9
|
|
21
|
+
Requires-Dist: pyyaml>=6.0.3
|
|
22
|
+
Requires-Dist: elasticsearch>=8.17 ; extra == 'elasticsearch'
|
|
23
|
+
Requires-Dist: pyspark>=3.5 ; extra == 'pyspark'
|
|
24
|
+
Requires-Dist: pyarrow>=23.0.0 ; extra == 'pyspark'
|
|
25
|
+
Requires-Python: >=3.11
|
|
26
|
+
Project-URL: Documentation, https://gchq.github.io/gchq-data-quality/
|
|
27
|
+
Project-URL: Homepage, https://github.com/gchq/gchq-data-quality
|
|
28
|
+
Project-URL: Issues, https://github.com/gchq/gchq-data-quality/issues
|
|
29
|
+
Project-URL: Repository, https://github.com/gchq/gchq-data-quality
|
|
30
|
+
Provides-Extra: elasticsearch
|
|
31
|
+
Provides-Extra: pyspark
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
# GCHQ Data Quality Package
|
|
35
|
+
|
|
36
|
+
## Why build our own Data Quality Package?
|
|
37
|
+
There are a number of existing commercial and opensource data quality (DQ) packages already available. We created our own for five reasons:
|
|
38
|
+
### 1. Opensource under Apache 2.0
|
|
39
|
+
|
|
40
|
+
For permissive use.
|
|
41
|
+
|
|
42
|
+
### 2. Simplicity.
|
|
43
|
+
|
|
44
|
+
We opted for a simple 'plug-in' approach to DQ measurement to speed up adoption by Engineering teams:
|
|
45
|
+
|
|
46
|
+
1. Get your data into a dataframe (Pandas or Spark)
|
|
47
|
+
2. Define some rules (using only 8 functions)
|
|
48
|
+
3. It will output a dataframe of results
|
|
49
|
+
|
|
50
|
+
How you handle the workloads either side is up to you (connecting to data, scheduling, sampling, dashboarding, alerting etc.)
|
|
51
|
+
|
|
52
|
+
We deliberately ignore connections to SQL / MongoDB etc, as all of those can, via an SDK, get data into a dataframe.
|
|
53
|
+
|
|
54
|
+
### 3. Handle Nested Data
|
|
55
|
+
|
|
56
|
+
A lot of our data is nested. No other open-source data quality package handles nested data (that we could find).
|
|
57
|
+
|
|
58
|
+
### 4. Better Data Quality rules for comparing two values.
|
|
59
|
+
|
|
60
|
+
We wanted something that:
|
|
61
|
+
|
|
62
|
+
1. Gave us granular comparisons around time (such as an event happening within a time window relative to another date - important for us)
|
|
63
|
+
2. Used pandas.eval() syntax to provide a huge range of flexible rules with logical operators, summary statistics, string and datetime operations. We maximise the use of this powerful syntax, without complicating our code
|
|
64
|
+
|
|
65
|
+
### 5. Designed for Insights
|
|
66
|
+
|
|
67
|
+
Other packages are designed with an Engineering mindset - it's about the number of rules that pass or fail. This isn't great for diagnosing the root cause of data quality issues. Our results format is a flat table, with enough metadata to make it easy to:
|
|
68
|
+
|
|
69
|
+
1. Build an insightful dashboard
|
|
70
|
+
2. Work out the cause of the DQ problem
|
|
71
|
+
|
|
72
|
+
## Tutorials
|
|
73
|
+
|
|
74
|
+
There are tutorial notebooks to guide you through using the code. See the tutorials/ directory to find notebooks to download.
|
|
75
|
+
|
|
76
|
+
## Orientation
|
|
77
|
+
|
|
78
|
+
### Language
|
|
79
|
+
Consistent verbs / descriptions for both the code and the output are used.
|
|
80
|
+
|
|
81
|
+
A dataset is the data you are measuring, it may have a name and an ID.
|
|
82
|
+
Data Quality is formed from a set of Rules - these are **evaluated** and can be passed or failed by a record in your dataset.
|
|
83
|
+
|
|
84
|
+
Rules are combined into a data quality configuration, which is **executed** on a dataset, and explains when the measurement happened, at which stage of the lifecycle, etc.
|
|
85
|
+
|
|
86
|
+
The measure we use is the pass rate (records passing / records evaluated). This makes it distinct from a score
|
|
87
|
+
which suggests weighting. Users are free to take the pass rate and create weighted scores if they wish, although we do not recommend doing this as it is more confusing to interpret.
|
|
88
|
+
|
|
89
|
+
# Acknowledgements
|
|
90
|
+
We are grateful to DAMA-UK (Data Management Association, UK Chapter) for granting us permission to reference and use their Data Quality Dimensions throughout the tutorials and code.
|
|
91
|
+
Source: DAMA International® (2017) DAMA-DMBOK®: Data Management Body of Knowledge.
|
|
92
|
+
ISBN 9781634622349. Copyright registered June 18, 2018 (TX0008608498).
|
|
93
|
+
Rights and permissions: DAMA International®, 2512 E. Evergreen Blvd, #1023,
|
|
94
|
+
Vancouver, WA 98661, United States. (973) 625-4347.
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# Data Quality Tools — Python Quickstart
|
|
98
|
+
|
|
99
|
+
## Basic Usage
|
|
100
|
+
|
|
101
|
+
### 1. Import Rules
|
|
102
|
+
|
|
103
|
+
Import the rule classes and supporting components:
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
import pandas as pd
|
|
107
|
+
from gchq_data_quality import (
|
|
108
|
+
UniquenessRule,
|
|
109
|
+
CompletenessRule,
|
|
110
|
+
ValidityRegexRule,
|
|
111
|
+
DataQualityConfig,
|
|
112
|
+
)
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### 2. Create Example Data
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
df = pd.DataFrame({
|
|
119
|
+
"id": [1, 2, 2, 3],
|
|
120
|
+
"email": ["a@example.com", "b@sample.com", "invalid@", None],
|
|
121
|
+
})
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### 3. Define and Evaluate Quality Rules
|
|
125
|
+
|
|
126
|
+
Instantiate each rule for the field of interest, and evaluate against your DataFrame:
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
# Uniqueness
|
|
130
|
+
uniqueness_rule = UniquenessRule(field="id")
|
|
131
|
+
uniqueness_result = uniqueness_rule.evaluate(df)
|
|
132
|
+
print("Uniqueness:", uniqueness_result.pass_rate) # e.g. 0.75 if 3/4 unique
|
|
133
|
+
|
|
134
|
+
# Completeness
|
|
135
|
+
completeness_rule = CompletenessRule(field="email")
|
|
136
|
+
completeness_result = completeness_rule.evaluate(df)
|
|
137
|
+
print("Completeness:", completeness_result.pass_rate)
|
|
138
|
+
|
|
139
|
+
# Validity using regular expressions
|
|
140
|
+
validity_rule = ValidityRegexRule(
|
|
141
|
+
field="email", regex_pattern=r"^[^@\s]+@[^@\s]+\.[^@\s]+$"
|
|
142
|
+
)
|
|
143
|
+
validity_result = validity_rule.evaluate(df)
|
|
144
|
+
print("Email validity:", validity_result.pass_rate)
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### 4. Running Multiple Rules via a Config Object
|
|
148
|
+
|
|
149
|
+
You can bundle your rules and dataset details in a `DataQualityConfig`, execute them all at once, and get a tabular report:
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
dq_config = DataQualityConfig(
|
|
153
|
+
dataset_name="My Example Dataset",
|
|
154
|
+
rules=[
|
|
155
|
+
uniqueness_rule,
|
|
156
|
+
completeness_rule,
|
|
157
|
+
validity_rule,
|
|
158
|
+
],
|
|
159
|
+
)
|
|
160
|
+
dq_report = dq_config.execute(df)
|
|
161
|
+
print(dq_report.to_dataframe())
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
## More Rule Types
|
|
165
|
+
|
|
166
|
+
Other measurements are available (AccuracyRule, ValidityNumericalRangeRule, ConsistencyRule, TimelinessStaticRule, etc.) — see notebook examples and code documentation for details.
|
|
167
|
+
|
|
168
|
+
## YAML Configurations
|
|
169
|
+
|
|
170
|
+
For defining rules in YAML and running configs across multiple datasets, see the advanced Python 2 tutorial.
|
|
171
|
+
|
|
172
|
+
### Spark dataframes
|
|
173
|
+
|
|
174
|
+
The same approach is used in Spark dataframes, and with Spark you can also handle nested data.
|
|
175
|
+
```python
|
|
176
|
+
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, IntegerType, StructField, StructType
|
|
177
|
+
|
|
178
|
+
from gchq_data_quality import DataQualityConfig
|
|
179
|
+
from gchq_data_quality.spark.dataframe_operations import flatten_spark
|
|
180
|
+
|
|
181
|
+
spark = SparkSession.builder.getOrCreate()
|
|
182
|
+
|
|
183
|
+
# Create nested example data: 2 parents, one with 1 child, one with 2
|
|
184
|
+
data = [
|
|
185
|
+
{
|
|
186
|
+
"parent": {
|
|
187
|
+
"age": 40,
|
|
188
|
+
"children": [{"age": 10}]
|
|
189
|
+
}
|
|
190
|
+
},
|
|
191
|
+
{
|
|
192
|
+
"parent": {
|
|
193
|
+
"age": 35,
|
|
194
|
+
"children": [{"age": 7}, {"age": 5}]
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
]
|
|
198
|
+
|
|
199
|
+
schema = StructType([
|
|
200
|
+
StructField("parent", StructType([
|
|
201
|
+
StructField("age", IntegerType(), True),
|
|
202
|
+
StructField("children", ArrayType(StructType([
|
|
203
|
+
StructField("age", IntegerType(), True)
|
|
204
|
+
])), True),
|
|
205
|
+
]), True)
|
|
206
|
+
])
|
|
207
|
+
|
|
208
|
+
spark = SparkSession.builder.getOrCreate()
|
|
209
|
+
spark_df = spark.createDataFrame(data, schema=schema)
|
|
210
|
+
|
|
211
|
+
# Optionally flatten nested columns to inspect structure - worth checking it's what you expected. Deeply nested data can get quite confusing.
|
|
212
|
+
df_flat = flatten_spark(
|
|
213
|
+
spark_df, flatten_cols=["parent.age", "parent.children[*].age"]
|
|
214
|
+
)
|
|
215
|
+
df_flat.show()
|
|
216
|
+
|
|
217
|
+
# Load DQ rules from a YAML config (rules.yaml), e.g. we might run a consistency rule to check that '`parent.age` > `parent.children[*].age`'
|
|
218
|
+
# i.e. all parents are older than all of their children
|
|
219
|
+
# See the tutorial for examples
|
|
220
|
+
|
|
221
|
+
dq_config = DataQualityConfig.from_yaml("rules.yaml")
|
|
222
|
+
|
|
223
|
+
# Run all configured rules
|
|
224
|
+
dq_report = dq_config.execute(spark_df)
|
|
225
|
+
print(dq_report.to_dataframe())
|
|
226
|
+
|
|
227
|
+
spark.stop()
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
# Data Quality Overview
|
|
231
|
+
|
|
232
|
+
A brief overview of what we mean when we say 'data quality'
|
|
233
|
+
## What is Data Quality?
|
|
234
|
+
|
|
235
|
+
Data quality refers to how well data meets the expectations and needs of its consumers.
|
|
236
|
+
Data is considered **high quality** when it is **fit-for-purpose** – meaning it supports the intended use effectively and reliably.
|
|
237
|
+
|
|
238
|
+
High-quality data provides:
|
|
239
|
+
- Assurance of compliance (policy or law).
|
|
240
|
+
- Confidence that analysts are receiving accurate and usable information.
|
|
241
|
+
- Early detection of data processing issues, such as:
|
|
242
|
+
- Faulty data pipelines
|
|
243
|
+
- Upstream schema changes
|
|
244
|
+
- Poor data entry
|
|
245
|
+
- Time-based anomalies (e.g., reduced data quality at weekends)
|
|
246
|
+
|
|
247
|
+
That's all quite theoretical. But in a practical sense, we define it as the percentage of records that pass a specific data quality rule. It's a 'unit test' for data.
|
|
248
|
+
|
|
249
|
+
## Key Dimensions of Data Quality
|
|
250
|
+
|
|
251
|
+
We measure data quality by the percentage of records that pass each defined rule.
|
|
252
|
+
Rules are grouped under the **DAMA Framework’s six dimensions**:
|
|
253
|
+
|
|
254
|
+
1. **Uniqueness** – No duplicates (e.g., every ID should be unique).
|
|
255
|
+
2. **Completeness** – Data fields are present and not empty (e.g., no NULL or ‘N/A’ values).
|
|
256
|
+
3. **Accuracy** – Values correctly describe the real-world object or event, usually checked against an authoritative dataset (e.g., 'country' is an actual ISO country code).
|
|
257
|
+
4. **Validity** – Values conform to the required format, type, or range (e.g., valid email address, age within a reasonable range).
|
|
258
|
+
5. **Timeliness** – Data is up-to-date and time values make sense (e.g., birth dates are in the past).
|
|
259
|
+
6. **Consistency** – Logical consistency within or across datasets (e.g., date of birth should be before date of death).
|
|
260
|
+
|
|
261
|
+
## How We Use Data Quality Measures
|
|
262
|
+
|
|
263
|
+
- Each rule produces a score between `0` and `1` based on the proportion of records that pass.
|
|
264
|
+
- Scores can be monitored over time to detect trends and changes.
|
|
265
|
+
- Scores can be aggregated over the DAMA Dimensions, over fields or source data (makes it easy to dashboard)
|
|
266
|
+
- Drops or spikes in scores may signal problems in:
|
|
267
|
+
- Extraction and transformation pipelines
|
|
268
|
+
- Upstream schema changes
|
|
269
|
+
- The relative changes in score across the dimensions can indicate what type of issue it is, e.g. a drop in uniqueness with no other change suggests duplicate data is being received. A drop in completeness, but not in validity, suggests a problem at data entry, with fields not being populated.
|
|
270
|
+
|
|
271
|
+
## Why Data Quality Matters
|
|
272
|
+
|
|
273
|
+
Good data quality supports:
|
|
274
|
+
- More confident decision-making (higher assurance of underlying data)
|
|
275
|
+
- Reduced confusion from duplicate or invalid records
|
|
276
|
+
- More efficient data engineering work (if data is high quality, then joining, transformations, enrichment become easier)
|
|
277
|
+
|
|
278
|
+
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
# GCHQ Data Quality Package
|
|
2
|
+
|
|
3
|
+
## Why build our own Data Quality Package?
|
|
4
|
+
There are a number of existing commercial and opensource data quality (DQ) packages already available. We created our own for five reasons:
|
|
5
|
+
### 1. Opensource under Apache 2.0
|
|
6
|
+
|
|
7
|
+
For permissive use.
|
|
8
|
+
|
|
9
|
+
### 2. Simplicity.
|
|
10
|
+
|
|
11
|
+
We opted for a simple 'plug-in' approach to DQ measurement to speed up adoption by Engineering teams:
|
|
12
|
+
|
|
13
|
+
1. Get your data into a dataframe (Pandas or Spark)
|
|
14
|
+
2. Define some rules (using only 8 functions)
|
|
15
|
+
3. It will output a dataframe of results
|
|
16
|
+
|
|
17
|
+
How you handle the workloads either side is up to you (connecting to data, scheduling, sampling, dashboarding, alerting etc.)
|
|
18
|
+
|
|
19
|
+
We deliberately ignore connections to SQL / MongoDB etc, as all of those can, via an SDK, get data into a dataframe.
|
|
20
|
+
|
|
21
|
+
### 3. Handle Nested Data
|
|
22
|
+
|
|
23
|
+
A lot of our data is nested. No other open-source data quality package handles nested data (that we could find).
|
|
24
|
+
|
|
25
|
+
### 4. Better Data Quality rules for comparing two values.
|
|
26
|
+
|
|
27
|
+
We wanted something that:
|
|
28
|
+
|
|
29
|
+
1. Gave us granular comparisons around time (such as an event happening within a time window relative to another date - important for us)
|
|
30
|
+
2. Used pandas.eval() syntax to provide a huge range of flexible rules with logical operators, summary statistics, string and datetime operations. We maximise the use of this powerful syntax, without complicating our code
|
|
31
|
+
|
|
32
|
+
### 5. Designed for Insights
|
|
33
|
+
|
|
34
|
+
Other packages are designed with an Engineering mindset - it's about the number of rules that pass or fail. This isn't great for diagnosing the root cause of data quality issues. Our results format is a flat table, with enough metadata to make it easy to:
|
|
35
|
+
|
|
36
|
+
1. Build an insightful dashboard
|
|
37
|
+
2. Work out the cause of the DQ problem
|
|
38
|
+
|
|
39
|
+
## Tutorials
|
|
40
|
+
|
|
41
|
+
There are tutorial notebooks to guide you through using the code. See the tutorials/ directory to find notebooks to download.
|
|
42
|
+
|
|
43
|
+
## Orientation
|
|
44
|
+
|
|
45
|
+
### Language
|
|
46
|
+
Consistent verbs / descriptions for both the code and the output are used.
|
|
47
|
+
|
|
48
|
+
A dataset is the data you are measuring, it may have a name and an ID.
|
|
49
|
+
Data Quality is formed from a set of Rules - these are **evaluated** and can be passed or failed by a record in your dataset.
|
|
50
|
+
|
|
51
|
+
Rules are combined into a data quality configuration, which is **executed** on a dataset, and explains when the measurement happened, at which stage of the lifecycle, etc.
|
|
52
|
+
|
|
53
|
+
The measure we use is the pass rate (records passing / records evaluated). This makes it distinct from a score
|
|
54
|
+
which suggests weighting. Users are free to take the pass rate and create weighted scores if they wish, although we do not recommend doing this as it is more confusing to interpret.
|
|
55
|
+
|
|
56
|
+
# Acknowledgements
|
|
57
|
+
We are grateful to DAMA-UK (Data Management Association, UK Chapter) for granting us permission to reference and use their Data Quality Dimensions throughout the tutorials and code.
|
|
58
|
+
Source: DAMA International® (2017) DAMA-DMBOK®: Data Management Body of Knowledge.
|
|
59
|
+
ISBN 9781634622349. Copyright registered June 18, 2018 (TX0008608498).
|
|
60
|
+
Rights and permissions: DAMA International®, 2512 E. Evergreen Blvd, #1023,
|
|
61
|
+
Vancouver, WA 98661, United States. (973) 625-4347.
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# Data Quality Tools — Python Quickstart
|
|
65
|
+
|
|
66
|
+
## Basic Usage
|
|
67
|
+
|
|
68
|
+
### 1. Import Rules
|
|
69
|
+
|
|
70
|
+
Import the rule classes and supporting components:
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
import pandas as pd
|
|
74
|
+
from gchq_data_quality import (
|
|
75
|
+
UniquenessRule,
|
|
76
|
+
CompletenessRule,
|
|
77
|
+
ValidityRegexRule,
|
|
78
|
+
DataQualityConfig,
|
|
79
|
+
)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### 2. Create Example Data
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
df = pd.DataFrame({
|
|
86
|
+
"id": [1, 2, 2, 3],
|
|
87
|
+
"email": ["a@example.com", "b@sample.com", "invalid@", None],
|
|
88
|
+
})
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### 3. Define and Evaluate Quality Rules
|
|
92
|
+
|
|
93
|
+
Instantiate each rule for the field of interest, and evaluate against your DataFrame:
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
# Uniqueness
|
|
97
|
+
uniqueness_rule = UniquenessRule(field="id")
|
|
98
|
+
uniqueness_result = uniqueness_rule.evaluate(df)
|
|
99
|
+
print("Uniqueness:", uniqueness_result.pass_rate) # e.g. 0.75 if 3/4 unique
|
|
100
|
+
|
|
101
|
+
# Completeness
|
|
102
|
+
completeness_rule = CompletenessRule(field="email")
|
|
103
|
+
completeness_result = completeness_rule.evaluate(df)
|
|
104
|
+
print("Completeness:", completeness_result.pass_rate)
|
|
105
|
+
|
|
106
|
+
# Validity using regular expressions
|
|
107
|
+
validity_rule = ValidityRegexRule(
|
|
108
|
+
field="email", regex_pattern=r"^[^@\s]+@[^@\s]+\.[^@\s]+$"
|
|
109
|
+
)
|
|
110
|
+
validity_result = validity_rule.evaluate(df)
|
|
111
|
+
print("Email validity:", validity_result.pass_rate)
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### 4. Running Multiple Rules via a Config Object
|
|
115
|
+
|
|
116
|
+
You can bundle your rules and dataset details in a `DataQualityConfig`, execute them all at once, and get a tabular report:
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
dq_config = DataQualityConfig(
|
|
120
|
+
dataset_name="My Example Dataset",
|
|
121
|
+
rules=[
|
|
122
|
+
uniqueness_rule,
|
|
123
|
+
completeness_rule,
|
|
124
|
+
validity_rule,
|
|
125
|
+
],
|
|
126
|
+
)
|
|
127
|
+
dq_report = dq_config.execute(df)
|
|
128
|
+
print(dq_report.to_dataframe())
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## More Rule Types
|
|
132
|
+
|
|
133
|
+
Other measurements are available (AccuracyRule, ValidityNumericalRangeRule, ConsistencyRule, TimelinessStaticRule, etc.) — see notebook examples and code documentation for details.
|
|
134
|
+
|
|
135
|
+
## YAML Configurations
|
|
136
|
+
|
|
137
|
+
For defining rules in YAML and running configs across multiple datasets, see the advanced Python 2 tutorial.
|
|
138
|
+
|
|
139
|
+
### Spark dataframes
|
|
140
|
+
|
|
141
|
+
The same approach is used in Spark dataframes, and with Spark you can also handle nested data.
|
|
142
|
+
```python
|
|
143
|
+
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, IntegerType, StructField, StructType
|
|
144
|
+
|
|
145
|
+
from gchq_data_quality import DataQualityConfig
|
|
146
|
+
from gchq_data_quality.spark.dataframe_operations import flatten_spark
|
|
147
|
+
|
|
148
|
+
spark = SparkSession.builder.getOrCreate()
|
|
149
|
+
|
|
150
|
+
# Create nested example data: 2 parents, one with 1 child, one with 2
|
|
151
|
+
data = [
|
|
152
|
+
{
|
|
153
|
+
"parent": {
|
|
154
|
+
"age": 40,
|
|
155
|
+
"children": [{"age": 10}]
|
|
156
|
+
}
|
|
157
|
+
},
|
|
158
|
+
{
|
|
159
|
+
"parent": {
|
|
160
|
+
"age": 35,
|
|
161
|
+
"children": [{"age": 7}, {"age": 5}]
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
]
|
|
165
|
+
|
|
166
|
+
schema = StructType([
|
|
167
|
+
StructField("parent", StructType([
|
|
168
|
+
StructField("age", IntegerType(), True),
|
|
169
|
+
StructField("children", ArrayType(StructType([
|
|
170
|
+
StructField("age", IntegerType(), True)
|
|
171
|
+
])), True),
|
|
172
|
+
]), True)
|
|
173
|
+
])
|
|
174
|
+
|
|
175
|
+
spark = SparkSession.builder.getOrCreate()
|
|
176
|
+
spark_df = spark.createDataFrame(data, schema=schema)
|
|
177
|
+
|
|
178
|
+
# Optionally flatten nested columns to inspect structure - worth checking it's what you expected. Deeply nested data can get quite confusing.
|
|
179
|
+
df_flat = flatten_spark(
|
|
180
|
+
spark_df, flatten_cols=["parent.age", "parent.children[*].age"]
|
|
181
|
+
)
|
|
182
|
+
df_flat.show()
|
|
183
|
+
|
|
184
|
+
# Load DQ rules from a YAML config (rules.yaml), e.g. we might run a consistency rule to check that '`parent.age` > `parent.children[*].age`'
|
|
185
|
+
# i.e. all parents are older than all of their children
|
|
186
|
+
# See the tutorial for examples
|
|
187
|
+
|
|
188
|
+
dq_config = DataQualityConfig.from_yaml("rules.yaml")
|
|
189
|
+
|
|
190
|
+
# Run all configured rules
|
|
191
|
+
dq_report = dq_config.execute(spark_df)
|
|
192
|
+
print(dq_report.to_dataframe())
|
|
193
|
+
|
|
194
|
+
spark.stop()
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
# Data Quality Overview
|
|
198
|
+
|
|
199
|
+
A brief overview of what we mean when we say 'data quality'
|
|
200
|
+
## What is Data Quality?
|
|
201
|
+
|
|
202
|
+
Data quality refers to how well data meets the expectations and needs of its consumers.
|
|
203
|
+
Data is considered **high quality** when it is **fit-for-purpose** – meaning it supports the intended use effectively and reliably.
|
|
204
|
+
|
|
205
|
+
High-quality data provides:
|
|
206
|
+
- Assurance of compliance (policy or law).
|
|
207
|
+
- Confidence that analysts are receiving accurate and usable information.
|
|
208
|
+
- Early detection of data processing issues, such as:
|
|
209
|
+
- Faulty data pipelines
|
|
210
|
+
- Upstream schema changes
|
|
211
|
+
- Poor data entry
|
|
212
|
+
- Time-based anomalies (e.g., reduced data quality at weekends)
|
|
213
|
+
|
|
214
|
+
That's all quite theoretical. But in a practical sense, we define it as the percentage of records that pass a specific data quality rule. It's a 'unit test' for data.
|
|
215
|
+
|
|
216
|
+
## Key Dimensions of Data Quality
|
|
217
|
+
|
|
218
|
+
We measure data quality by the percentage of records that pass each defined rule.
|
|
219
|
+
Rules are grouped under the **DAMA Framework’s six dimensions**:
|
|
220
|
+
|
|
221
|
+
1. **Uniqueness** – No duplicates (e.g., every ID should be unique).
|
|
222
|
+
2. **Completeness** – Data fields are present and not empty (e.g., no NULL or ‘N/A’ values).
|
|
223
|
+
3. **Accuracy** – Values correctly describe the real-world object or event, usually checked against an authoritative dataset (e.g., 'country' is an actual ISO country code).
|
|
224
|
+
4. **Validity** – Values conform to the required format, type, or range (e.g., valid email address, age within a reasonable range).
|
|
225
|
+
5. **Timeliness** – Data is up-to-date and time values make sense (e.g., birth dates are in the past).
|
|
226
|
+
6. **Consistency** – Logical consistency within or across datasets (e.g., date of birth should be before date of death).
|
|
227
|
+
|
|
228
|
+
## How We Use Data Quality Measures
|
|
229
|
+
|
|
230
|
+
- Each rule produces a score between `0` and `1` based on the proportion of records that pass.
|
|
231
|
+
- Scores can be monitored over time to detect trends and changes.
|
|
232
|
+
- Scores can be aggregated over the DAMA Dimensions, over fields or source data (makes it easy to dashboard)
|
|
233
|
+
- Drops or spikes in scores may signal problems in:
|
|
234
|
+
- Extraction and transformation pipelines
|
|
235
|
+
- Upstream schema changes
|
|
236
|
+
- The relative changes in score across the dimensions can indicate what type of issue it is, e.g. a drop in uniqueness with no other change suggests duplicate data is being received. A drop in completeness, but not in validity, suggests a problem at data entry, with fields not being populated.
|
|
237
|
+
|
|
238
|
+
## Why Data Quality Matters
|
|
239
|
+
|
|
240
|
+
Good data quality supports:
|
|
241
|
+
- More confident decision-making (higher assurance of underlying data)
|
|
242
|
+
- Reduced confusion from duplicate or invalid records
|
|
243
|
+
- More efficient data engineering work (if data is high quality, then joining, transformations, enrichment become easier)
|
|
244
|
+
|
|
245
|
+
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["uv_build>=0.7.11,<0.8.0"]
|
|
3
|
+
build-backend = "uv_build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "gchq-data-quality"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Tools for data quality measurement in Pandas and Spark"
|
|
9
|
+
requires-python = ">=3.11"
|
|
10
|
+
keywords = ["data quality"]
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Development Status :: 4 - Beta",
|
|
13
|
+
"License :: OSI Approved :: Apache Software License",
|
|
14
|
+
"Natural Language :: English",
|
|
15
|
+
"Programming Language :: Python :: 3.11",
|
|
16
|
+
"Programming Language :: Python :: 3.12",
|
|
17
|
+
"Programming Language :: Python :: 3.13",
|
|
18
|
+
"Programming Language :: Python :: 3.14",
|
|
19
|
+
"Intended Audience :: Developers",
|
|
20
|
+
"Topic :: Utilities",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
dependencies = [
|
|
24
|
+
"numpy>=2.4.1",
|
|
25
|
+
"pandas>=2.2.3,<3",
|
|
26
|
+
"pydantic>=2.9",
|
|
27
|
+
"pyyaml>=6.0.3",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
authors = [{ name = "GCHQ", email = "oss@gchq.gov.uk" }]
|
|
32
|
+
|
|
33
|
+
readme = "README.md"
|
|
34
|
+
license = "Apache-2.0"
|
|
35
|
+
|
|
36
|
+
[project.urls]
|
|
37
|
+
Homepage = "https://github.com/gchq/gchq-data-quality"
|
|
38
|
+
Issues = "https://github.com/gchq/gchq-data-quality/issues"
|
|
39
|
+
Documentation = "https://gchq.github.io/gchq-data-quality/"
|
|
40
|
+
Repository = "https://github.com/gchq/gchq-data-quality"
|
|
41
|
+
|
|
42
|
+
[project.optional-dependencies]
|
|
43
|
+
pyspark = ["pyspark>=3.5", "pyarrow>=23.0.0"]
|
|
44
|
+
|
|
45
|
+
elasticsearch = ["elasticsearch>=8.17"]
|
|
46
|
+
|
|
47
|
+
[dependency-groups]
|
|
48
|
+
dev = [
|
|
49
|
+
"ruff>=0.9, <1",
|
|
50
|
+
"pre-commit>=3.0",
|
|
51
|
+
"nbconvert>=7.17.0,<8",
|
|
52
|
+
"ipykernel>=7.1.0",
|
|
53
|
+
{ include-group = "docs" },
|
|
54
|
+
{ include-group = "test" },
|
|
55
|
+
]
|
|
56
|
+
|
|
57
|
+
test = ["pytest>=7.0", "coverage>=7.9.2,<8"]
|
|
58
|
+
docs = ["mkdocs", "mkdocs-material", "mkdocstrings[python]"]
|
|
59
|
+
|
|
60
|
+
[tool.pytest.ini_options]
|
|
61
|
+
testpaths = ["tests"]
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
[tool.ruff]
|
|
65
|
+
line-length = 88
|
|
66
|
+
|
|
67
|
+
[tool.ruff.lint]
|
|
68
|
+
select = ["F", "E", "W", "I", "N", "UP", "S", "B", "COM", "ANN"]
|
|
69
|
+
ignore = ["E501", "COM812", "N813", "N806"]
|
|
70
|
+
fixable = ["ALL"]
|
|
71
|
+
|
|
72
|
+
[tool.ruff.lint.isort]
|
|
73
|
+
combine-as-imports = true
|
|
74
|
+
|
|
75
|
+
[tool.ruff.lint.per-file-ignores]
|
|
76
|
+
"tests/*" = ["S101"]
|
|
77
|
+
|
|
78
|
+
[tool.ruff.format]
|
|
79
|
+
quote-style = "double"
|
|
80
|
+
indent-style = "space"
|
|
81
|
+
skip-magic-trailing-comma = false
|
|
82
|
+
line-ending = "auto"
|
|
83
|
+
docstring-code-format = false
|
|
84
|
+
docstring-code-line-length = "dynamic"
|
|
85
|
+
|
|
86
|
+
[tool.ruff.lint.flake8-bugbear]
|
|
87
|
+
extend-immutable-calls = ["aws_cdk.Duration.minutes"]
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# (c) Crown Copyright GCHQ
|
|
2
|
+
"""
|
|
3
|
+
Data quality rule definitions for the gchq_data_quality framework.
|
|
4
|
+
|
|
5
|
+
This module provides data quality rules for the 6 DAMA Dimensions of Data Quality:
|
|
6
|
+
- Uniqueness
|
|
7
|
+
- Accuracy
|
|
8
|
+
- Completeness
|
|
9
|
+
- Validity
|
|
10
|
+
- Consistency
|
|
11
|
+
- Timeliness
|
|
12
|
+
|
|
13
|
+
They inherit from a core BaseRule class. All data quality evaluation is built on a consistent method:
|
|
14
|
+
1. Determine the records that are evaluated (as a boolean mask) - records_evaluated_mask
|
|
15
|
+
The total records evaluated here is then the sum of the mask.
|
|
16
|
+
2. Determine the records that pass the rule (as a boolean mask) - records_passing_mask
|
|
17
|
+
The count of records_passing is the sum of records_passing_mask AND records_evaluated_mask
|
|
18
|
+
(for various reasons you can have records passing a rule that are not in the evaluation set, e.g. they are NULL)
|
|
19
|
+
|
|
20
|
+
The pass_rate is then records_passing / records_evaluated
|
|
21
|
+
|
|
22
|
+
You can see the mechanisms in each rule primarily by looking at the masks that are created. The metrics derived from these
|
|
23
|
+
masks are typically the same for every rule type and are specified in BaseRule.
|
|
24
|
+
|
|
25
|
+
Available rule classes:
|
|
26
|
+
- UniquenessRule
|
|
27
|
+
- AccuracyRule
|
|
28
|
+
- CompletenessRule
|
|
29
|
+
- ConsistencyRule
|
|
30
|
+
- TimelinessRelativeRule
|
|
31
|
+
- TimelinessStaticRule
|
|
32
|
+
- ValidityNumericalRangeRule
|
|
33
|
+
- ValidityRegexRule
|
|
34
|
+
|
|
35
|
+
Whilst the user can call these rules and evaluate them against a dataframe
|
|
36
|
+
|
|
37
|
+
UniquenessRule(field="id").evaluate(df)
|
|
38
|
+
|
|
39
|
+
The intention of the package is that multiple rules are wrapped up into a DataQualityConfig class
|
|
40
|
+
and executed together against a dataframe.
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
from gchq_data_quality.rules.uniqueness import UniquenessRule # noqa
|
|
44
|
+
from gchq_data_quality.rules.accuracy import AccuracyRule # noqa
|
|
45
|
+
from gchq_data_quality.rules.completeness import CompletenessRule # noqa
|
|
46
|
+
from gchq_data_quality.rules.consistency import ConsistencyRule # noqa
|
|
47
|
+
from gchq_data_quality.rules.timeliness import (
|
|
48
|
+
TimelinessRelativeRule,
|
|
49
|
+
TimelinessStaticRule,
|
|
50
|
+
) # noqa
|
|
51
|
+
from gchq_data_quality.rules.validity import (
|
|
52
|
+
ValidityNumericalRangeRule,
|
|
53
|
+
ValidityRegexRule,
|
|
54
|
+
) # noqa
|
|
55
|
+
|
|
56
|
+
from gchq_data_quality.config import DataQualityConfig
|
|
57
|
+
|
|
58
|
+
from gchq_data_quality.results.models import DataQualityReport
|
|
59
|
+
|
|
60
|
+
__all__ = [
|
|
61
|
+
"UniquenessRule",
|
|
62
|
+
"AccuracyRule",
|
|
63
|
+
"CompletenessRule",
|
|
64
|
+
"ConsistencyRule",
|
|
65
|
+
"TimelinessRelativeRule",
|
|
66
|
+
"TimelinessStaticRule",
|
|
67
|
+
"ValidityNumericalRangeRule",
|
|
68
|
+
"ValidityRegexRule",
|
|
69
|
+
"DataQualityConfig",
|
|
70
|
+
"DataQualityReport",
|
|
71
|
+
]
|