dqguard 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dqguard-0.1.0/LICENSE +21 -0
- dqguard-0.1.0/PKG-INFO +225 -0
- dqguard-0.1.0/README.md +188 -0
- dqguard-0.1.0/dataguard/__init__.py +33 -0
- dqguard-0.1.0/dataguard/checks.py +131 -0
- dqguard-0.1.0/dataguard/core.py +98 -0
- dqguard-0.1.0/dataguard/exceptions.py +16 -0
- dqguard-0.1.0/dataguard/pandas_engine.py +111 -0
- dqguard-0.1.0/dataguard/report.py +99 -0
- dqguard-0.1.0/dataguard/rules.py +86 -0
- dqguard-0.1.0/dataguard/spark_engine.py +167 -0
- dqguard-0.1.0/dqguard.egg-info/PKG-INFO +225 -0
- dqguard-0.1.0/dqguard.egg-info/SOURCES.txt +17 -0
- dqguard-0.1.0/dqguard.egg-info/dependency_links.txt +1 -0
- dqguard-0.1.0/dqguard.egg-info/requires.txt +12 -0
- dqguard-0.1.0/dqguard.egg-info/top_level.txt +1 -0
- dqguard-0.1.0/pyproject.toml +68 -0
- dqguard-0.1.0/setup.cfg +4 -0
- dqguard-0.1.0/tests/test_core.py +247 -0
dqguard-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Zhang Zhen
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
dqguard-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dqguard
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Lightweight data quality validation framework for big data pipelines
|
|
5
|
+
Author-email: Zhang Zhen <zhangzhen9798@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/zhangzhen9798/dataguard
|
|
8
|
+
Project-URL: Repository, https://github.com/zhangzhen9798/dataguard
|
|
9
|
+
Project-URL: Issues, https://github.com/zhangzhen9798/dataguard/issues
|
|
10
|
+
Keywords: data-quality,validation,spark,pandas,big-data,data-pipeline
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Database
|
|
21
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
22
|
+
Classifier: Topic :: Software Development :: Testing
|
|
23
|
+
Requires-Python: >=3.8
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: pandas>=1.3.0
|
|
27
|
+
Requires-Dist: numpy>=1.20.0
|
|
28
|
+
Provides-Extra: spark
|
|
29
|
+
Requires-Dist: pyspark>=3.0.0; extra == "spark"
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
32
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
33
|
+
Requires-Dist: black>=23.0; extra == "dev"
|
|
34
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
35
|
+
Requires-Dist: mypy>=1.0; extra == "dev"
|
|
36
|
+
Dynamic: license-file
|
|
37
|
+
|
|
38
|
+
<div align="center">
|
|
39
|
+
|
|
40
|
+
# 🛡️ DataGuard
|
|
41
|
+
|
|
42
|
+
**Lightweight Data Quality Validation Framework for Big Data Pipelines**
|
|
43
|
+
|
|
44
|
+
[](https://github.com/zhangzhen9798/dataguard)
|
|
45
|
+
[](https://www.python.org/)
|
|
46
|
+
[](https://opensource.org/licenses/MIT)
|
|
47
|
+
[](CONTRIBUTING.md)
|
|
48
|
+
|
|
49
|
+
[English](#why-dataguard) | [中文文档](#中文介绍)
|
|
50
|
+
|
|
51
|
+
</div>
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Why DataGuard?
|
|
56
|
+
|
|
57
|
+
Data quality issues cost organizations millions annually. Existing solutions like Great Expectations are powerful but heavy. **DataGuard** provides a **lightweight, intuitive** alternative that works seamlessly with both **Pandas** and **PySpark** — perfect for big data pipelines.
|
|
58
|
+
|
|
59
|
+
- ✅ **Dual Engine**: First-class support for both Pandas & PySpark
|
|
60
|
+
- ✅ **Declarative Rules**: Define validation rules cleanly, no boilerplate
|
|
61
|
+
- ✅ **Threshold-based**: Set pass-rate thresholds per rule (not just pass/fail)
|
|
62
|
+
- ✅ **Data Profiling**: Auto-generate column-level statistics
|
|
63
|
+
- ✅ **Rich Reports**: Human-readable summaries + JSON export for CI/CD
|
|
64
|
+
- ✅ **Zero Config**: Works out of the box, no setup files needed
|
|
65
|
+
|
|
66
|
+
## Quick Start
|
|
67
|
+
|
|
68
|
+
### Installation
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
# Basic (Pandas engine)
|
|
72
|
+
pip install dqguard
|
|
73
|
+
|
|
74
|
+
# With PySpark support
|
|
75
|
+
pip install dqguard[spark]
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Basic Usage
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
import pandas as pd
|
|
82
|
+
from dataguard import DataGuard, RuleSet, not_null, in_range, in_set, regex_match
|
|
83
|
+
|
|
84
|
+
# Create a DataFrame
|
|
85
|
+
df = pd.DataFrame({
|
|
86
|
+
"name": ["Alice", "Bob", "Charlie", None, "Eve"],
|
|
87
|
+
"age": [25, 30, -1, 40, 150],
|
|
88
|
+
"email": ["alice@example.com", "invalid", "charlie@example.com", "dana@example.com", "eve@example.com"],
|
|
89
|
+
"status": ["active", "active", "inactive", "active", "unknown"],
|
|
90
|
+
})
|
|
91
|
+
|
|
92
|
+
# Define validation rules
|
|
93
|
+
rules = RuleSet()
|
|
94
|
+
rules.add("name", not_null())
|
|
95
|
+
rules.add("age", not_null())
|
|
96
|
+
rules.add("age", in_range(0, 120))
|
|
97
|
+
rules.add("email", regex_match(r"^[\w.-]+@[\w.-]+\.\w+$"))
|
|
98
|
+
rules.add("status", in_set(["active", "inactive"]))
|
|
99
|
+
|
|
100
|
+
# Run validation
|
|
101
|
+
guardian = DataGuard(df)
|
|
102
|
+
report = guardian.validate(rules)
|
|
103
|
+
|
|
104
|
+
# Print summary
|
|
105
|
+
print(report.summary())
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
Output:
|
|
109
|
+
```
|
|
110
|
+
DataGuard Validation Report
|
|
111
|
+
Engine: pandas
|
|
112
|
+
Total Rules: 5 | Passed: 1 | Failed: 4
|
|
113
|
+
Overall Status: INVALID
|
|
114
|
+
------------------------------------------------------------
|
|
115
|
+
[FAIL] name.not_null | pass_rate=80.00% (threshold=100%) | 4/5 rows passed
|
|
116
|
+
[PASS] age.not_null | pass_rate=100.00% (threshold=100%) | 5/5 rows passed
|
|
117
|
+
[FAIL] age.in_range(0, 120) | pass_rate=60.00% (threshold=100%) | 3/5 rows passed
|
|
118
|
+
[FAIL] email.regex_match(...) | pass_rate=80.00% (threshold=100%) | 4/5 rows passed
|
|
119
|
+
[FAIL] status.in_set(...) | pass_rate=80.00% (threshold=100%) | 4/5 rows passed
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### With PySpark
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from pyspark.sql import SparkSession
|
|
126
|
+
from dataguard import DataGuard, RuleSet, not_null, in_range, unique
|
|
127
|
+
|
|
128
|
+
spark = SparkSession.builder.appName("DataGuard").getOrCreate()
|
|
129
|
+
df = spark.read.parquet("s3://my-bucket/data/")
|
|
130
|
+
|
|
131
|
+
rules = RuleSet()
|
|
132
|
+
rules.add("user_id", not_null())
|
|
133
|
+
rules.add("user_id", unique())
|
|
134
|
+
rules.add("age", in_range(0, 120))
|
|
135
|
+
|
|
136
|
+
report = DataGuard(df).validate(rules)
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### Threshold-based Validation
|
|
140
|
+
|
|
141
|
+
Not every dataset needs 100% compliance. Set thresholds per rule:
|
|
142
|
+
|
|
143
|
+
```python
|
|
144
|
+
rules = RuleSet()
|
|
145
|
+
# Allow up to 5% null values in optional fields
|
|
146
|
+
rules.add("middle_name", not_null(), threshold=0.95)
|
|
147
|
+
# Require 99.9% uniqueness for IDs
|
|
148
|
+
rules.add("transaction_id", unique(), threshold=0.999)
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
### Data Profiling
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
guardian = DataGuard(df)
|
|
155
|
+
profile = guardian.profile()
|
|
156
|
+
|
|
157
|
+
for col, stats in profile.items():
|
|
158
|
+
print(f"{col}: {stats['distinct_count']} distinct, {stats['null_rate']:.2%} nulls")
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### JSON Export (for CI/CD integration)
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
report = guardian.validate(rules)
|
|
165
|
+
print(report.to_json())
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
## Built-in Checks
|
|
169
|
+
|
|
170
|
+
| Check | Description |
|
|
171
|
+
|-------|-------------|
|
|
172
|
+
| `not_null()` | Value must not be None/NaN |
|
|
173
|
+
| `unique()` | Column values must be unique |
|
|
174
|
+
| `in_range(min, max)` | Numeric value within range (inclusive) |
|
|
175
|
+
| `regex_match(pattern)` | String matches regex pattern |
|
|
176
|
+
| `in_set(values)` | Value in allowed set |
|
|
177
|
+
| `min_length(n)` | String has at least n characters |
|
|
178
|
+
| `max_length(n)` | String has at most n characters |
|
|
179
|
+
| `custom(fn, name)` | Custom validation function |
|
|
180
|
+
|
|
181
|
+
## Architecture
|
|
182
|
+
|
|
183
|
+
```
|
|
184
|
+
dataguard/
|
|
185
|
+
├── __init__.py # Public API
|
|
186
|
+
├── core.py # DataGuard main class
|
|
187
|
+
├── rules.py # Rule & RuleSet definitions
|
|
188
|
+
├── checks.py # Built-in check functions
|
|
189
|
+
├── report.py # ValidationReport & ValidationResult
|
|
190
|
+
├── exceptions.py # Custom exceptions
|
|
191
|
+
├── pandas_engine.py # Pandas validation backend
|
|
192
|
+
└── spark_engine.py # PySpark validation backend
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
## Roadmap
|
|
196
|
+
|
|
197
|
+
- [ ] Great Expectations interop layer
|
|
198
|
+
- [ ] dbt integration
|
|
199
|
+
- [ ] SQL-based validation engine
|
|
200
|
+
- [ ] Streaming data validation (Spark Structured Streaming)
|
|
201
|
+
- [ ] CLI tool for one-off validation jobs
|
|
202
|
+
- [ ] Visualization dashboard
|
|
203
|
+
|
|
204
|
+
## Contributing
|
|
205
|
+
|
|
206
|
+
Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
|
207
|
+
|
|
208
|
+
## License
|
|
209
|
+
|
|
210
|
+
This project is licensed under the MIT License — see the [LICENSE](LICENSE) file for details.
|
|
211
|
+
|
|
212
|
+
---
|
|
213
|
+
|
|
214
|
+
## 中文介绍
|
|
215
|
+
|
|
216
|
+
DataGuard 是一个轻量级的大数据管道数据质量验证框架,核心特性:
|
|
217
|
+
|
|
218
|
+
- **双引擎支持**:原生支持 Pandas 和 PySpark,无需切换工具
|
|
219
|
+
- **声明式规则**:用简洁的语法定义验证规则,告别样板代码
|
|
220
|
+
- **阈值验证**:支持按规则设置通过率阈值,而非简单的二元判断
|
|
221
|
+
- **数据画像**:一键生成列级统计信息
|
|
222
|
+
- **丰富报告**:支持人类可读摘要 + JSON 导出,方便 CI/CD 集成
|
|
223
|
+
- **零配置**:开箱即用,无需配置文件
|
|
224
|
+
|
|
225
|
+
适用于数据工程师在 ETL/ELT 管道中进行数据质量检查,也适用于数据科学家在分析前验证数据完整性。
|
dqguard-0.1.0/README.md
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
# 🛡️ DataGuard
|
|
4
|
+
|
|
5
|
+
**Lightweight Data Quality Validation Framework for Big Data Pipelines**
|
|
6
|
+
|
|
7
|
+
[](https://github.com/zhangzhen9798/dataguard)
|
|
8
|
+
[](https://www.python.org/)
|
|
9
|
+
[](https://opensource.org/licenses/MIT)
|
|
10
|
+
[](CONTRIBUTING.md)
|
|
11
|
+
|
|
12
|
+
[English](#why-dataguard) | [中文文档](#中文介绍)
|
|
13
|
+
|
|
14
|
+
</div>
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## Why DataGuard?
|
|
19
|
+
|
|
20
|
+
Data quality issues cost organizations millions annually. Existing solutions like Great Expectations are powerful but heavy. **DataGuard** provides a **lightweight, intuitive** alternative that works seamlessly with both **Pandas** and **PySpark** — perfect for big data pipelines.
|
|
21
|
+
|
|
22
|
+
- ✅ **Dual Engine**: First-class support for both Pandas & PySpark
|
|
23
|
+
- ✅ **Declarative Rules**: Define validation rules cleanly, no boilerplate
|
|
24
|
+
- ✅ **Threshold-based**: Set pass-rate thresholds per rule (not just pass/fail)
|
|
25
|
+
- ✅ **Data Profiling**: Auto-generate column-level statistics
|
|
26
|
+
- ✅ **Rich Reports**: Human-readable summaries + JSON export for CI/CD
|
|
27
|
+
- ✅ **Zero Config**: Works out of the box, no setup files needed
|
|
28
|
+
|
|
29
|
+
## Quick Start
|
|
30
|
+
|
|
31
|
+
### Installation
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
# Basic (Pandas engine)
|
|
35
|
+
pip install dqguard
|
|
36
|
+
|
|
37
|
+
# With PySpark support
|
|
38
|
+
pip install dqguard[spark]
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### Basic Usage
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
import pandas as pd
|
|
45
|
+
from dataguard import DataGuard, RuleSet, not_null, in_range, in_set, regex_match
|
|
46
|
+
|
|
47
|
+
# Create a DataFrame
|
|
48
|
+
df = pd.DataFrame({
|
|
49
|
+
"name": ["Alice", "Bob", "Charlie", None, "Eve"],
|
|
50
|
+
"age": [25, 30, -1, 40, 150],
|
|
51
|
+
"email": ["alice@example.com", "invalid", "charlie@example.com", "dana@example.com", "eve@example.com"],
|
|
52
|
+
"status": ["active", "active", "inactive", "active", "unknown"],
|
|
53
|
+
})
|
|
54
|
+
|
|
55
|
+
# Define validation rules
|
|
56
|
+
rules = RuleSet()
|
|
57
|
+
rules.add("name", not_null())
|
|
58
|
+
rules.add("age", not_null())
|
|
59
|
+
rules.add("age", in_range(0, 120))
|
|
60
|
+
rules.add("email", regex_match(r"^[\w.-]+@[\w.-]+\.\w+$"))
|
|
61
|
+
rules.add("status", in_set(["active", "inactive"]))
|
|
62
|
+
|
|
63
|
+
# Run validation
|
|
64
|
+
guardian = DataGuard(df)
|
|
65
|
+
report = guardian.validate(rules)
|
|
66
|
+
|
|
67
|
+
# Print summary
|
|
68
|
+
print(report.summary())
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Output:
|
|
72
|
+
```
|
|
73
|
+
DataGuard Validation Report
|
|
74
|
+
Engine: pandas
|
|
75
|
+
Total Rules: 5 | Passed: 1 | Failed: 4
|
|
76
|
+
Overall Status: INVALID
|
|
77
|
+
------------------------------------------------------------
|
|
78
|
+
[FAIL] name.not_null | pass_rate=80.00% (threshold=100%) | 4/5 rows passed
|
|
79
|
+
[PASS] age.not_null | pass_rate=100.00% (threshold=100%) | 5/5 rows passed
|
|
80
|
+
[FAIL] age.in_range(0, 120) | pass_rate=60.00% (threshold=100%) | 3/5 rows passed
|
|
81
|
+
[FAIL] email.regex_match(...) | pass_rate=80.00% (threshold=100%) | 4/5 rows passed
|
|
82
|
+
[FAIL] status.in_set(...) | pass_rate=80.00% (threshold=100%) | 4/5 rows passed
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### With PySpark
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
from pyspark.sql import SparkSession
|
|
89
|
+
from dataguard import DataGuard, RuleSet, not_null, in_range, unique
|
|
90
|
+
|
|
91
|
+
spark = SparkSession.builder.appName("DataGuard").getOrCreate()
|
|
92
|
+
df = spark.read.parquet("s3://my-bucket/data/")
|
|
93
|
+
|
|
94
|
+
rules = RuleSet()
|
|
95
|
+
rules.add("user_id", not_null())
|
|
96
|
+
rules.add("user_id", unique())
|
|
97
|
+
rules.add("age", in_range(0, 120))
|
|
98
|
+
|
|
99
|
+
report = DataGuard(df).validate(rules)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Threshold-based Validation
|
|
103
|
+
|
|
104
|
+
Not every dataset needs 100% compliance. Set thresholds per rule:
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
rules = RuleSet()
|
|
108
|
+
# Allow up to 5% null values in optional fields
|
|
109
|
+
rules.add("middle_name", not_null(), threshold=0.95)
|
|
110
|
+
# Require 99.9% uniqueness for IDs
|
|
111
|
+
rules.add("transaction_id", unique(), threshold=0.999)
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### Data Profiling
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
guardian = DataGuard(df)
|
|
118
|
+
profile = guardian.profile()
|
|
119
|
+
|
|
120
|
+
for col, stats in profile.items():
|
|
121
|
+
print(f"{col}: {stats['distinct_count']} distinct, {stats['null_rate']:.2%} nulls")
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### JSON Export (for CI/CD integration)
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
report = guardian.validate(rules)
|
|
128
|
+
print(report.to_json())
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Built-in Checks
|
|
132
|
+
|
|
133
|
+
| Check | Description |
|
|
134
|
+
|-------|-------------|
|
|
135
|
+
| `not_null()` | Value must not be None/NaN |
|
|
136
|
+
| `unique()` | Column values must be unique |
|
|
137
|
+
| `in_range(min, max)` | Numeric value within range (inclusive) |
|
|
138
|
+
| `regex_match(pattern)` | String matches regex pattern |
|
|
139
|
+
| `in_set(values)` | Value in allowed set |
|
|
140
|
+
| `min_length(n)` | String has at least n characters |
|
|
141
|
+
| `max_length(n)` | String has at most n characters |
|
|
142
|
+
| `custom(fn, name)` | Custom validation function |
|
|
143
|
+
|
|
144
|
+
## Architecture
|
|
145
|
+
|
|
146
|
+
```
|
|
147
|
+
dataguard/
|
|
148
|
+
├── __init__.py # Public API
|
|
149
|
+
├── core.py # DataGuard main class
|
|
150
|
+
├── rules.py # Rule & RuleSet definitions
|
|
151
|
+
├── checks.py # Built-in check functions
|
|
152
|
+
├── report.py # ValidationReport & ValidationResult
|
|
153
|
+
├── exceptions.py # Custom exceptions
|
|
154
|
+
├── pandas_engine.py # Pandas validation backend
|
|
155
|
+
└── spark_engine.py # PySpark validation backend
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## Roadmap
|
|
159
|
+
|
|
160
|
+
- [ ] Great Expectations interop layer
|
|
161
|
+
- [ ] dbt integration
|
|
162
|
+
- [ ] SQL-based validation engine
|
|
163
|
+
- [ ] Streaming data validation (Spark Structured Streaming)
|
|
164
|
+
- [ ] CLI tool for one-off validation jobs
|
|
165
|
+
- [ ] Visualization dashboard
|
|
166
|
+
|
|
167
|
+
## Contributing
|
|
168
|
+
|
|
169
|
+
Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
|
170
|
+
|
|
171
|
+
## License
|
|
172
|
+
|
|
173
|
+
This project is licensed under the MIT License — see the [LICENSE](LICENSE) file for details.
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## 中文介绍
|
|
178
|
+
|
|
179
|
+
DataGuard 是一个轻量级的大数据管道数据质量验证框架,核心特性:
|
|
180
|
+
|
|
181
|
+
- **双引擎支持**:原生支持 Pandas 和 PySpark,无需切换工具
|
|
182
|
+
- **声明式规则**:用简洁的语法定义验证规则,告别样板代码
|
|
183
|
+
- **阈值验证**:支持按规则设置通过率阈值,而非简单的二元判断
|
|
184
|
+
- **数据画像**:一键生成列级统计信息
|
|
185
|
+
- **丰富报告**:支持人类可读摘要 + JSON 导出,方便 CI/CD 集成
|
|
186
|
+
- **零配置**:开箱即用,无需配置文件
|
|
187
|
+
|
|
188
|
+
适用于数据工程师在 ETL/ELT 管道中进行数据质量检查,也适用于数据科学家在分析前验证数据完整性。
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DataGuard - Lightweight Data Quality Validation Framework for Big Data Pipelines
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
__version__ = "0.1.0"
|
|
6
|
+
__author__ = "Zhang Zhen"
|
|
7
|
+
|
|
8
|
+
from dataguard.core import DataGuard
|
|
9
|
+
from dataguard.rules import Rule, RuleSet
|
|
10
|
+
from dataguard.checks import (
|
|
11
|
+
not_null,
|
|
12
|
+
unique,
|
|
13
|
+
in_range,
|
|
14
|
+
regex_match,
|
|
15
|
+
in_set,
|
|
16
|
+
min_length,
|
|
17
|
+
max_length,
|
|
18
|
+
custom,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
"DataGuard",
|
|
23
|
+
"Rule",
|
|
24
|
+
"RuleSet",
|
|
25
|
+
"not_null",
|
|
26
|
+
"unique",
|
|
27
|
+
"in_range",
|
|
28
|
+
"regex_match",
|
|
29
|
+
"in_set",
|
|
30
|
+
"min_length",
|
|
31
|
+
"max_length",
|
|
32
|
+
"custom",
|
|
33
|
+
]
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Built-in validation check functions for DataGuard.
|
|
3
|
+
|
|
4
|
+
Each check function takes a value and returns True if valid, False otherwise.
|
|
5
|
+
These are designed to be composable and reusable.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
from typing import Any, Callable, Container, Optional, Pattern
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def not_null():
    """Build a check that rejects missing values.

    Returns:
        A callable ``check(value) -> bool`` whose ``__name__`` is
        ``"not_null"``. It returns ``False`` when ``value`` is ``None`` or a
        float NaN, and ``True`` for every other value (including ``0`` and
        the empty string).
    """
    # Imported once per factory call instead of once per checked value.
    import math

    def _check(value: Any) -> bool:
        if value is None:
            return False
        # The isinstance guard guarantees math.isnan only ever sees a float,
        # so it cannot raise and needs no exception handler around it.
        return not (isinstance(value, float) and math.isnan(value))

    _check.__name__ = "not_null"
    return _check
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def unique():
    """Build the marker check used to request uniqueness validation.

    The returned callable accepts every value it is given; it only carries
    the name ``"unique"``. According to the original author's note, the
    actual duplicate detection is a column-level operation performed by the
    validation engine rather than by this per-value function.

    Returns:
        A callable ``check(value) -> bool`` that always returns ``True`` and
        whose ``__name__`` is ``"unique"``.
    """
    def _defer_to_engine(value: Any) -> bool:
        # Per-value view cannot see duplicates; always accept here.
        return True

    _defer_to_engine.__name__ = "unique"
    return _defer_to_engine
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def in_range(min_val: Optional[float] = None, max_val: Optional[float] = None):
    """Build a check that a numeric value falls within [min_val, max_val].

    Args:
        min_val: Minimum allowed value (inclusive). None = no lower bound.
        max_val: Maximum allowed value (inclusive). None = no upper bound.

    Returns:
        A callable ``check(value) -> bool`` named
        ``"in_range(<min_val>, <max_val>)"``. It returns ``True`` for
        ``None``, ``False`` for values that cannot be converted with
        ``float()``, and otherwise whether the number lies inside the bounds.
        A float NaN passes, because both bound comparisons are false for NaN.
    """
    def _check(value: Any) -> bool:
        if value is None:
            return True  # Use not_null() for null checks
        try:
            num = float(value)
        except OverflowError:
            # float() overflows for ints beyond the double range (e.g. 10**400).
            # Python compares int and float exactly, so compare the int itself
            # instead of letting the error abort the whole validation run.
            if not isinstance(value, int):
                return False
            num = value
        except (TypeError, ValueError):
            return False
        if min_val is not None and num < min_val:
            return False
        if max_val is not None and num > max_val:
            return False
        return True

    _check.__name__ = f"in_range({min_val}, {max_val})"
    return _check
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def regex_match(pattern: str):
    """Build a check that a value's string form matches a regex.

    The pattern is compiled once when the check is created. Matching uses
    ``re`` "match" semantics, i.e. it is anchored at the start of the string
    only; add ``$`` to the pattern to require a full match.

    Args:
        pattern: Regular expression pattern string.

    Returns:
        A callable ``check(value) -> bool`` named ``"regex_match('<pattern>')"``.
        ``None`` is accepted; any other value is converted with ``str()``
        before matching.
    """
    match_at_start = re.compile(pattern).match

    def _check(value: Any) -> bool:
        if value is not None:
            return match_at_start(str(value)) is not None
        return True

    _check.__name__ = f"regex_match('{pattern}')"
    return _check
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def in_set(allowed_values: Container):
    """Build a check that a value is one of the allowed values.

    Args:
        allowed_values: A set, list, or tuple of allowed values. It is
            iterated once when the check is created.

    Returns:
        A callable ``check(value) -> bool`` named ``"in_set(<allowed_values>)"``.
        ``None`` is accepted; any other value must be a member of
        ``allowed_values``.
    """
    candidates = list(allowed_values)
    try:
        # Hash-based lookup for the common case of hashable allowed values.
        allowed = set(candidates)
    except TypeError:
        # Unhashable allowed values (e.g. lists): fall back to a linear scan.
        allowed = candidates

    def _check(value: Any) -> bool:
        if value is None:
            return True
        try:
            return value in allowed
        except TypeError:
            # An unhashable value (e.g. a list cell) cannot be looked up in a
            # set; compare by equality instead of aborting the validation.
            return value in candidates

    _check.__name__ = f"in_set({allowed_values})"
    return _check
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def min_length(min_len: int):
    """Build a check that a value's string form has at least min_len characters.

    Args:
        min_len: Smallest accepted length (inclusive).

    Returns:
        A callable ``check(value) -> bool`` named ``"min_length(<min_len>)"``.
        ``None`` is accepted; any other value is measured as ``len(str(value))``.
    """
    def _check(value: Any) -> bool:
        return True if value is None else len(str(value)) >= min_len

    _check.__name__ = f"min_length({min_len})"
    return _check
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def max_length(max_len: int):
    """Build a check that a value's string form has at most max_len characters.

    Args:
        max_len: Largest accepted length (inclusive).

    Returns:
        A callable ``check(value) -> bool`` named ``"max_length(<max_len>)"``.
        ``None`` is accepted; any other value is measured as ``len(str(value))``.
    """
    def _check(value: Any) -> bool:
        if value is not None:
            text = str(value)
            return len(text) <= max_len
        return True

    _check.__name__ = f"max_length({max_len})"
    return _check
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def custom(func: Callable[[Any], bool], name: str = ""):
    """Wrap a user-supplied validation function as a named check.

    Args:
        func: A callable that takes a value and returns bool.
        name: Optional name for the check. When empty, the wrapped
            function's ``__name__`` is used, or ``"custom"`` if it has none.

    Returns:
        A callable ``check(value)`` that forwards to ``func`` and returns its
        result unchanged.
    """
    label = name if name else getattr(func, "__name__", "custom")

    def _check(value: Any) -> bool:
        return func(value)

    _check.__name__ = label
    return _check
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core validation engine for DataGuard.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Any, Dict, List, Optional, Union
|
|
6
|
+
from dataguard.rules import RuleSet
|
|
7
|
+
from dataguard.report import ValidationReport, ValidationResult
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DataGuard:
    """
    Facade that runs data quality rules against a DataFrame.

    The DataFrame is handed to either the pandas backend or the Spark
    backend, chosen at construction time (explicitly or by auto-detection).

    Example:
        >>> import pandas as pd
        >>> from dataguard import DataGuard, RuleSet, not_null, in_range
        >>> df = pd.DataFrame({"age": [25, 30, -1, None], "name": ["Alice", "Bob", "Charlie", "Dana"]})
        >>> rules = RuleSet()
        >>> rules.add("age", not_null())
        >>> rules.add("age", in_range(0, 120))
        >>> guardian = DataGuard(df)
        >>> report = guardian.validate(rules)
        >>> print(report.summary())
    """

    def __init__(self, dataframe: Any, engine: Optional[str] = None):
        """
        Store the DataFrame and resolve which engine will process it.

        Args:
            dataframe: A Pandas DataFrame or PySpark DataFrame.
            engine: Force engine type ('pandas' or 'spark'). Auto-detected if None.

        Raises:
            ValueError: If ``engine`` is given but is not 'pandas' or 'spark'.
        """
        self._dataframe = dataframe
        self._engine = self._detect_engine(dataframe, engine)

    @staticmethod
    def _detect_engine(dataframe: Any, engine: Optional[str]) -> str:
        """
        Resolve the engine name for ``dataframe``.

        An explicit ``engine`` wins after validation. Otherwise the module
        name of the DataFrame's type is inspected: a module containing
        "pandas" selects the pandas engine, one containing "spark" selects
        the Spark engine, and anything else falls back to pandas.
        """
        if engine is None:
            module_name = type(dataframe).__module__
            if "pandas" in module_name:
                return "pandas"
            # "spark" is a substring of "pyspark", so one test covers both.
            if "spark" in module_name:
                return "spark"
            # Default to pandas
            return "pandas"

        if engine in ("pandas", "spark"):
            return engine
        raise ValueError(f"Unsupported engine: {engine}. Use 'pandas' or 'spark'.")

    @property
    def engine(self) -> str:
        """Name of the engine selected for this instance ('pandas' or 'spark')."""
        return self._engine

    def validate(self, rules: RuleSet, raise_on_error: bool = False) -> ValidationReport:
        """
        Evaluate every rule in ``rules`` against the stored DataFrame.

        Args:
            rules: A RuleSet containing validation rules.
            raise_on_error: If True, raise ValidationError when the report's
                failed_count is greater than zero.

        Returns:
            ValidationReport built from the engine's results.

        Raises:
            ValidationError: Only when ``raise_on_error`` is True and at
                least one rule failed; the report is passed to the exception.
        """
        # Backends are imported lazily so only the selected engine's module
        # is loaded.
        if self._engine != "spark":
            from dataguard.pandas_engine import validate_pandas as run_rules
        else:
            from dataguard.spark_engine import validate_spark as run_rules

        outcome = run_rules(self._dataframe, rules)
        report = ValidationReport(results=outcome, engine=self._engine)

        if raise_on_error and report.failed_count > 0:
            from dataguard.exceptions import ValidationError
            raise ValidationError(report)
        return report

    def profile(self) -> Dict[str, Dict[str, Any]]:
        """
        Generate a data profile for the stored DataFrame.

        Delegates to the profiling function of the selected engine and
        returns its result unchanged.
        """
        if self._engine != "spark":
            from dataguard.pandas_engine import profile_pandas
            return profile_pandas(self._dataframe)

        from dataguard.spark_engine import profile_spark
        return profile_spark(self._dataframe)
|