dqguard 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dqguard-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Zhang Zhen
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
dqguard-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,225 @@
1
+ Metadata-Version: 2.4
2
+ Name: dqguard
3
+ Version: 0.1.0
4
+ Summary: Lightweight data quality validation framework for big data pipelines
5
+ Author-email: Zhang Zhen <zhangzhen9798@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/zhangzhen9798/dataguard
8
+ Project-URL: Repository, https://github.com/zhangzhen9798/dataguard
9
+ Project-URL: Issues, https://github.com/zhangzhen9798/dataguard/issues
10
+ Keywords: data-quality,validation,spark,pandas,big-data,data-pipeline
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Database
21
+ Classifier: Topic :: Software Development :: Quality Assurance
22
+ Classifier: Topic :: Software Development :: Testing
23
+ Requires-Python: >=3.8
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: pandas>=1.3.0
27
+ Requires-Dist: numpy>=1.20.0
28
+ Provides-Extra: spark
29
+ Requires-Dist: pyspark>=3.0.0; extra == "spark"
30
+ Provides-Extra: dev
31
+ Requires-Dist: pytest>=7.0; extra == "dev"
32
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
33
+ Requires-Dist: black>=23.0; extra == "dev"
34
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
35
+ Requires-Dist: mypy>=1.0; extra == "dev"
36
+ Dynamic: license-file
37
+
38
+ <div align="center">
39
+
40
+ # 🛡️ DataGuard
41
+
42
+ **Lightweight Data Quality Validation Framework for Big Data Pipelines**
43
+
44
+ [![PyPI version](https://img.shields.io/badge/version-0.1.0-blue.svg)](https://github.com/zhangzhen9798/dataguard)
45
+ [![Python](https://img.shields.io/badge/python-3.8%2B-green.svg)](https://www.python.org/)
46
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
47
+ [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](CONTRIBUTING.md)
48
+
49
+ [English](#why-dataguard) | [中文文档](#中文介绍)
50
+
51
+ </div>
52
+
53
+ ---
54
+
55
+ ## Why DataGuard?
56
+
57
+ Data quality issues cost organizations millions annually. Existing solutions like Great Expectations are powerful but heavy. **DataGuard** provides a **lightweight, intuitive** alternative that works seamlessly with both **Pandas** and **PySpark** — perfect for big data pipelines.
58
+
59
+ - ✅ **Dual Engine**: First-class support for both Pandas & PySpark
60
+ - ✅ **Declarative Rules**: Define validation rules cleanly, no boilerplate
61
+ - ✅ **Threshold-based**: Set pass-rate thresholds per rule (not just pass/fail)
62
+ - ✅ **Data Profiling**: Auto-generate column-level statistics
63
+ - ✅ **Rich Reports**: Human-readable summaries + JSON export for CI/CD
64
+ - ✅ **Zero Config**: Works out of the box, no setup files needed
65
+
66
+ ## Quick Start
67
+
68
+ ### Installation
69
+
70
+ ```bash
71
+ # Basic (Pandas engine)
72
+ pip install dqguard
73
+
74
+ # With PySpark support
75
+ pip install "dqguard[spark]"
76
+ ```
77
+
78
+ ### Basic Usage
79
+
80
+ ```python
81
+ import pandas as pd
82
+ from dataguard import DataGuard, RuleSet, not_null, in_range, in_set, regex_match
83
+
84
+ # Create a DataFrame
85
+ df = pd.DataFrame({
86
+ "name": ["Alice", "Bob", "Charlie", None, "Eve"],
87
+ "age": [25, 30, -1, 40, 150],
88
+ "email": ["alice@example.com", "invalid", "charlie@example.com", "dana@example.com", "eve@example.com"],
89
+ "status": ["active", "active", "inactive", "active", "unknown"],
90
+ })
91
+
92
+ # Define validation rules
93
+ rules = RuleSet()
94
+ rules.add("name", not_null())
95
+ rules.add("age", not_null())
96
+ rules.add("age", in_range(0, 120))
97
+ rules.add("email", regex_match(r"^[\w.-]+@[\w.-]+\.\w+$"))
98
+ rules.add("status", in_set(["active", "inactive"]))
99
+
100
+ # Run validation
101
+ guardian = DataGuard(df)
102
+ report = guardian.validate(rules)
103
+
104
+ # Print summary
105
+ print(report.summary())
106
+ ```
107
+
108
+ Output:
109
+ ```
110
+ DataGuard Validation Report
111
+ Engine: pandas
112
+ Total Rules: 5 | Passed: 1 | Failed: 4
113
+ Overall Status: INVALID
114
+ ------------------------------------------------------------
115
+ [FAIL] name.not_null | pass_rate=80.00% (threshold=100%) | 4/5 rows passed
116
+ [PASS] age.not_null | pass_rate=100.00% (threshold=100%) | 5/5 rows passed
117
+ [FAIL] age.in_range(0, 120) | pass_rate=60.00% (threshold=100%) | 3/5 rows passed
118
+ [FAIL] email.regex_match(...) | pass_rate=80.00% (threshold=100%) | 4/5 rows passed
119
+ [FAIL] status.in_set(...) | pass_rate=80.00% (threshold=100%) | 4/5 rows passed
120
+ ```
121
+
122
+ ### With PySpark
123
+
124
+ ```python
125
+ from pyspark.sql import SparkSession
126
+ from dataguard import DataGuard, RuleSet, not_null, unique, in_range
127
+
128
+ spark = SparkSession.builder.appName("DataGuard").getOrCreate()
129
+ df = spark.read.parquet("s3://my-bucket/data/")
130
+
131
+ rules = RuleSet()
132
+ rules.add("user_id", not_null())
133
+ rules.add("user_id", unique())
134
+ rules.add("age", in_range(0, 120))
135
+
136
+ report = DataGuard(df).validate(rules)
137
+ ```
138
+
139
+ ### Threshold-based Validation
140
+
141
+ Not every dataset needs 100% compliance. Set thresholds per rule:
142
+
143
+ ```python
144
+ rules = RuleSet()
145
+ # Allow up to 5% null values in optional fields
146
+ rules.add("middle_name", not_null(), threshold=0.95)
147
+ # Require 99.9% uniqueness for IDs
148
+ rules.add("transaction_id", unique(), threshold=0.999)
149
+ ```
150
+
151
+ ### Data Profiling
152
+
153
+ ```python
154
+ guardian = DataGuard(df)
155
+ profile = guardian.profile()
156
+
157
+ for col, stats in profile.items():
158
+ print(f"{col}: {stats['distinct_count']} distinct, {stats['null_rate']:.2%} nulls")
159
+ ```
160
+
161
+ ### JSON Export (for CI/CD integration)
162
+
163
+ ```python
164
+ report = guardian.validate(rules)
165
+ print(report.to_json())
166
+ ```
167
+
168
+ ## Built-in Checks
169
+
170
+ | Check | Description |
171
+ |-------|-------------|
172
+ | `not_null()` | Value must not be None/NaN |
173
+ | `unique()` | Column values must be unique |
174
+ | `in_range(min, max)` | Numeric value within range (inclusive) |
175
+ | `regex_match(pattern)` | String matches regex pattern |
176
+ | `in_set(values)` | Value in allowed set |
177
+ | `min_length(n)` | String has at least n characters |
178
+ | `max_length(n)` | String has at most n characters |
179
+ | `custom(fn, name)` | Custom validation function |
180
+
181
+ ## Architecture
182
+
183
+ ```
184
+ dataguard/
185
+ ├── __init__.py # Public API
186
+ ├── core.py # DataGuard main class
187
+ ├── rules.py # Rule & RuleSet definitions
188
+ ├── checks.py # Built-in check functions
189
+ ├── report.py # ValidationReport & ValidationResult
190
+ ├── exceptions.py # Custom exceptions
191
+ ├── pandas_engine.py # Pandas validation backend
192
+ └── spark_engine.py # PySpark validation backend
193
+ ```
194
+
195
+ ## Roadmap
196
+
197
+ - [ ] Great Expectations interop layer
198
+ - [ ] dbt integration
199
+ - [ ] SQL-based validation engine
200
+ - [ ] Streaming data validation (Spark Structured Streaming)
201
+ - [ ] CLI tool for one-off validation jobs
202
+ - [ ] Visualization dashboard
203
+
204
+ ## Contributing
205
+
206
+ Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
207
+
208
+ ## License
209
+
210
+ This project is licensed under the MIT License — see the [LICENSE](LICENSE) file for details.
211
+
212
+ ---
213
+
214
+ ## 中文介绍
215
+
216
+ DataGuard 是一个轻量级的大数据管道数据质量验证框架,核心特性:
217
+
218
+ - **双引擎支持**:原生支持 Pandas 和 PySpark,无需切换工具
219
+ - **声明式规则**:用简洁的语法定义验证规则,告别样板代码
220
+ - **阈值验证**:支持按规则设置通过率阈值,而非简单的二元判断
221
+ - **数据画像**:一键生成列级统计信息
222
+ - **丰富报告**:支持人类可读摘要 + JSON 导出,方便 CI/CD 集成
223
+ - **零配置**:开箱即用,无需配置文件
224
+
225
+ 适用于数据工程师在 ETL/ELT 管道中进行数据质量检查,也适用于数据科学家在分析前验证数据完整性。
@@ -0,0 +1,188 @@
1
+ <div align="center">
2
+
3
+ # 🛡️ DataGuard
4
+
5
+ **Lightweight Data Quality Validation Framework for Big Data Pipelines**
6
+
7
+ [![PyPI version](https://img.shields.io/badge/version-0.1.0-blue.svg)](https://github.com/zhangzhen9798/dataguard)
8
+ [![Python](https://img.shields.io/badge/python-3.8%2B-green.svg)](https://www.python.org/)
9
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
10
+ [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](CONTRIBUTING.md)
11
+
12
+ [English](#why-dataguard) | [中文文档](#中文介绍)
13
+
14
+ </div>
15
+
16
+ ---
17
+
18
+ ## Why DataGuard?
19
+
20
+ Data quality issues cost organizations millions annually. Existing solutions like Great Expectations are powerful but heavy. **DataGuard** provides a **lightweight, intuitive** alternative that works seamlessly with both **Pandas** and **PySpark** — perfect for big data pipelines.
21
+
22
+ - ✅ **Dual Engine**: First-class support for both Pandas & PySpark
23
+ - ✅ **Declarative Rules**: Define validation rules cleanly, no boilerplate
24
+ - ✅ **Threshold-based**: Set pass-rate thresholds per rule (not just pass/fail)
25
+ - ✅ **Data Profiling**: Auto-generate column-level statistics
26
+ - ✅ **Rich Reports**: Human-readable summaries + JSON export for CI/CD
27
+ - ✅ **Zero Config**: Works out of the box, no setup files needed
28
+
29
+ ## Quick Start
30
+
31
+ ### Installation
32
+
33
+ ```bash
34
+ # Basic (Pandas engine)
35
+ pip install dqguard
36
+
37
+ # With PySpark support
38
+ pip install "dqguard[spark]"
39
+ ```
40
+
41
+ ### Basic Usage
42
+
43
+ ```python
44
+ import pandas as pd
45
+ from dataguard import DataGuard, RuleSet, not_null, in_range, in_set, regex_match
46
+
47
+ # Create a DataFrame
48
+ df = pd.DataFrame({
49
+ "name": ["Alice", "Bob", "Charlie", None, "Eve"],
50
+ "age": [25, 30, -1, 40, 150],
51
+ "email": ["alice@example.com", "invalid", "charlie@example.com", "dana@example.com", "eve@example.com"],
52
+ "status": ["active", "active", "inactive", "active", "unknown"],
53
+ })
54
+
55
+ # Define validation rules
56
+ rules = RuleSet()
57
+ rules.add("name", not_null())
58
+ rules.add("age", not_null())
59
+ rules.add("age", in_range(0, 120))
60
+ rules.add("email", regex_match(r"^[\w.-]+@[\w.-]+\.\w+$"))
61
+ rules.add("status", in_set(["active", "inactive"]))
62
+
63
+ # Run validation
64
+ guardian = DataGuard(df)
65
+ report = guardian.validate(rules)
66
+
67
+ # Print summary
68
+ print(report.summary())
69
+ ```
70
+
71
+ Output:
72
+ ```
73
+ DataGuard Validation Report
74
+ Engine: pandas
75
+ Total Rules: 5 | Passed: 1 | Failed: 4
76
+ Overall Status: INVALID
77
+ ------------------------------------------------------------
78
+ [FAIL] name.not_null | pass_rate=80.00% (threshold=100%) | 4/5 rows passed
79
+ [PASS] age.not_null | pass_rate=100.00% (threshold=100%) | 5/5 rows passed
80
+ [FAIL] age.in_range(0, 120) | pass_rate=60.00% (threshold=100%) | 3/5 rows passed
81
+ [FAIL] email.regex_match(...) | pass_rate=80.00% (threshold=100%) | 4/5 rows passed
82
+ [FAIL] status.in_set(...) | pass_rate=80.00% (threshold=100%) | 4/5 rows passed
83
+ ```
84
+
85
+ ### With PySpark
86
+
87
+ ```python
88
+ from pyspark.sql import SparkSession
89
+ from dataguard import DataGuard, RuleSet, not_null, unique, in_range
90
+
91
+ spark = SparkSession.builder.appName("DataGuard").getOrCreate()
92
+ df = spark.read.parquet("s3://my-bucket/data/")
93
+
94
+ rules = RuleSet()
95
+ rules.add("user_id", not_null())
96
+ rules.add("user_id", unique())
97
+ rules.add("age", in_range(0, 120))
98
+
99
+ report = DataGuard(df).validate(rules)
100
+ ```
101
+
102
+ ### Threshold-based Validation
103
+
104
+ Not every dataset needs 100% compliance. Set thresholds per rule:
105
+
106
+ ```python
107
+ rules = RuleSet()
108
+ # Allow up to 5% null values in optional fields
109
+ rules.add("middle_name", not_null(), threshold=0.95)
110
+ # Require 99.9% uniqueness for IDs
111
+ rules.add("transaction_id", unique(), threshold=0.999)
112
+ ```
113
+
114
+ ### Data Profiling
115
+
116
+ ```python
117
+ guardian = DataGuard(df)
118
+ profile = guardian.profile()
119
+
120
+ for col, stats in profile.items():
121
+ print(f"{col}: {stats['distinct_count']} distinct, {stats['null_rate']:.2%} nulls")
122
+ ```
123
+
124
+ ### JSON Export (for CI/CD integration)
125
+
126
+ ```python
127
+ report = guardian.validate(rules)
128
+ print(report.to_json())
129
+ ```
130
+
131
+ ## Built-in Checks
132
+
133
+ | Check | Description |
134
+ |-------|-------------|
135
+ | `not_null()` | Value must not be None/NaN |
136
+ | `unique()` | Column values must be unique |
137
+ | `in_range(min, max)` | Numeric value within range (inclusive) |
138
+ | `regex_match(pattern)` | String matches regex pattern |
139
+ | `in_set(values)` | Value in allowed set |
140
+ | `min_length(n)` | String has at least n characters |
141
+ | `max_length(n)` | String has at most n characters |
142
+ | `custom(fn, name)` | Custom validation function |
143
+
144
+ ## Architecture
145
+
146
+ ```
147
+ dataguard/
148
+ ├── __init__.py # Public API
149
+ ├── core.py # DataGuard main class
150
+ ├── rules.py # Rule & RuleSet definitions
151
+ ├── checks.py # Built-in check functions
152
+ ├── report.py # ValidationReport & ValidationResult
153
+ ├── exceptions.py # Custom exceptions
154
+ ├── pandas_engine.py # Pandas validation backend
155
+ └── spark_engine.py # PySpark validation backend
156
+ ```
157
+
158
+ ## Roadmap
159
+
160
+ - [ ] Great Expectations interop layer
161
+ - [ ] dbt integration
162
+ - [ ] SQL-based validation engine
163
+ - [ ] Streaming data validation (Spark Structured Streaming)
164
+ - [ ] CLI tool for one-off validation jobs
165
+ - [ ] Visualization dashboard
166
+
167
+ ## Contributing
168
+
169
+ Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
170
+
171
+ ## License
172
+
173
+ This project is licensed under the MIT License — see the [LICENSE](LICENSE) file for details.
174
+
175
+ ---
176
+
177
+ ## 中文介绍
178
+
179
+ DataGuard 是一个轻量级的大数据管道数据质量验证框架,核心特性:
180
+
181
+ - **双引擎支持**:原生支持 Pandas 和 PySpark,无需切换工具
182
+ - **声明式规则**:用简洁的语法定义验证规则,告别样板代码
183
+ - **阈值验证**:支持按规则设置通过率阈值,而非简单的二元判断
184
+ - **数据画像**:一键生成列级统计信息
185
+ - **丰富报告**:支持人类可读摘要 + JSON 导出,方便 CI/CD 集成
186
+ - **零配置**:开箱即用,无需配置文件
187
+
188
+ 适用于数据工程师在 ETL/ELT 管道中进行数据质量检查,也适用于数据科学家在分析前验证数据完整性。
@@ -0,0 +1,33 @@
1
+ """
2
+ DataGuard - Lightweight Data Quality Validation Framework for Big Data Pipelines
3
+ """
4
+
5
+ __version__ = "0.1.0"
6
+ __author__ = "Zhang Zhen"
7
+
8
+ from dataguard.core import DataGuard
9
+ from dataguard.rules import Rule, RuleSet
10
+ from dataguard.checks import (
11
+ not_null,
12
+ unique,
13
+ in_range,
14
+ regex_match,
15
+ in_set,
16
+ min_length,
17
+ max_length,
18
+ custom,
19
+ )
20
+
21
+ __all__ = [
22
+ "DataGuard",
23
+ "Rule",
24
+ "RuleSet",
25
+ "not_null",
26
+ "unique",
27
+ "in_range",
28
+ "regex_match",
29
+ "in_set",
30
+ "min_length",
31
+ "max_length",
32
+ "custom",
33
+ ]
@@ -0,0 +1,131 @@
1
+ """
2
+ Built-in validation check functions for DataGuard.
3
+
4
+ Each check function takes a value and returns True if valid, False otherwise.
5
+ These are designed to be composable and reusable.
6
+ """
7
+
8
+ import re
9
+ from typing import Any, Callable, Container, Optional, Pattern
10
+
11
+
12
def not_null():
    """Build a check that rejects missing values.

    Returns:
        A one-argument callable named ``not_null``. It returns ``False``
        for ``None`` and for NaN-like scalars (values that compare unequal
        to themselves), and ``True`` for everything else.
    """
    def _check(value: Any) -> bool:
        if value is None:
            return False
        try:
            # NaN is the value that is not equal to itself. Testing that
            # directly also catches NaNs that are not ``float`` instances
            # (e.g. Decimal("NaN")), which an isinstance(value, float)
            # test lets through as "not null".
            return not bool(value != value)
        except (TypeError, ValueError, ArithmeticError):
            # The self-comparison is unsupported or has no single truth
            # value (e.g. a list-of-arrays cell). A value is present, so
            # keep treating it as non-null rather than crashing.
            return True
    _check.__name__ = "not_null"
    return _check
27
+
28
+
29
def unique():
    """Create the marker check used to request uniqueness validation.

    Uniqueness is a property of a whole column, so it cannot be decided one
    value at a time. The callable returned here accepts every value; per the
    package notes, the actual duplicate detection is left to the engine.

    Returns:
        A one-argument callable named ``unique`` that always returns ``True``.
    """
    def _check(value: Any) -> bool:
        # Per-value evaluation never fails for this marker.
        return True

    setattr(_check, "__name__", "unique")
    return _check
40
+
41
+
42
def in_range(min_val: Optional[float] = None, max_val: Optional[float] = None):
    """Build a check that a numeric value falls within [min_val, max_val].

    Args:
        min_val: Minimum allowed value (inclusive). None = no lower bound.
        max_val: Maximum allowed value (inclusive). None = no upper bound.

    Returns:
        A one-argument callable named ``in_range(<min_val>, <max_val>)``.
        It returns ``True`` for ``None`` (null checks belong to
        ``not_null()``), ``False`` for values that cannot be interpreted
        as a number, and otherwise whether the value lies within the bounds.
        NaN also passes, because every ordering comparison with NaN is false.
    """
    def _check(value: Any) -> bool:
        if value is None:
            return True  # Use not_null() for null checks
        try:
            num = float(value)
        except (TypeError, ValueError):
            return False
        except OverflowError:
            # The value is too large for a float (e.g. 10 ** 400). Python
            # compares ints with floats exactly, so compare the raw value
            # instead of letting the check crash.
            num = value
        if min_val is not None and num < min_val:
            return False
        if max_val is not None and num > max_val:
            return False
        return True
    _check.__name__ = f"in_range({min_val}, {max_val})"
    return _check
64
+
65
+
66
def regex_match(pattern: str):
    """Build a check that a value's string form matches a regex pattern.

    The pattern is compiled once and applied with ``re.match``, i.e. it is
    anchored at the start of the string only; add ``$`` to require a full
    match.

    Args:
        pattern: Regular expression pattern string.

    Returns:
        A one-argument callable named ``regex_match('<pattern>')``. Missing
        values (``None`` or float NaN) pass, because null checks belong to
        ``not_null()``; any other value is converted with ``str()`` first.

    Raises:
        re.error: If ``pattern`` is not a valid regular expression.
    """
    compiled = re.compile(pattern)

    def _check(value: Any) -> bool:
        if value is None:
            return True
        # A float NaN is a missing value too. Without this guard it would be
        # stringified to "nan" and matched against the pattern.
        if isinstance(value, float) and value != value:
            return True
        return bool(compiled.match(str(value)))
    _check.__name__ = f"regex_match('{pattern}')"
    return _check
81
+
82
+
83
def in_set(allowed_values: Container):
    """Build a check that a value is one of the allowed values.

    Args:
        allowed_values: A set, list, or tuple of allowed values. Any other
            container supporting ``in`` is accepted as well.

    Returns:
        A one-argument callable named ``in_set(<allowed_values>)``. Missing
        values (``None`` or float NaN) pass, because null checks belong to
        ``not_null()``.
    """
    try:
        # A hash set gives constant-time membership tests.
        allowed = set(allowed_values)
    except TypeError:
        # Unhashable members (e.g. lists) or a container that is not
        # iterable: fall back to the container's own membership test.
        allowed = allowed_values

    def _check(value: Any) -> bool:
        if value is None:
            return True
        # A float NaN is a missing value too; do not report it as "not allowed".
        if isinstance(value, float) and value != value:
            return True
        try:
            return value in allowed
        except TypeError:
            # An unhashable value (e.g. a list cell) cannot be a member of
            # a hash set, so it is simply not allowed.
            return False
    _check.__name__ = f"in_set({allowed_values})"
    return _check
98
+
99
+
100
def min_length(min_len: int):
    """Build a check that a value's string form has at least ``min_len`` characters.

    Args:
        min_len: Minimum allowed length (inclusive).

    Returns:
        A one-argument callable named ``min_length(<min_len>)``. Missing
        values (``None`` or float NaN) pass, because null checks belong to
        ``not_null()``; any other value is measured as ``len(str(value))``.
    """
    def _check(value: Any) -> bool:
        if value is None:
            return True
        # A float NaN is a missing value; without this guard it would be
        # measured as the 3-character string "nan".
        if isinstance(value, float) and value != value:
            return True
        return len(str(value)) >= min_len
    _check.__name__ = f"min_length({min_len})"
    return _check
108
+
109
+
110
def max_length(max_len: int):
    """Build a check that a value's string form has at most ``max_len`` characters.

    Args:
        max_len: Maximum allowed length (inclusive).

    Returns:
        A one-argument callable named ``max_length(<max_len>)``. Missing
        values (``None`` or float NaN) pass, because null checks belong to
        ``not_null()``; any other value is measured as ``len(str(value))``.
    """
    def _check(value: Any) -> bool:
        if value is None:
            return True
        # A float NaN is a missing value; without this guard it would be
        # measured as the 3-character string "nan".
        if isinstance(value, float) and value != value:
            return True
        return len(str(value)) <= max_len
    _check.__name__ = f"max_length({max_len})"
    return _check
118
+
119
+
120
def custom(func: Callable[[Any], bool], name: str = ""):
    """Adapt a user-supplied predicate into a named check.

    Args:
        func: Callable invoked with a single value; its result is returned
            unchanged by the check.
        name: Label for the check. When empty, ``func.__name__`` is used,
            and ``"custom"`` if the callable has no ``__name__``.

    Returns:
        A one-argument callable that delegates to ``func``.
    """
    # Resolve the label first: explicit name, then the callable's own name.
    label = name if name else getattr(func, "__name__", "custom")

    def _check(value: Any) -> bool:
        return func(value)

    _check.__name__ = label
    return _check
@@ -0,0 +1,98 @@
1
+ """
2
+ Core validation engine for DataGuard.
3
+ """
4
+
5
+ from typing import Any, Dict, List, Optional, Union
6
+ from dataguard.rules import RuleSet
7
+ from dataguard.report import ValidationReport, ValidationResult
8
+
9
+
10
class DataGuard:
    """
    Facade that runs data quality rules against a DataFrame.

    The wrapped DataFrame is handed to either the pandas backend or the
    Spark backend, depending on the selected engine.

    Example:
        >>> import pandas as pd
        >>> from dataguard import DataGuard, RuleSet, not_null, in_range
        >>> df = pd.DataFrame({"age": [25, 30, -1, None], "name": ["Alice", "Bob", "Charlie", "Dana"]})
        >>> rules = RuleSet()
        >>> rules.add("age", not_null())
        >>> rules.add("age", in_range(0, 120))
        >>> guardian = DataGuard(df)
        >>> report = guardian.validate(rules)
        >>> print(report.summary())
    """

    def __init__(self, dataframe: Any, engine: Optional[str] = None):
        """
        Store the DataFrame and resolve which engine will process it.

        Args:
            dataframe: A Pandas DataFrame or PySpark DataFrame.
            engine: Force engine type ('pandas' or 'spark'). Auto-detected if None.

        Raises:
            ValueError: If ``engine`` is given but is neither 'pandas' nor 'spark'.
        """
        self._dataframe = dataframe
        self._engine = self._detect_engine(dataframe, engine)

    @staticmethod
    def _detect_engine(dataframe: Any, engine: Optional[str]) -> str:
        """Resolve the engine name from an explicit choice or the DataFrame's module."""
        if engine is None:
            # Auto-detect from the module that defines the DataFrame's type.
            module_name = type(dataframe).__module__
            if "pandas" in module_name:
                return "pandas"
            # "pyspark" contains "spark", so one substring test covers both.
            # Anything unrecognised falls back to pandas.
            return "spark" if "spark" in module_name else "pandas"

        if engine in ("pandas", "spark"):
            return engine
        raise ValueError(f"Unsupported engine: {engine}. Use 'pandas' or 'spark'.")

    @property
    def engine(self) -> str:
        """Name of the engine in use ('pandas' or 'spark')."""
        return self._engine

    def validate(self, rules: RuleSet, raise_on_error: bool = False) -> ValidationReport:
        """
        Evaluate every rule in ``rules`` against the wrapped DataFrame.

        Args:
            rules: A RuleSet containing validation rules.
            raise_on_error: If True, raise ValidationError when any rule fails.

        Returns:
            ValidationReport built from the backend's results and the engine name.

        Raises:
            ValidationError: If ``raise_on_error`` is True and the report's
                ``failed_count`` is greater than zero.
        """
        # Backends are imported lazily so only the selected one is loaded.
        if self._engine == "spark":
            from dataguard.spark_engine import validate_spark as run_rules
        else:
            from dataguard.pandas_engine import validate_pandas as run_rules

        outcome = run_rules(self._dataframe, rules)
        report = ValidationReport(results=outcome, engine=self._engine)

        if not raise_on_error or not report.failed_count > 0:
            return report

        from dataguard.exceptions import ValidationError
        raise ValidationError(report)

    def profile(self) -> Dict[str, Dict[str, Any]]:
        """
        Build a per-column profile of the wrapped DataFrame.

        Returns:
            The mapping produced by the selected backend's profiling function.
        """
        if self._engine == "spark":
            from dataguard.spark_engine import profile_spark as build_profile
        else:
            from dataguard.pandas_engine import profile_pandas as build_profile
        return build_profile(self._dataframe)