validatex 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validatex-1.0.0/LICENSE +21 -0
- validatex-1.0.0/PKG-INFO +572 -0
- validatex-1.0.0/README.md +521 -0
- validatex-1.0.0/setup.cfg +4 -0
- validatex-1.0.0/setup.py +64 -0
- validatex-1.0.0/validatex/__init__.py +47 -0
- validatex-1.0.0/validatex/cli/__init__.py +1 -0
- validatex-1.0.0/validatex/cli/main.py +320 -0
- validatex-1.0.0/validatex/config/__init__.py +1 -0
- validatex-1.0.0/validatex/config/loader.py +102 -0
- validatex-1.0.0/validatex/core/__init__.py +14 -0
- validatex-1.0.0/validatex/core/expectation.py +167 -0
- validatex-1.0.0/validatex/core/result.py +500 -0
- validatex-1.0.0/validatex/core/suite.py +142 -0
- validatex-1.0.0/validatex/core/validator.py +87 -0
- validatex-1.0.0/validatex/datasources/__init__.py +15 -0
- validatex-1.0.0/validatex/datasources/base_source.py +40 -0
- validatex-1.0.0/validatex/datasources/csv_source.py +49 -0
- validatex-1.0.0/validatex/datasources/database_source.py +49 -0
- validatex-1.0.0/validatex/datasources/dataframe_source.py +29 -0
- validatex-1.0.0/validatex/datasources/parquet_source.py +41 -0
- validatex-1.0.0/validatex/expectations/__init__.py +6 -0
- validatex-1.0.0/validatex/expectations/aggregate_expectations.py +240 -0
- validatex-1.0.0/validatex/expectations/column_expectations.py +807 -0
- validatex-1.0.0/validatex/expectations/table_expectations.py +228 -0
- validatex-1.0.0/validatex/profiler/__init__.py +1 -0
- validatex-1.0.0/validatex/profiler/profiler.py +300 -0
- validatex-1.0.0/validatex/reporting/__init__.py +1 -0
- validatex-1.0.0/validatex/reporting/html_report.py +748 -0
- validatex-1.0.0/validatex/reporting/json_report.py +15 -0
- validatex-1.0.0/validatex.egg-info/PKG-INFO +572 -0
- validatex-1.0.0/validatex.egg-info/SOURCES.txt +34 -0
- validatex-1.0.0/validatex.egg-info/dependency_links.txt +1 -0
- validatex-1.0.0/validatex.egg-info/entry_points.txt +2 -0
- validatex-1.0.0/validatex.egg-info/requires.txt +24 -0
- validatex-1.0.0/validatex.egg-info/top_level.txt +1 -0
validatex-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Kaviarasan Mani
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
validatex-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,572 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: validatex
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A powerful data quality validation framework inspired by Great Expectations
|
|
5
|
+
Home-page: https://github.com/kaviarasanmani/ValidateX
|
|
6
|
+
Author: Kaviarasan Mani
|
|
7
|
+
Classifier: Development Status :: 4 - Beta
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
17
|
+
Classifier: Topic :: Database
|
|
18
|
+
Requires-Python: >=3.8
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: pandas>=1.3.0
|
|
22
|
+
Requires-Dist: pyyaml>=6.0
|
|
23
|
+
Requires-Dist: jinja2>=3.0
|
|
24
|
+
Requires-Dist: click>=8.0
|
|
25
|
+
Requires-Dist: rich>=12.0
|
|
26
|
+
Requires-Dist: colorama>=0.4.0
|
|
27
|
+
Provides-Extra: spark
|
|
28
|
+
Requires-Dist: pyspark>=3.0.0; extra == "spark"
|
|
29
|
+
Provides-Extra: database
|
|
30
|
+
Requires-Dist: sqlalchemy>=1.4.0; extra == "database"
|
|
31
|
+
Provides-Extra: all
|
|
32
|
+
Requires-Dist: pyspark>=3.0.0; extra == "all"
|
|
33
|
+
Requires-Dist: sqlalchemy>=1.4.0; extra == "all"
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
36
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
37
|
+
Requires-Dist: black>=22.0; extra == "dev"
|
|
38
|
+
Requires-Dist: flake8>=5.0; extra == "dev"
|
|
39
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
40
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
41
|
+
Dynamic: author
|
|
42
|
+
Dynamic: classifier
|
|
43
|
+
Dynamic: description
|
|
44
|
+
Dynamic: description-content-type
|
|
45
|
+
Dynamic: home-page
|
|
46
|
+
Dynamic: license-file
|
|
47
|
+
Dynamic: provides-extra
|
|
48
|
+
Dynamic: requires-dist
|
|
49
|
+
Dynamic: requires-python
|
|
50
|
+
Dynamic: summary
|
|
51
|
+
|
|
52
|
+
<p align="center">
|
|
53
|
+
<h1 align="center">๐ ValidateX</h1>
|
|
54
|
+
<p align="center">
|
|
55
|
+
<strong>A powerful, extensible data quality validation framework for Python.</strong>
|
|
56
|
+
</p>
|
|
57
|
+
<p align="center">
|
|
58
|
+
<!-- Build & Tests -->
|
|
59
|
+
<a href="https://github.com/kaviarasanmani/ValidateX/actions/workflows/tests.yml"><img src="https://img.shields.io/github/actions/workflow/status/kaviarasanmani/ValidateX/tests.yml?branch=main" alt="Build Status (Tests & CI)"></a>
|
|
60
|
+
<img src="https://img.shields.io/badge/coverage-96%25-brightgreen" alt="Code Coverage">
|
|
61
|
+
<img src="https://img.shields.io/badge/tests-66%20passed-brightgreen" alt="Test Passing Rate">
|
|
62
|
+
<!-- Package & Language -->
|
|
63
|
+
<a href="https://pypi.org/project/validatex/"><img src="https://img.shields.io/pypi/v/validatex.svg" alt="PyPI Latest Version"></a>
|
|
64
|
+
<img src="https://img.shields.io/badge/python-3.8+-blue?logo=python&logoColor=white" alt="Supported Python Versions">
|
|
65
|
+
<!-- License & Style -->
|
|
66
|
+
<img src="https://img.shields.io/badge/license-MIT-green" alt="MIT License">
|
|
67
|
+
<img src="https://img.shields.io/badge/code%20style-black-000000" alt="Code Style: black">
|
|
68
|
+
</p>
|
|
69
|
+
<p align="center">
|
|
70
|
+
<em>Badges represent (from left to right): CI/CD Build Status, Code Coverage, Test Count, Latest PyPI Release, Supported Python Versions, License, and Code Style.</em>
|
|
71
|
+
</p>
|
|
72
|
+
</p>
|
|
73
|
+
|
|
74
|
+
ValidateX provides a comprehensive suite of tools for validating, profiling, and monitoring data quality across **Pandas** and **PySpark** DataFrames. Inspired by Great Expectations, it offers a simpler, more focused approach with modern, production-ready HTML reports and an intuitive API.
|
|
75
|
+
|
|
76
|
+
## ๐ Table of Contents
|
|
77
|
+
- [🖼️ Report Preview](#️-report-preview)
|
|
78
|
+
- [🤔 Why ValidateX?](#-why-validatex)
|
|
79
|
+
- [🎯 Who Is This For?](#-who-is-this-for)
|
|
80
|
+
- [✨ Features](#-features)
|
|
81
|
+
- [📦 Installation](#-installation)
|
|
82
|
+
- [🚀 Quick Start](#-quick-start)
|
|
83
|
+
- [🤖 Automate with CI/CD](#-automate-with-cicd)
|
|
84
|
+
- [🎯 Data Quality Score](#-data-quality-score)
|
|
85
|
+
- [๐ Available Expectations](#-available-expectations)
|
|
86
|
+
- [๐ Roadmap](#-roadmap)
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## 🖼️ Report Preview
|
|
91
|
+
|
|
92
|
+
<p align="center">
|
|
93
|
+
<img src="docs/screenshots/report_overview.png" alt="ValidateX Report — Overview" width="100%">
|
|
94
|
+
</p>
|
|
95
|
+
|
|
96
|
+
<table>
|
|
97
|
+
<tr>
|
|
98
|
+
<td width="50%">
|
|
99
|
+
<img src="docs/screenshots/report_header.png" alt="Column Health Summary" width="100%">
|
|
100
|
+
<p align="center"><em>Column Health Summary with mini bar charts</em></p>
|
|
101
|
+
</td>
|
|
102
|
+
<td width="50%">
|
|
103
|
+
<img src="docs/screenshots/expectations_table.png" alt="Expectations Table" width="100%">
|
|
104
|
+
<p align="center"><em>Severity-tagged Expectations with human-readable output</em></p>
|
|
105
|
+
</td>
|
|
106
|
+
</tr>
|
|
107
|
+
</table>
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## 🤔 Why ValidateX?
|
|
112
|
+
|
|
113
|
+
| Feature | **ValidateX** | **Great Expectations** |
|
|
114
|
+
|---|---|---|
|
|
115
|
+
| **Setup** | `pip install` — validate in 5 lines | Multi-step setup with contexts & stores |
|
|
116
|
+
| **API** | Fluent, chainable Python API | Heavy config system |
|
|
117
|
+
| **Severity levels** | ✅ (Critical, Warning, Info) | ❌ |
|
|
118
|
+
| **Quality score** | ✅ (Weighted 0–100) | ❌ |
|
|
119
|
+
| **Auto-suggest expectations**| โ | โ |
|
|
120
|
+
| **Reports** | Modern dark-theme HTML with minicharts | Basic data docs |
|
|
121
|
+
| **Output Data Types** | Clean native Python types | NumPy types leak into JSON |
|
|
122
|
+
| **PySpark Support** | โ | โ |
|
|
123
|
+
| **Polars Support** | Soon | โ |
|
|
124
|
+
| **CI/CD friendly CLI** | โ | โ |
|
|
125
|
+
| **Downloads** | JSON / CSV / clipboard built into report | Separate export |
|
|
126
|
+
| **Learning curve** | Minutes | Hours to days |
|
|
127
|
+
|
|
128
|
+
ValidateX is not a replacement for Great Expectations — it's a **focused alternative** for teams that want production-grade data validation without the overhead.
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
## 🎯 Who Is This For?
|
|
133
|
+
|
|
134
|
+
- **Startup data teams** — Ship data quality checks in minutes, not days
|
|
135
|
+
- **ML engineers** — Validate feature stores and training data before model runs
|
|
136
|
+
- **CI/CD pipelines** — Gate deployments on data quality with a single CLI command
|
|
137
|
+
- **Analytics teams** — Catch data issues before they reach dashboards
|
|
138
|
+
- **dbt users** — Lightweight validation alongside your transformation layer
|
|
139
|
+
- **Data platform teams** — Monitor data quality across dozens of tables
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## ✨ Features
|
|
144
|
+
|
|
145
|
+
| Feature | Description |
|
|
146
|
+
|---------|-------------|
|
|
147
|
+
| **25+ Built-in Expectations** | Column-level, table-level, and aggregate validations |
|
|
148
|
+
| **Dual Engine Support** | Pandas and PySpark execution engines |
|
|
149
|
+
| **🎯 Data Quality Score** | Weighted score (0–100) based on severity of checks |
|
|
150
|
+
| **🔴🟡🔵 Severity Levels** | Critical / Warning / Info classification for every expectation |
|
|
151
|
+
| **๐ Column Health Summary** | At-a-glance per-column health with mini bar charts |
|
|
152
|
+
| **Modern HTML Reports** | Stunning, self-contained dark-theme reports with animations |
|
|
153
|
+
| **📥 Download Buttons** | Export reports as JSON, CSV, or copy summary to clipboard |
|
|
154
|
+
| **๐ Drift Detection** | Track changes between validation runs |
|
|
155
|
+
| **Data Profiling** | Auto-analyse datasets and suggest expectations |
|
|
156
|
+
| **YAML/JSON Config** | Define expectations declaratively |
|
|
157
|
+
| **CLI Interface** | Run validations from the command line |
|
|
158
|
+
| **Checkpoint System** | Tie data sources and suites together |
|
|
159
|
+
| **Extensible** | Create custom expectations with the registry pattern |
|
|
160
|
+
| **Clean Output** | All values are native Python types — zero NumPy leakage |
|
|
161
|
+
|
|
162
|
+
---
|
|
163
|
+
|
|
164
|
+
## 📦 Installation
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
# Basic install from a source checkout (for the published release: pip install validatex)
|
|
168
|
+
pip install -e .
|
|
169
|
+
|
|
170
|
+
# With PySpark support
|
|
171
|
+
pip install -e ".[spark]"
|
|
172
|
+
|
|
173
|
+
# With database support
|
|
174
|
+
pip install -e ".[database]"
|
|
175
|
+
|
|
176
|
+
# Full install
|
|
177
|
+
pip install -e ".[all]"
|
|
178
|
+
|
|
179
|
+
# Development
|
|
180
|
+
pip install -e ".[dev]"
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## 🚀 Quick Start
|
|
186
|
+
|
|
187
|
+
### Python API
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
import pandas as pd
|
|
191
|
+
import validatex as vx
|
|
192
|
+
|
|
193
|
+
# Create your data
|
|
194
|
+
df = pd.DataFrame({
|
|
195
|
+
"user_id": [1, 2, 3, 4, 5],
|
|
196
|
+
"name": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
|
|
197
|
+
"age": [25, 30, 35, 28, 42],
|
|
198
|
+
"email": ["alice@test.com", "bob@test.com", "charlie@test.com",
|
|
199
|
+
"diana@test.com", "eve@test.com"],
|
|
200
|
+
"status": ["active", "active", "inactive", "active", "pending"],
|
|
201
|
+
})
|
|
202
|
+
|
|
203
|
+
# Build an expectation suite
|
|
204
|
+
suite = (
|
|
205
|
+
vx.ExpectationSuite("user_quality")
|
|
206
|
+
.add("expect_column_to_not_be_null", column="user_id")
|
|
207
|
+
.add("expect_column_values_to_be_unique", column="user_id")
|
|
208
|
+
.add("expect_column_values_to_be_between", column="age", min_value=0, max_value=150)
|
|
209
|
+
.add("expect_column_values_to_be_in_set",
|
|
210
|
+
column="status", value_set=["active", "inactive", "pending"])
|
|
211
|
+
.add("expect_column_values_to_match_regex",
|
|
212
|
+
column="email", regex=r"^[\w.]+@[\w]+\.\w+$")
|
|
213
|
+
)
|
|
214
|
+
|
|
215
|
+
# Validate
|
|
216
|
+
result = vx.validate(df, suite)
|
|
217
|
+
|
|
218
|
+
# Print summary (includes Quality Score)
|
|
219
|
+
print(result.summary())
|
|
220
|
+
|
|
221
|
+
# Generate reports
|
|
222
|
+
result.to_html("report.html")
|
|
223
|
+
result.to_json_file("report.json")
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
### CLI
|
|
227
|
+
|
|
228
|
+
```bash
|
|
229
|
+
# Initialize a project
|
|
230
|
+
validatex init
|
|
231
|
+
|
|
232
|
+
# Profile a dataset
|
|
233
|
+
validatex profile --data data.csv --suggest --output auto_suite.yaml
|
|
234
|
+
|
|
235
|
+
# Run validation
|
|
236
|
+
validatex validate --data data.csv --suite suite.yaml --report report.html
|
|
237
|
+
|
|
238
|
+
# Run checkpoint
|
|
239
|
+
validatex run --checkpoint checkpoint.yaml
|
|
240
|
+
|
|
241
|
+
# List available expectations
|
|
242
|
+
validatex list-expectations
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
---
|
|
246
|
+
|
|
247
|
+
## 🤖 Automate with CI/CD
|
|
248
|
+
|
|
249
|
+
ValidateX is designed to be lightweight and CI-friendly. You can easily integrate it into your GitHub Actions, GitLab CI, or Jenkins pipelines to gate deployments on data quality.
|
|
250
|
+
|
|
251
|
+
**Example: GitHub Actions**
|
|
252
|
+
```yaml
|
|
253
|
+
name: Data Quality Validation
|
|
254
|
+
on: [push, pull_request]
|
|
255
|
+
|
|
256
|
+
jobs:
|
|
257
|
+
validate-data:
|
|
258
|
+
runs-on: ubuntu-latest
|
|
259
|
+
steps:
|
|
260
|
+
- uses: actions/checkout@v4
|
|
261
|
+
|
|
262
|
+
- name: Set up Python
|
|
263
|
+
uses: actions/setup-python@v5
|
|
264
|
+
with:
|
|
265
|
+
python-version: '3.11'
|
|
266
|
+
|
|
267
|
+
- name: Install ValidateX
|
|
268
|
+
run: pip install validatex
|
|
269
|
+
|
|
270
|
+
- name: Run Data Validation
|
|
271
|
+
run: |
|
|
272
|
+
validatex validate \
|
|
273
|
+
--data data/production_data.csv \
|
|
274
|
+
--suite tests/data_quality/suite.yaml \
|
|
275
|
+
--report dq_report.html
|
|
276
|
+
|
|
277
|
+
- name: Archive production artifacts
|
|
278
|
+
uses: actions/upload-artifact@v4
|
|
279
|
+
if: always()
|
|
280
|
+
with:
|
|
281
|
+
name: validatex-report
|
|
282
|
+
path: dq_report.html
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
---
|
|
286
|
+
|
|
287
|
+
## 🎯 Data Quality Score
|
|
288
|
+
|
|
289
|
+
ValidateX computes a **weighted quality score** (0–100) based on the severity of each expectation:
|
|
290
|
+
|
|
291
|
+
| Severity | Weight | Example Expectations |
|
|
292
|
+
|----------|--------|---------------------|
|
|
293
|
+
| 🔴 **Critical** | ×3 | Null checks, uniqueness, column existence, row count |
|
|
294
|
+
| 🟡 **Warning** | ×2 | Range checks, set membership, regex, type checks |
|
|
295
|
+
| 🔵 **Info** | ×1 | Mean/stdev bounds, string lengths, distinct values |
|
|
296
|
+
|
|
297
|
+
**Formula:** `Score = 100 × (weighted_passed / weighted_total)`
|
|
298
|
+
|
|
299
|
+
A critical failure impacts the score 3× more than an info-level check. This gives decision-makers a **single number** to assess data health.
|
|
300
|
+
|
|
301
|
+
```python
|
|
302
|
+
result = vx.validate(df, suite)
|
|
303
|
+
score = result.compute_quality_score()
|
|
304
|
+
print(f"Data Quality Score: {score}/100")
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
### Custom Severity
|
|
308
|
+
|
|
309
|
+
Override the default severity on any expectation via meta:
|
|
310
|
+
|
|
311
|
+
```yaml
|
|
312
|
+
expectations:
|
|
313
|
+
- expectation_type: expect_column_mean_to_be_between
|
|
314
|
+
column: revenue
|
|
315
|
+
kwargs:
|
|
316
|
+
min_value: 1000
|
|
317
|
+
max_value: 50000
|
|
318
|
+
meta:
|
|
319
|
+
severity: critical # Override default "info" → "critical"
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
---
|
|
323
|
+
|
|
324
|
+
## ๐ Column Health Summary
|
|
325
|
+
|
|
326
|
+
The HTML report includes a **Column Health Summary** that aggregates all expectations per column:
|
|
327
|
+
|
|
328
|
+
| Column | Checks | Passed | Failed | Health | Null % | Unique % |
|
|
329
|
+
|--------|--------|--------|--------|--------|--------|----------|
|
|
330
|
+
| user_id | 3 | 3 | 0 | 100% ███ | 0.0% | 100.0% ███ |
|
|
331
|
+
| email | 4 | 4 | 0 | 100% ███ | 0.0% | 100.0% ███ |
|
|
332
|
+
| status | 1 | 1 | 0 | 100% ███ | — | — |
|
|
333
|
+
|
|
334
|
+
Each metric includes a **mini CSS bar chart** for instant visual scanning.
|
|
335
|
+
|
|
336
|
+
```python
|
|
337
|
+
for col in result.column_health():
|
|
338
|
+
print(f"{col.column}: {col.health_score}% health, "
|
|
339
|
+
f"{col.passed}/{col.checks} passed")
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
---
|
|
343
|
+
|
|
344
|
+
## ๐ Available Expectations
|
|
345
|
+
|
|
346
|
+
### Column-Level (16)
|
|
347
|
+
| Expectation | Severity | Description |
|
|
348
|
+
|------------|----------|-------------|
|
|
349
|
+
| `expect_column_to_exist` | 🔴 Critical | Column exists in DataFrame |
|
|
350
|
+
| `expect_column_to_not_be_null` | 🔴 Critical | No null values |
|
|
351
|
+
| `expect_column_values_to_be_unique` | 🔴 Critical | All values unique |
|
|
352
|
+
| `expect_column_values_to_be_between` | 🟡 Warning | Values within range |
|
|
353
|
+
| `expect_column_values_to_be_in_set` | 🟡 Warning | Values in allowed set |
|
|
354
|
+
| `expect_column_values_to_not_be_in_set` | 🟡 Warning | Values not in forbidden set |
|
|
355
|
+
| `expect_column_values_to_match_regex` | 🟡 Warning | Values match regex pattern |
|
|
356
|
+
| `expect_column_values_to_be_of_type` | 🟡 Warning | Column dtype matches |
|
|
357
|
+
| `expect_column_values_to_be_dateutil_parseable` | 🟡 Warning | Values parseable as dates |
|
|
358
|
+
| `expect_column_value_lengths_to_be_between` | 🔵 Info | String lengths within range |
|
|
359
|
+
| `expect_column_max_to_be_between` | 🔵 Info | Column max within bounds |
|
|
360
|
+
| `expect_column_min_to_be_between` | 🔵 Info | Column min within bounds |
|
|
361
|
+
| `expect_column_mean_to_be_between` | 🔵 Info | Column mean within bounds |
|
|
362
|
+
| `expect_column_stdev_to_be_between` | 🔵 Info | Column std dev within bounds |
|
|
363
|
+
| `expect_column_distinct_values_to_be_in_set` | 🔵 Info | All distinct values in set |
|
|
364
|
+
| `expect_column_proportion_of_unique_values_to_be_between` | 🔵 Info | Uniqueness ratio in range |
|
|
365
|
+
|
|
366
|
+
### Table-Level (5)
|
|
367
|
+
| Expectation | Severity | Description |
|
|
368
|
+
|------------|----------|-------------|
|
|
369
|
+
| `expect_table_row_count_to_equal` | 🔴 Critical | Exact row count |
|
|
370
|
+
| `expect_table_row_count_to_be_between` | 🔴 Critical | Row count in range |
|
|
371
|
+
| `expect_table_columns_to_match_ordered_list` | 🔴 Critical | Column order matches |
|
|
372
|
+
| `expect_table_columns_to_match_set` | 🔴 Critical | Column names match (unordered) |
|
|
373
|
+
| `expect_table_column_count_to_equal` | 🔴 Critical | Exact column count |
|
|
374
|
+
|
|
375
|
+
### Aggregate / Cross-Column (4)
|
|
376
|
+
| Expectation | Severity | Description |
|
|
377
|
+
|------------|----------|-------------|
|
|
378
|
+
| `expect_column_pair_values_a_to_be_greater_than_b` | 🟡 Warning | Column A > Column B |
|
|
379
|
+
| `expect_column_pair_values_to_be_equal` | 🟡 Warning | Two columns equal |
|
|
380
|
+
| `expect_multicolumn_sum_to_equal` | 🟡 Warning | Row-wise sum equals target |
|
|
381
|
+
| `expect_compound_columns_to_be_unique` | 🔴 Critical | Compound key uniqueness |
|
|
382
|
+
|
|
383
|
+
---
|
|
384
|
+
|
|
385
|
+
## ๐ Data Profiling
|
|
386
|
+
|
|
387
|
+
```python
|
|
388
|
+
import pandas as pd
|
|
389
|
+
from validatex import DataProfiler
|
|
390
|
+
|
|
391
|
+
df = pd.read_csv("data.csv")
|
|
392
|
+
profiler = DataProfiler()
|
|
393
|
+
|
|
394
|
+
# Profile
|
|
395
|
+
profile = profiler.profile(df)
|
|
396
|
+
print(profile.summary())
|
|
397
|
+
|
|
398
|
+
# Auto-suggest expectations
|
|
399
|
+
suite = profiler.suggest_expectations(df, suite_name="auto_suite")
|
|
400
|
+
suite.save("auto_suite.yaml")
|
|
401
|
+
```
|
|
402
|
+
|
|
403
|
+
---
|
|
404
|
+
|
|
405
|
+
## 🔧 YAML Suite Configuration
|
|
406
|
+
|
|
407
|
+
```yaml
|
|
408
|
+
suite_name: my_data_quality
|
|
409
|
+
meta:
|
|
410
|
+
description: "Quality checks for production data"
|
|
411
|
+
|
|
412
|
+
expectations:
|
|
413
|
+
- expectation_type: expect_column_to_not_be_null
|
|
414
|
+
column: id
|
|
415
|
+
meta:
|
|
416
|
+
severity: critical
|
|
417
|
+
|
|
418
|
+
- expectation_type: expect_column_values_to_be_between
|
|
419
|
+
column: age
|
|
420
|
+
kwargs:
|
|
421
|
+
min_value: 0
|
|
422
|
+
max_value: 150
|
|
423
|
+
|
|
424
|
+
- expectation_type: expect_column_values_to_be_in_set
|
|
425
|
+
column: status
|
|
426
|
+
kwargs:
|
|
427
|
+
value_set: ["active", "inactive"]
|
|
428
|
+
```
|
|
429
|
+
|
|
430
|
+
---
|
|
431
|
+
|
|
432
|
+
## 🏗️ Architecture
|
|
433
|
+
|
|
434
|
+
```
|
|
435
|
+
validatex/
|
|
436
|
+
├── core/
|
|
437
|
+
│   ├── expectation.py  # Base class + registry
|
|
438
|
+
│   ├── result.py  # ValidationResult, QualityScore, Severity, ColumnHealth
|
|
439
|
+
│   ├── suite.py  # ExpectationSuite (fluent API)
|
|
440
|
+
│   └── validator.py  # Validation orchestrator
|
|
441
|
+
├── expectations/
|
|
442
|
+
│   ├── column_expectations.py  # 16 column-level checks
|
|
443
|
+
│   ├── table_expectations.py  # 5 table-level checks
|
|
444
|
+
│   └── aggregate_expectations.py  # 4 cross-column checks
|
|
445
|
+
├── datasources/
|
|
446
|
+
│   ├── csv_source.py  # CSV files
|
|
447
|
+
│   ├── parquet_source.py  # Parquet files
|
|
448
|
+
│   ├── database_source.py  # SQL databases (SQLAlchemy)
|
|
449
|
+
│   └── dataframe_source.py  # Direct DataFrames
|
|
450
|
+
├── profiler/
|
|
451
|
+
│   └── profiler.py  # Auto-profiling & suggestion engine
|
|
452
|
+
├── reporting/
|
|
453
|
+
│   ├── html_report.py  # Production HTML reports
|
|
454
|
+
│   └── json_report.py  # JSON reports
|
|
455
|
+
├── config/
|
|
456
|
+
│   └── loader.py  # YAML/JSON config loading
|
|
457
|
+
└── cli/
|
|
458
|
+
    └── main.py  # CLI (validate, run, profile, init, list-expectations)
|
|
459
|
+
```
|
|
460
|
+
|
|
461
|
+
---
|
|
462
|
+
|
|
463
|
+
## 🧪 Testing
|
|
464
|
+
|
|
465
|
+
```bash
|
|
466
|
+
# Run all tests (66 tests)
|
|
467
|
+
pytest tests/ -v
|
|
468
|
+
|
|
469
|
+
# Run with coverage
|
|
470
|
+
pytest tests/ -v --cov=validatex --cov-report=html
|
|
471
|
+
|
|
472
|
+
# Unit tests only
|
|
473
|
+
pytest tests/unit/ -v
|
|
474
|
+
|
|
475
|
+
# Integration tests
|
|
476
|
+
pytest tests/integration/ -v
|
|
477
|
+
```
|
|
478
|
+
|
|
479
|
+
---
|
|
480
|
+
|
|
481
|
+
## ๐ค Creating Custom Expectations
|
|
482
|
+
|
|
483
|
+
```python
|
|
484
|
+
from dataclasses import dataclass, field
|
|
485
|
+
from validatex.core.expectation import Expectation, register_expectation
|
|
486
|
+
from validatex.core.result import ExpectationResult
|
|
487
|
+
|
|
488
|
+
@register_expectation
|
|
489
|
+
@dataclass
|
|
490
|
+
class ExpectColumnValuesToBePositive(Expectation):
|
|
491
|
+
"""Expect all values in a numeric column to be positive."""
|
|
492
|
+
|
|
493
|
+
expectation_type: str = field(
|
|
494
|
+
init=False, default="expect_column_values_to_be_positive"
|
|
495
|
+
)
|
|
496
|
+
|
|
497
|
+
def _validate_pandas(self, df) -> ExpectationResult:
|
|
498
|
+
series = df[self.column].dropna()
|
|
499
|
+
total = len(series)
|
|
500
|
+
negative_mask = series <= 0
|
|
501
|
+
unexpected_count = int(negative_mask.sum())
|
|
502
|
+
pct = (unexpected_count / total * 100) if total > 0 else 0.0
|
|
503
|
+
|
|
504
|
+
return self._build_result(
|
|
505
|
+
success=(unexpected_count == 0),
|
|
506
|
+
element_count=total,
|
|
507
|
+
unexpected_count=unexpected_count,
|
|
508
|
+
unexpected_percent=pct,
|
|
509
|
+
unexpected_values=series[negative_mask].tolist()[:20],
|
|
510
|
+
)
|
|
511
|
+
```
|
|
512
|
+
|
|
513
|
+
---
|
|
514
|
+
|
|
515
|
+
## 🧹 Clean Output
|
|
516
|
+
|
|
517
|
+
ValidateX converts all internal types to native Python before rendering. You'll never see `np.int64(20)` in reports or JSON — only clean `20`.
|
|
518
|
+
|
|
519
|
+
```python
|
|
520
|
+
result = vx.validate(df, suite)
|
|
521
|
+
data = result.to_dict()
|
|
522
|
+
|
|
523
|
+
# Observed values are always clean:
|
|
524
|
+
# {'min': 20, 'max': 69} — NOT {'min': np.int64(20), ...}
|
|
525
|
+
# "Unique: 100/100 (100.0%)" — NOT "100 unique out of 100"
|
|
526
|
+
# "Distinct values: 3" — NOT "{'unique_values': 3}"
|
|
527
|
+
```
|
|
528
|
+
|
|
529
|
+
---
|
|
530
|
+
|
|
531
|
+
## ๐ Roadmap
|
|
532
|
+
|
|
533
|
+
- [x] 25+ built-in expectations (column, table, aggregate)
|
|
534
|
+
- [x] Pandas + PySpark dual-engine support
|
|
535
|
+
- [x] Severity modeling (Critical / Warning / Info)
|
|
536
|
+
- [x] Weighted data quality score (0–100)
|
|
537
|
+
- [x] Column health summary with mini charts
|
|
538
|
+
- [x] Modern HTML reports with dark theme
|
|
539
|
+
- [x] Download buttons (JSON, CSV, clipboard)
|
|
540
|
+
- [x] Drift detection foundation
|
|
541
|
+
- [x] Data profiler with auto-suggestion
|
|
542
|
+
- [x] CLI with validate, profile, run, init commands
|
|
543
|
+
- [x] YAML/JSON declarative configuration
|
|
544
|
+
- [x] Native Python type sanitization
|
|
545
|
+
- [ ] Slack / Teams notifications on failure
|
|
546
|
+
- [ ] GitHub Action template for CI/CD
|
|
547
|
+
- [ ] Polars engine support
|
|
548
|
+
- [ ] Baseline history tracking & trend charts
|
|
549
|
+
- [ ] Anomaly detection expectations
|
|
550
|
+
- [ ] Great Expectations suite import/migration
|
|
551
|
+
- [ ] Web dashboard for multi-dataset monitoring
|
|
552
|
+
- [ ] dbt integration plugin
|
|
553
|
+
|
|
554
|
+
### Versioning
|
|
555
|
+
ValidateX follows [Semantic Versioning](https://semver.org/).
|
|
556
|
+
- **MAJOR** version for incompatible API changes
|
|
557
|
+
- **MINOR** version for backwards-compatible new functionality
|
|
558
|
+
- **PATCH** version for backwards-compatible bug fixes
|
|
559
|
+
|
|
560
|
+
---
|
|
561
|
+
|
|
562
|
+
## ๐ License
|
|
563
|
+
|
|
564
|
+
MIT License
|
|
565
|
+
|
|
566
|
+
---
|
|
567
|
+
|
|
568
|
+
<p align="center">
|
|
569
|
+
<strong>Built with ❤️ by the ValidateX Team</strong>
|
|
570
|
+
<br>
|
|
571
|
+
<sub>If this project helps you, consider giving it a ⭐</sub>
|
|
572
|
+
</p>
|