dqscore 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dqscore-0.1.0/LICENSE +21 -0
- dqscore-0.1.0/PKG-INFO +184 -0
- dqscore-0.1.0/README.md +154 -0
- dqscore-0.1.0/pyproject.toml +47 -0
- dqscore-0.1.0/setup.cfg +4 -0
- dqscore-0.1.0/src/dqscore/__init__.py +40 -0
- dqscore-0.1.0/src/dqscore/autoscan.py +60 -0
- dqscore-0.1.0/src/dqscore/checks.py +127 -0
- dqscore-0.1.0/src/dqscore/cli.py +80 -0
- dqscore-0.1.0/src/dqscore/profiling.py +134 -0
- dqscore-0.1.0/src/dqscore/py.typed +0 -0
- dqscore-0.1.0/src/dqscore/report.py +199 -0
- dqscore-0.1.0/src/dqscore/validator.py +189 -0
- dqscore-0.1.0/src/dqscore.egg-info/PKG-INFO +184 -0
- dqscore-0.1.0/src/dqscore.egg-info/SOURCES.txt +19 -0
- dqscore-0.1.0/src/dqscore.egg-info/dependency_links.txt +1 -0
- dqscore-0.1.0/src/dqscore.egg-info/entry_points.txt +2 -0
- dqscore-0.1.0/src/dqscore.egg-info/requires.txt +5 -0
- dqscore-0.1.0/src/dqscore.egg-info/top_level.txt +1 -0
- dqscore-0.1.0/tests/test_checks.py +69 -0
- dqscore-0.1.0/tests/test_suite.py +113 -0
dqscore-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Digvijay Waghela
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
dqscore-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dqscore
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight data quality toolkit for pandas: profiling, validation schemas, and a zero-config scan.
|
|
5
|
+
Author: Digvijay Waghela
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/dgvj-work/dqscore
|
|
8
|
+
Project-URL: Repository, https://github.com/dgvj-work/dqscore
|
|
9
|
+
Project-URL: Issues, https://github.com/dgvj-work/dqscore/issues
|
|
10
|
+
Keywords: data-quality,pandas,validation,data-profiling,etl,dataframe
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering
|
|
21
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
22
|
+
Requires-Python: >=3.8
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: pandas>=1.3
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
|
|
31
|
+
# dqscore
|
|
32
|
+
|
|
33
|
+
> A lightweight **data quality toolkit for pandas** — profile any DataFrame, declare
|
|
34
|
+
> expectations with a fluent schema, or run a zero-config scan. No heavy dependencies,
|
|
35
|
+
> no config files required.
|
|
36
|
+
|
|
37
|
+
[CI status](https://github.com/dgvj-work/dqscore/actions/workflows/ci.yml)
|
|
38
|
+
[Python 3.8+](https://www.python.org/)
|
|
39
|
+
[License: MIT](LICENSE)
|
|
40
|
+
|
|
41
|
+
`dqscore` helps you catch the boring-but-costly data problems — nulls where there
|
|
42
|
+
shouldn't be any, duplicate keys, out-of-range values, malformed strings — before
|
|
43
|
+
they reach a model, a dashboard, or a stakeholder.
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
## Why this exists
|
|
47
|
+
Data quality issues are the silent killers of analytics and ML work. A null in the wrong column, a duplicate primary key, a value outside its expected range — these don't crash your pipeline. They quietly corrupt your output, and you find out three weeks later in a stakeholder meeting.
|
|
48
|
+
The Python ecosystem already has excellent tools for this. Great Expectations is comprehensive and battle-tested. Pandera offers powerful schema-based validation. ydata-profiling produces rich exploratory reports. If you're building a long-lived production data platform, those are the right answers.
|
|
49
|
+
But there's a gap in shape. When an analyst gets a fresh CSV and wants a fast read on whether it's trustworthy, the existing tools ask for a lot upfront — a schema, a config, a project structure, sometimes a framework integration. The lightest possible question — is this data OK? — doesn't have a one-line answer in any of them. And once you do set up checks, getting a single number you can put on a dashboard, or a non-zero exit code you can wire into CI, often needs custom code on top.
|
|
50
|
+
dqscore is built for that middle ground. It has one dependency (pandas) and three things to learn: profile a DataFrame, declare a schema with a fluent API, or run a zero-config scan that infers sensible defaults. Every validation produces a 0–100 quality score and a report that exports to HTML, Markdown, or JSON. The CLI returns exit code 1 on failure, so dqscore scan data.csv drops straight into a CI pipeline or a pre-commit hook with no glue code.
|
|
51
|
+
It's not a replacement for Great Expectations or pandera. It's the tool you reach for at the start of a project, or when reviewing a new dataset, or when you want a simple quality gate in CI without standing up a whole framework. That's the gap, and I think it's a useful one to fill — especially for individuals, smaller teams, and educators where the ceremony of heavier tools is the actual barrier to checking data at all.
|
|
52
|
+
The package is MIT-licensed and feedback is welcome. If a check is missing, a report format would be useful, or the auto-scan heuristics could be smarter for your data, open an issue.
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Why dqscore?
|
|
57
|
+
|
|
58
|
+
- **Tiny surface area.** Three things to learn: `profile`, `Schema`, `auto_scan`.
|
|
59
|
+
- **Readable reports.** Every result exports to dict, JSON, Markdown, or styled HTML.
|
|
60
|
+
- **Scoreable.** Each validation produces a 0–100 quality score for dashboards/CI.
|
|
61
|
+
- **CLI included.** `dqscore scan data.csv` returns a non-zero exit code on failure,
|
|
62
|
+
so it drops straight into a pipeline or pre-commit hook.
|
|
63
|
+
- **One dependency:** pandas.
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## Installation
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install dqscore
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Or install the latest from source:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
git clone https://github.com/dgvj-work/dqscore.git
|
|
77
|
+
cd dqscore
|
|
78
|
+
pip install -e ".[dev]"
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## Quick start
|
|
84
|
+
|
|
85
|
+
### 1. Profile a DataFrame
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
import pandas as pd
|
|
89
|
+
import dqscore as dq
|
|
90
|
+
|
|
91
|
+
df = pd.read_csv("customers.csv")
|
|
92
|
+
profile = dq.profile(df)
|
|
93
|
+
|
|
94
|
+
print(profile.to_markdown()) # per-column stats
|
|
95
|
+
profile.to_html("profile.html")
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### 2. Validate against a schema
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
schema = dq.Schema("customers")
|
|
102
|
+
schema.column("id").not_null().unique()
|
|
103
|
+
schema.column("age").in_range(0, 120)
|
|
104
|
+
schema.column("email").matches(r"^[^@]+@[^@]+\.[^@]+$")
|
|
105
|
+
schema.column("country").in_set(["US", "CA", "MX"])
|
|
106
|
+
schema.no_duplicate_rows()
|
|
107
|
+
|
|
108
|
+
result = schema.validate(df)
|
|
109
|
+
|
|
110
|
+
print(result.summary()) # human-readable report
|
|
111
|
+
print("Quality score:", result.score)
|
|
112
|
+
result.to_html("dq_report.html")
|
|
113
|
+
|
|
114
|
+
if not result.passed:
|
|
115
|
+
raise SystemExit("Data quality checks failed")
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### 3. Zero-config scan
|
|
119
|
+
|
|
120
|
+
When you just want a quick read on a new file:
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
result = dq.auto_scan(df) # checks nulls, duplicate keys, duplicate rows
|
|
124
|
+
print(result.summary())
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
## Command line
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
# Profile every column
|
|
133
|
+
dqscore profile data.csv --html profile.html
|
|
134
|
+
|
|
135
|
+
# Quick quality scan (exit code 1 if it fails — great for CI)
|
|
136
|
+
dqscore scan data.csv --json report.json
|
|
137
|
+
dqscore scan data.csv --max-null-pct 5
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
---
|
|
141
|
+
|
|
142
|
+
## Available checks
|
|
143
|
+
|
|
144
|
+
| Method | Fails when… |
|
|
145
|
+
| ----------------------------------- | -------------------------------------------- |
|
|
146
|
+
| `not_null()` | value is null / NaN / NaT |
|
|
147
|
+
| `unique()` | a non-null value occurs more than once |
|
|
148
|
+
| `in_range(min, max, inclusive)` | numeric value is outside the bounds |
|
|
149
|
+
| `in_set([...])` | value is not one of the allowed values |
|
|
150
|
+
| `matches(pattern, full_match)` | string does not match the regex |
|
|
151
|
+
| `is_numeric()` / `is_integer()` | value can't be parsed as a number / integer |
|
|
152
|
+
| `is_datetime(fmt)` | value can't be parsed as a date/time |
|
|
153
|
+
| `string_length(min_len, max_len)` | string length is out of bounds |
|
|
154
|
+
| `custom(fn, name)` | your function returns `True` for a row |
|
|
155
|
+
| `Schema.no_duplicate_rows(subset)` | rows are exact duplicates |
|
|
156
|
+
|
|
157
|
+
Checks chain on a column and most let nulls pass, so `not_null()` stays the single
|
|
158
|
+
source of truth for missing values:
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
schema.column("score").not_null().is_numeric().in_range(0, 100)
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
---
|
|
165
|
+
|
|
166
|
+
## Reports & scoring
|
|
167
|
+
|
|
168
|
+
A `ValidationResult` gives you:
|
|
169
|
+
|
|
170
|
+
- `result.passed` — `True`/`False`
|
|
171
|
+
- `result.score` — percentage of checks passed (0–100)
|
|
172
|
+
- `result.failures` — only the failing checks (with sample failing values & indices)
|
|
173
|
+
- `result.summary()` / `to_markdown()` / `to_json()` / `to_html(path)`
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## Contributing
|
|
178
|
+
|
|
179
|
+
Contributions and feedback are very welcome — see [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
180
|
+
Found a bug or want a new check? [Open an issue](https://github.com/dgvj-work/dqscore/issues).
|
|
181
|
+
|
|
182
|
+
## License
|
|
183
|
+
|
|
184
|
+
[MIT](LICENSE)
|
dqscore-0.1.0/README.md
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# dqscore
|
|
2
|
+
|
|
3
|
+
> A lightweight **data quality toolkit for pandas** — profile any DataFrame, declare
|
|
4
|
+
> expectations with a fluent schema, or run a zero-config scan. No heavy dependencies,
|
|
5
|
+
> no config files required.
|
|
6
|
+
|
|
7
|
+
[CI status](https://github.com/dgvj-work/dqscore/actions/workflows/ci.yml)
|
|
8
|
+
[Python 3.8+](https://www.python.org/)
|
|
9
|
+
[License: MIT](LICENSE)
|
|
10
|
+
|
|
11
|
+
`dqscore` helps you catch the boring-but-costly data problems — nulls where there
|
|
12
|
+
shouldn't be any, duplicate keys, out-of-range values, malformed strings — before
|
|
13
|
+
they reach a model, a dashboard, or a stakeholder.
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
## Why this exists
|
|
17
|
+
Data quality issues are the silent killers of analytics and ML work. A null in the wrong column, a duplicate primary key, a value outside its expected range — these don't crash your pipeline. They quietly corrupt your output, and you find out three weeks later in a stakeholder meeting.
|
|
18
|
+
The Python ecosystem already has excellent tools for this. Great Expectations is comprehensive and battle-tested. Pandera offers powerful schema-based validation. ydata-profiling produces rich exploratory reports. If you're building a long-lived production data platform, those are the right answers.
|
|
19
|
+
But there's a gap in shape. When an analyst gets a fresh CSV and wants a fast read on whether it's trustworthy, the existing tools ask for a lot upfront — a schema, a config, a project structure, sometimes a framework integration. The lightest possible question — is this data OK? — doesn't have a one-line answer in any of them. And once you do set up checks, getting a single number you can put on a dashboard, or a non-zero exit code you can wire into CI, often needs custom code on top.
|
|
20
|
+
dqscore is built for that middle ground. It has one dependency (pandas) and three things to learn: profile a DataFrame, declare a schema with a fluent API, or run a zero-config scan that infers sensible defaults. Every validation produces a 0–100 quality score and a report that exports to HTML, Markdown, or JSON. The CLI returns exit code 1 on failure, so dqscore scan data.csv drops straight into a CI pipeline or a pre-commit hook with no glue code.
|
|
21
|
+
It's not a replacement for Great Expectations or pandera. It's the tool you reach for at the start of a project, or when reviewing a new dataset, or when you want a simple quality gate in CI without standing up a whole framework. That's the gap, and I think it's a useful one to fill — especially for individuals, smaller teams, and educators where the ceremony of heavier tools is the actual barrier to checking data at all.
|
|
22
|
+
The package is MIT-licensed and feedback is welcome. If a check is missing, a report format would be useful, or the auto-scan heuristics could be smarter for your data, open an issue.
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Why dqscore?
|
|
27
|
+
|
|
28
|
+
- **Tiny surface area.** Three things to learn: `profile`, `Schema`, `auto_scan`.
|
|
29
|
+
- **Readable reports.** Every result exports to dict, JSON, Markdown, or styled HTML.
|
|
30
|
+
- **Scoreable.** Each validation produces a 0–100 quality score for dashboards/CI.
|
|
31
|
+
- **CLI included.** `dqscore scan data.csv` returns a non-zero exit code on failure,
|
|
32
|
+
so it drops straight into a pipeline or pre-commit hook.
|
|
33
|
+
- **One dependency:** pandas.
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## Installation
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install dqscore
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Or install the latest from source:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
git clone https://github.com/dgvj-work/dqscore.git
|
|
47
|
+
cd dqscore
|
|
48
|
+
pip install -e ".[dev]"
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Quick start
|
|
54
|
+
|
|
55
|
+
### 1. Profile a DataFrame
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
import pandas as pd
|
|
59
|
+
import dqscore as dq
|
|
60
|
+
|
|
61
|
+
df = pd.read_csv("customers.csv")
|
|
62
|
+
profile = dq.profile(df)
|
|
63
|
+
|
|
64
|
+
print(profile.to_markdown()) # per-column stats
|
|
65
|
+
profile.to_html("profile.html")
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### 2. Validate against a schema
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
schema = dq.Schema("customers")
|
|
72
|
+
schema.column("id").not_null().unique()
|
|
73
|
+
schema.column("age").in_range(0, 120)
|
|
74
|
+
schema.column("email").matches(r"^[^@]+@[^@]+\.[^@]+$")
|
|
75
|
+
schema.column("country").in_set(["US", "CA", "MX"])
|
|
76
|
+
schema.no_duplicate_rows()
|
|
77
|
+
|
|
78
|
+
result = schema.validate(df)
|
|
79
|
+
|
|
80
|
+
print(result.summary()) # human-readable report
|
|
81
|
+
print("Quality score:", result.score)
|
|
82
|
+
result.to_html("dq_report.html")
|
|
83
|
+
|
|
84
|
+
if not result.passed:
|
|
85
|
+
raise SystemExit("Data quality checks failed")
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### 3. Zero-config scan
|
|
89
|
+
|
|
90
|
+
When you just want a quick read on a new file:
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
result = dq.auto_scan(df) # checks nulls, duplicate keys, duplicate rows
|
|
94
|
+
print(result.summary())
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## Command line
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
# Profile every column
|
|
103
|
+
dqscore profile data.csv --html profile.html
|
|
104
|
+
|
|
105
|
+
# Quick quality scan (exit code 1 if it fails — great for CI)
|
|
106
|
+
dqscore scan data.csv --json report.json
|
|
107
|
+
dqscore scan data.csv --max-null-pct 5
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
---
|
|
111
|
+
|
|
112
|
+
## Available checks
|
|
113
|
+
|
|
114
|
+
| Method | Fails when… |
|
|
115
|
+
| ----------------------------------- | -------------------------------------------- |
|
|
116
|
+
| `not_null()` | value is null / NaN / NaT |
|
|
117
|
+
| `unique()` | a non-null value occurs more than once |
|
|
118
|
+
| `in_range(min, max, inclusive)` | numeric value is outside the bounds |
|
|
119
|
+
| `in_set([...])` | value is not one of the allowed values |
|
|
120
|
+
| `matches(pattern, full_match)` | string does not match the regex |
|
|
121
|
+
| `is_numeric()` / `is_integer()` | value can't be parsed as a number / integer |
|
|
122
|
+
| `is_datetime(fmt)` | value can't be parsed as a date/time |
|
|
123
|
+
| `string_length(min_len, max_len)` | string length is out of bounds |
|
|
124
|
+
| `custom(fn, name)` | your function returns `True` for a row |
|
|
125
|
+
| `Schema.no_duplicate_rows(subset)` | rows are exact duplicates |
|
|
126
|
+
|
|
127
|
+
Checks chain on a column and most let nulls pass, so `not_null()` stays the single
|
|
128
|
+
source of truth for missing values:
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
schema.column("score").not_null().is_numeric().in_range(0, 100)
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## Reports & scoring
|
|
137
|
+
|
|
138
|
+
A `ValidationResult` gives you:
|
|
139
|
+
|
|
140
|
+
- `result.passed` — `True`/`False`
|
|
141
|
+
- `result.score` — percentage of checks passed (0–100)
|
|
142
|
+
- `result.failures` — only the failing checks (with sample failing values & indices)
|
|
143
|
+
- `result.summary()` / `to_markdown()` / `to_json()` / `to_html(path)`
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## Contributing
|
|
148
|
+
|
|
149
|
+
Contributions and feedback are very welcome — see [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
150
|
+
Found a bug or want a new check? [Open an issue](https://github.com/dgvj-work/dqscore/issues).
|
|
151
|
+
|
|
152
|
+
## License
|
|
153
|
+
|
|
154
|
+
[MIT](LICENSE)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "dqscore"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A lightweight data quality toolkit for pandas: profiling, validation schemas, and a zero-config scan."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Digvijay Waghela" }]
|
|
13
|
+
keywords = ["data-quality", "pandas", "validation", "data-profiling", "etl", "dataframe"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.8",
|
|
20
|
+
"Programming Language :: Python :: 3.9",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Topic :: Scientific/Engineering",
|
|
25
|
+
"Topic :: Software Development :: Quality Assurance",
|
|
26
|
+
]
|
|
27
|
+
dependencies = ["pandas>=1.3"]
|
|
28
|
+
|
|
29
|
+
[project.urls]
|
|
30
|
+
Homepage = "https://github.com/dgvj-work/dqscore"
|
|
31
|
+
Repository = "https://github.com/dgvj-work/dqscore"
|
|
32
|
+
Issues = "https://github.com/dgvj-work/dqscore/issues"
|
|
33
|
+
|
|
34
|
+
[project.optional-dependencies]
|
|
35
|
+
dev = ["pytest>=7.0", "pytest-cov"]
|
|
36
|
+
|
|
37
|
+
[project.scripts]
|
|
38
|
+
dqscore = "dqscore.cli:main"
|
|
39
|
+
|
|
40
|
+
[tool.setuptools.packages.find]
|
|
41
|
+
where = ["src"]
|
|
42
|
+
|
|
43
|
+
[tool.setuptools.package-data]
|
|
44
|
+
dqscore = ["py.typed"]
|
|
45
|
+
|
|
46
|
+
[tool.pytest.ini_options]
|
|
47
|
+
testpaths = ["tests"]
|
dqscore-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""dqscore — a lightweight data quality toolkit for pandas.
|
|
2
|
+
|
|
3
|
+
Quick start
|
|
4
|
+
-----------
|
|
5
|
+
>>> import pandas as pd
|
|
6
|
+
>>> import dqscore as dq
|
|
7
|
+
>>> df = pd.DataFrame({"id": [1, 2, 2], "age": [30, -1, 41]})
|
|
8
|
+
>>> result = dq.auto_scan(df)
|
|
9
|
+
>>> result.passed
|
|
10
|
+
False
|
|
11
|
+
|
|
12
|
+
Declare expectations explicitly with a :class:`~dqscore.Schema`::
|
|
13
|
+
|
|
14
|
+
schema = dq.Schema("people")
|
|
15
|
+
schema.column("id").not_null().unique()
|
|
16
|
+
schema.column("age").in_range(0, 120)
|
|
17
|
+
report = schema.validate(df)
|
|
18
|
+
print(report.summary())
|
|
19
|
+
"""
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
from . import checks
|
|
23
|
+
from .autoscan import auto_scan
|
|
24
|
+
from .profiling import Profile, profile
|
|
25
|
+
from .report import CheckResult, ValidationResult
|
|
26
|
+
from .validator import ColumnSchema, Schema
|
|
27
|
+
|
|
28
|
+
__version__ = "0.1.0"
|
|
29
|
+
|
|
30
|
+
__all__ = [
|
|
31
|
+
"Schema",
|
|
32
|
+
"ColumnSchema",
|
|
33
|
+
"profile",
|
|
34
|
+
"Profile",
|
|
35
|
+
"auto_scan",
|
|
36
|
+
"ValidationResult",
|
|
37
|
+
"CheckResult",
|
|
38
|
+
"checks",
|
|
39
|
+
"__version__",
|
|
40
|
+
]
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Zero-config quality scan: infer sensible default checks for any DataFrame."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from .report import ValidationResult
|
|
9
|
+
from .validator import Schema
|
|
10
|
+
|
|
11
|
+
__all__ = ["auto_scan"]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _looks_like_id(name: str) -> bool:
|
|
15
|
+
lowered = str(name).lower()
|
|
16
|
+
return lowered == "id" or lowered.endswith("_id") or lowered.endswith("id")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def auto_scan(
    df: pd.DataFrame,
    max_null_pct: float = 0.0,
    name: str = "auto_scan",
) -> ValidationResult:
    """Build a default schema for ``df`` and validate the frame against it.

    The schema is assembled column by column:

    * a ``not_null`` check is registered for every column whose fraction of
      missing values is strictly greater than ``max_null_pct / 100``;
    * a ``unique`` check is registered for every column whose label is
      accepted by ``_looks_like_id``;
    * a frame-level ``no_duplicate_rows`` check is always registered.

    Parameters
    ----------
    df:
        The DataFrame to scan.
    max_null_pct:
        Tolerated percentage of missing values per column. The default of
        ``0.0`` means a single null is enough to register the null check.
    name:
        Name given to the generated schema.

    Returns
    -------
    ValidationResult
        Whatever ``Schema.validate`` returns for the generated schema.

    Raises
    ------
    TypeError
        If ``df`` is not a pandas DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("auto_scan() expects a pandas DataFrame")

    has_rows = len(df) > 0
    allowed_fraction = max_null_pct / 100.0
    schema = Schema(name)

    for label in df.columns:
        # An empty frame has no meaningful null ratio; treat it as zero.
        if has_rows:
            missing_fraction = df[label].isna().mean()
        else:
            missing_fraction = 0.0

        if missing_fraction > allowed_fraction:
            # Too many gaps: let the not_null check report them.
            schema.column(label).not_null()

        if _looks_like_id(label):
            # Key-like columns should not repeat values.
            schema.column(label).unique()

    schema.no_duplicate_rows()
    return schema.validate(df)
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""Low-level data quality checks.
|
|
2
|
+
|
|
3
|
+
Every check takes a :class:`pandas.Series` (or DataFrame, for frame-level
|
|
4
|
+
checks) and returns a boolean mask aligned to the input where ``True`` marks a
|
|
5
|
+
*failing* row. Null handling is deliberate: most checks let nulls pass so that
|
|
6
|
+
``not_null`` is the single source of truth for missing values. Combine checks to
|
|
7
|
+
express richer expectations.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
from typing import Any, Iterable, Optional
|
|
13
|
+
|
|
14
|
+
import pandas as pd
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"not_null",
|
|
18
|
+
"unique",
|
|
19
|
+
"in_range",
|
|
20
|
+
"in_set",
|
|
21
|
+
"matches",
|
|
22
|
+
"is_numeric",
|
|
23
|
+
"is_integer",
|
|
24
|
+
"is_datetime",
|
|
25
|
+
"string_length",
|
|
26
|
+
"no_duplicate_rows",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _as_bool_mask(mask: pd.Series, index: pd.Index) -> pd.Series:
|
|
31
|
+
"""Coerce a mask to a clean boolean Series aligned to ``index``."""
|
|
32
|
+
return pd.Series(mask, index=index).fillna(False).astype(bool)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def not_null(series: pd.Series) -> pd.Series:
    """Mark every missing entry of ``series`` as a failure.

    Returns the result of ``series.isna()``: ``True`` where the value is
    missing, ``False`` elsewhere.
    """
    missing = series.isna()
    return missing
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def unique(series: pd.Series) -> pd.Series:
    """Mark every non-null value that occurs more than once as a failure.

    All occurrences of a repeated value are flagged (``keep=False``), not just
    the later ones. Missing values are never flagged here.
    """
    repeated = series.duplicated(keep=False)
    present = series.notna()
    return _as_bool_mask(repeated & present, series.index)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def in_range(
    series: pd.Series,
    min_value: Optional[float] = None,
    max_value: Optional[float] = None,
    inclusive: bool = True,
) -> pd.Series:
    """Mark values that fall outside the given bounds as failures.

    Values are first converted with ``pd.to_numeric(..., errors="coerce")``.
    A bound set to ``None`` is not enforced. With ``inclusive=True`` a value
    equal to a bound passes; with ``inclusive=False`` it fails.

    Non-null values that cannot be converted to a number fail as well.
    Missing values pass.
    """
    values = pd.to_numeric(series, errors="coerce")
    out_of_bounds = pd.Series(False, index=series.index)

    if min_value is not None:
        if inclusive:
            out_of_bounds |= values < min_value
        else:
            out_of_bounds |= values <= min_value

    if max_value is not None:
        if inclusive:
            out_of_bounds |= values > max_value
        else:
            out_of_bounds |= values >= max_value

    # Present in the input but lost during numeric conversion.
    unparseable = values.isna() & series.notna()
    out_of_bounds |= unparseable

    return _as_bool_mask(out_of_bounds, series.index)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def in_set(series: pd.Series, allowed: Iterable[Any]) -> pd.Series:
    """Mark non-null values that are not members of ``allowed`` as failures.

    ``allowed`` is materialised into a ``set`` before the membership test.
    Missing values pass.
    """
    permitted = set(allowed)
    disallowed = ~series.isin(permitted)
    present = series.notna()
    return _as_bool_mask(disallowed & present, series.index)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def matches(series: pd.Series, pattern: str, full_match: bool = False) -> pd.Series:
    """Mark non-null values whose string form does not match ``pattern``.

    Each value is converted with ``str`` before matching. With
    ``full_match=True`` the whole string must match (``fullmatch``);
    otherwise a match anywhere in the string is enough (``search``).
    Missing values pass.
    """
    regex = re.compile(pattern)
    if full_match:
        probe = regex.fullmatch
    else:
        probe = regex.search

    def _is_mismatch(cell: Any) -> bool:
        # Missing values are left to the not_null check.
        if not pd.isna(cell):
            return probe(str(cell)) is None
        return False

    verdicts = series.map(_is_mismatch)
    return _as_bool_mask(verdicts, series.index)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def is_numeric(series: pd.Series) -> pd.Series:
    """Mark non-null values that ``pd.to_numeric`` cannot convert as failures.

    A value fails when it is present in ``series`` but becomes missing after
    conversion with ``errors="coerce"``. Missing values pass.
    """
    parsed = pd.to_numeric(series, errors="coerce")
    unparseable = parsed.isna() & series.notna()
    return _as_bool_mask(unparseable, series.index)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def is_integer(series: pd.Series) -> pd.Series:
    """Mark non-null values that are not whole numbers as failures.

    A value fails either because it cannot be converted by ``pd.to_numeric``
    or because the converted number leaves a non-zero remainder modulo 1.
    Missing values pass.
    """
    parsed = pd.to_numeric(series, errors="coerce")
    # Present in the input but not convertible to a number.
    unparseable = parsed.isna() & series.notna()
    # Converted successfully but not a whole number.
    fractional = parsed.notna() & (parsed % 1 != 0)
    return _as_bool_mask(unparseable | fractional, series.index)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def is_datetime(series: pd.Series, fmt: Optional[str] = None) -> pd.Series:
    """Mark non-null values that ``pd.to_datetime`` cannot parse as failures.

    ``fmt`` is forwarded as the ``format`` argument of ``pd.to_datetime``.
    A value fails when it is present in ``series`` but becomes missing after
    parsing with ``errors="coerce"``. Missing values pass.
    """
    parsed = pd.to_datetime(series, errors="coerce", format=fmt)
    unparseable = parsed.isna() & series.notna()
    return _as_bool_mask(unparseable, series.index)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def string_length(
    series: pd.Series,
    min_len: Optional[int] = None,
    max_len: Optional[int] = None,
) -> pd.Series:
    """Mark non-null values whose string length is outside the bounds.

    Each value is converted with ``astype(str)`` and its length compared
    against ``min_len`` / ``max_len`` (both inclusive). A bound set to
    ``None`` is not enforced. Missing values pass.

    The comparison works on whole-series masks that share ``series.index``
    rather than on label-based ``.loc`` assignment, so a Series whose index
    contains repeated labels (for example after ``pd.concat``) is handled
    row by row.

    Parameters
    ----------
    series:
        The values to check.
    min_len:
        Smallest accepted length, or ``None`` for no lower bound.
    max_len:
        Largest accepted length, or ``None`` for no upper bound.

    Returns
    -------
    pandas.Series
        Boolean mask aligned to ``series``; ``True`` marks a failing row.
    """
    present = series.notna()
    # Lengths are computed for every row; rows that were missing are masked
    # out with ``present`` below, so their placeholder text never counts.
    lengths = series.astype(str).str.len()

    fail = pd.Series(False, index=series.index)
    if min_len is not None:
        fail |= present & (lengths < min_len)
    if max_len is not None:
        fail |= present & (lengths > max_len)
    return _as_bool_mask(fail, series.index)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def no_duplicate_rows(
    df: pd.DataFrame, subset: Optional[Iterable[str]] = None
) -> pd.Series:
    """Mark rows of ``df`` that are duplicated as failures.

    When ``subset`` is given, only those columns are compared; otherwise all
    columns are. Every member of a duplicate group is flagged
    (``keep=False``), including the first occurrence.
    """
    if subset is None:
        columns = None
    else:
        # Materialise so any iterable (generator, tuple, ...) is accepted.
        columns = list(subset)
    repeated = df.duplicated(subset=columns, keep=False)
    return _as_bool_mask(repeated, df.index)
|