dataruff 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataruff-0.1.0/PKG-INFO +235 -0
- dataruff-0.1.0/README.md +205 -0
- dataruff-0.1.0/datadoctor/__init__.py +36 -0
- dataruff-0.1.0/datadoctor/_compat.py +40 -0
- dataruff-0.1.0/datadoctor/analyzers/__init__.py +19 -0
- dataruff-0.1.0/datadoctor/analyzers/drift_analyzer.py +68 -0
- dataruff-0.1.0/datadoctor/analyzers/duplicate.py +33 -0
- dataruff-0.1.0/datadoctor/analyzers/format_analyzer.py +91 -0
- dataruff-0.1.0/datadoctor/analyzers/null_analyzer.py +52 -0
- dataruff-0.1.0/datadoctor/analyzers/outlier.py +73 -0
- dataruff-0.1.0/datadoctor/analyzers/pii_analyzer.py +58 -0
- dataruff-0.1.0/datadoctor/analyzers/type_analyzer.py +51 -0
- dataruff-0.1.0/datadoctor/anomalies.py +52 -0
- dataruff-0.1.0/datadoctor/audit.py +19 -0
- dataruff-0.1.0/datadoctor/cli.py +115 -0
- dataruff-0.1.0/datadoctor/compare.py +50 -0
- dataruff-0.1.0/datadoctor/drift.py +20 -0
- dataruff-0.1.0/datadoctor/fix.py +14 -0
- dataruff-0.1.0/datadoctor/fixing/__init__.py +3 -0
- dataruff-0.1.0/datadoctor/fixing/engine.py +83 -0
- dataruff-0.1.0/datadoctor/investigate.py +40 -0
- dataruff-0.1.0/datadoctor/loader.py +26 -0
- dataruff-0.1.0/datadoctor/models.py +89 -0
- dataruff-0.1.0/datadoctor/pii.py +69 -0
- dataruff-0.1.0/datadoctor/reporting/__init__.py +10 -0
- dataruff-0.1.0/datadoctor/reporting/json_reporter.py +29 -0
- dataruff-0.1.0/datadoctor/reporting/terminal.py +111 -0
- dataruff-0.1.0/datadoctor/score.py +17 -0
- dataruff-0.1.0/datadoctor/scoring/__init__.py +3 -0
- dataruff-0.1.0/datadoctor/scoring/engine.py +128 -0
- dataruff-0.1.0/datadoctor/validate.py +107 -0
- dataruff-0.1.0/dataruff.egg-info/PKG-INFO +235 -0
- dataruff-0.1.0/dataruff.egg-info/SOURCES.txt +49 -0
- dataruff-0.1.0/dataruff.egg-info/dependency_links.txt +1 -0
- dataruff-0.1.0/dataruff.egg-info/entry_points.txt +2 -0
- dataruff-0.1.0/dataruff.egg-info/requires.txt +14 -0
- dataruff-0.1.0/dataruff.egg-info/top_level.txt +1 -0
- dataruff-0.1.0/pyproject.toml +59 -0
- dataruff-0.1.0/setup.cfg +4 -0
- dataruff-0.1.0/tests/test_anomalies.py +97 -0
- dataruff-0.1.0/tests/test_audit.py +48 -0
- dataruff-0.1.0/tests/test_cli.py +103 -0
- dataruff-0.1.0/tests/test_compare.py +108 -0
- dataruff-0.1.0/tests/test_drift.py +93 -0
- dataruff-0.1.0/tests/test_fix.py +146 -0
- dataruff-0.1.0/tests/test_investigate.py +86 -0
- dataruff-0.1.0/tests/test_loader.py +46 -0
- dataruff-0.1.0/tests/test_pii.py +180 -0
- dataruff-0.1.0/tests/test_reporting.py +175 -0
- dataruff-0.1.0/tests/test_score.py +95 -0
- dataruff-0.1.0/tests/test_validate.py +116 -0
dataruff-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dataruff
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: One-command dataset health diagnostics — the Ruff of datasets.
|
|
5
|
+
Author: dataruff contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Keywords: data quality,pandas,csv,data validation,EDA,data science
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
Requires-Dist: pandas>=2.0
|
|
19
|
+
Requires-Dist: numpy>=1.24
|
|
20
|
+
Requires-Dist: scipy>=1.10
|
|
21
|
+
Requires-Dist: scikit-learn>=1.3
|
|
22
|
+
Requires-Dist: openpyxl>=3.1
|
|
23
|
+
Requires-Dist: python-dateutil>=2.8
|
|
24
|
+
Provides-Extra: rich
|
|
25
|
+
Requires-Dist: rich>=13.0; extra == "rich"
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7.4; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest-cov>=4.1; extra == "dev"
|
|
29
|
+
Requires-Dist: rich>=13.0; extra == "dev"
|
|
30
|
+
|
|
31
|
+
# dataruff
|
|
32
|
+
|
|
33
|
+
[CI](https://github.com/AryanPatankar27/dataruff/actions/workflows/ci.yml)
|
|
34
|
+
[Coverage](https://codecov.io/gh/AryanPatankar27/dataruff)
|
|
35
|
+
[PyPI version](https://pypi.org/project/dataruff/)
|
|
36
|
+
[Python versions](https://pypi.org/project/dataruff/)
|
|
37
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
38
|
+
|
|
39
|
+
**The Ruff of datasets.** One command to discover, explain, score, and fix data quality problems in pandas DataFrames and CSV/Excel files. The distribution is installed as `dataruff`; the Python package you import is `datadoctor`.
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from datadoctor import audit
|
|
43
|
+
|
|
44
|
+
audit(df)
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
```
|
|
48
|
+
Data Quality Score: 81/100
|
|
49
|
+
|
|
50
|
+
Issues Found (5):
|
|
51
|
+
! 42 duplicate rows
|
|
52
|
+
~ 13 invalid email (column: email)
|
|
53
|
+
! 3 empty columns
|
|
54
|
+
~ 7 outlier (column: salary)
|
|
55
|
+
. 2 inconsistent date format (column: created_at)
|
|
56
|
+
|
|
57
|
+
Rows: 10,000 | Columns: 12
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## Install
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
pip install dataruff
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Optionally install [rich](https://github.com/Textualize/rich) for prettier terminal output:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
pip install dataruff[rich]
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## Quick start
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
import pandas as pd
|
|
80
|
+
from datadoctor import audit, fix, score, validate, detect_pii
|
|
81
|
+
|
|
82
|
+
df = pd.read_csv("customers.csv")
|
|
83
|
+
|
|
84
|
+
# Full health report
|
|
85
|
+
audit(df)
|
|
86
|
+
|
|
87
|
+
# Get numeric score
|
|
88
|
+
s = score(df)
|
|
89
|
+
print(s.overall) # 81
|
|
90
|
+
print(s.to_dict()) # {'overall': 81, 'completeness': 92, ...}
|
|
91
|
+
|
|
92
|
+
# Auto-fix common issues
|
|
93
|
+
clean_df = fix(df)
|
|
94
|
+
|
|
95
|
+
# Validate against a schema
|
|
96
|
+
result = validate(df, schema={
|
|
97
|
+
"email": "email",
|
|
98
|
+
"age": "0-120",
|
|
99
|
+
"id": "unique",
|
|
100
|
+
})
|
|
101
|
+
|
|
102
|
+
# PII detection
|
|
103
|
+
report = detect_pii(df)
|
|
104
|
+
print(report.columns_with_pii)
|
|
105
|
+
# {'email': ['email'], 'phone': ['phone'], 'uid': ['aadhaar']}
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## API reference
|
|
111
|
+
|
|
112
|
+
| Function | Description | Returns |
|
|
113
|
+
|---|---|---|
|
|
114
|
+
| `audit(df)` | Print full health report | `InvestigationReport` |
|
|
115
|
+
| `investigate(df)` | Structured issue breakdown | `InvestigationReport` |
|
|
116
|
+
| `score(df)` | Data quality score | `ScoreBreakdown` |
|
|
117
|
+
| `fix(df)` | Auto-repair common issues | `pd.DataFrame` |
|
|
118
|
+
| `validate(df, schema)` | Check schema constraints | `dict` |
|
|
119
|
+
| `compare(old, new)` | Diff two datasets | `ComparisonReport` |
|
|
120
|
+
| `detect_pii(df)` | Find PII columns | `PIIReport` |
|
|
121
|
+
| `mask_pii(df)` | Redact PII values | `pd.DataFrame` |
|
|
122
|
+
| `detect_drift(old, new)` | Distribution drift analysis | `DriftReport` |
|
|
123
|
+
| `find_anomalies(df)` | Anomaly / outlier detection | `dict` |
|
|
124
|
+
|
|
125
|
+
All functions accept a **DataFrame, CSV path, or XLSX path** as input.
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
## Scoring formula
|
|
130
|
+
|
|
131
|
+
| Dimension | Weight | Measures |
|
|
132
|
+
|---|---|---|
|
|
133
|
+
| Completeness | 25% | Non-null ratio across all cells |
|
|
134
|
+
| Validity | 25% | Format correctness (emails, dates, types) |
|
|
135
|
+
| Consistency | 20% | Uniform types and formats per column |
|
|
136
|
+
| Uniqueness | 20% | Absence of duplicate rows |
|
|
137
|
+
| Schema compliance | 10% | Adherence to user-provided schema |
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
## `fix()` — what gets repaired
|
|
142
|
+
|
|
143
|
+
| Issue | Fix applied |
|
|
144
|
+
|---|---|
|
|
145
|
+
| Duplicate rows | Removed |
|
|
146
|
+
| Leading/trailing whitespace | Stripped |
|
|
147
|
+
| Boolean strings (`yes/no/true/false`) | Converted to `bool` |
|
|
148
|
+
| Mixed date formats | Normalized to `YYYY-MM-DD` |
|
|
149
|
+
| Missing numeric values | Filled with column median |
|
|
150
|
+
| Missing string values | Filled with column mode |
|
|
151
|
+
|
|
152
|
+
---
|
|
153
|
+
|
|
154
|
+
## `validate()` — schema rules
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
validate(df, schema={
|
|
158
|
+
"email": "email", # valid email format
|
|
159
|
+
"age": "0-120", # numeric range
|
|
160
|
+
"user_id": "unique", # no duplicates
|
|
161
|
+
"price": "positive", # > 0
|
|
162
|
+
"code": "not_null", # no missing values
|
|
163
|
+
"ref": "regex:[A-Z]{3}", # custom regex
|
|
164
|
+
})
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
## `detect_pii()` — supported PII types
|
|
170
|
+
|
|
171
|
+
| Type | Example |
|
|
172
|
+
|---|---|
|
|
173
|
+
| `email` | `alice@example.com` |
|
|
174
|
+
| `phone` | `9876543210` |
|
|
175
|
+
| `aadhaar` | `2345 6789 0123` |
|
|
176
|
+
| `pan` | `ABCDE1234F` |
|
|
177
|
+
| `ssn` | `123-45-6789` |
|
|
178
|
+
| `credit_card` | `4111 1111 1111 1111` |
|
|
179
|
+
|
|
180
|
+
---
|
|
181
|
+
|
|
182
|
+
## CLI
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
# Audit a CSV file
|
|
186
|
+
dataruff audit customers.csv
|
|
187
|
+
|
|
188
|
+
# Output as JSON
|
|
189
|
+
dataruff audit customers.csv --json
|
|
190
|
+
|
|
191
|
+
# Fix issues and write cleaned file
|
|
192
|
+
dataruff fix customers.csv
|
|
193
|
+
# -> customers_clean.csv
|
|
194
|
+
|
|
195
|
+
# Compare two datasets
|
|
196
|
+
dataruff compare old.csv new.csv
|
|
197
|
+
|
|
198
|
+
# Data quality score
|
|
199
|
+
dataruff score customers.csv
|
|
200
|
+
|
|
201
|
+
# PII detection
|
|
202
|
+
dataruff detect-pii customers.csv
|
|
203
|
+
|
|
204
|
+
# Mask PII
|
|
205
|
+
dataruff mask-pii customers.csv
|
|
206
|
+
# -> customers_masked.csv
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
---
|
|
210
|
+
|
|
211
|
+
## Architecture
|
|
212
|
+
|
|
213
|
+
```
|
|
214
|
+
datadoctor/
|
|
215
|
+
├── analyzers/ # DuplicateAnalyzer, NullAnalyzer, TypeAnalyzer,
|
|
216
|
+
│ # FormatAnalyzer, OutlierAnalyzer, PIIAnalyzer, DriftAnalyzer
|
|
217
|
+
├── scoring/ # Weighted scoring engine
|
|
218
|
+
├── fixing/ # Auto-remediation rules
|
|
219
|
+
└── reporting/ # Terminal (rich + plain fallback) and JSON output
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
No LLMs. No API calls. Everything deterministic and offline.
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## Requirements
|
|
227
|
+
|
|
228
|
+
- Python 3.10+
|
|
229
|
+
- pandas, numpy, scipy, scikit-learn, openpyxl, python-dateutil
|
|
230
|
+
|
|
231
|
+
---
|
|
232
|
+
|
|
233
|
+
## License
|
|
234
|
+
|
|
235
|
+
MIT
|
dataruff-0.1.0/README.md
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
# dataruff
|
|
2
|
+
|
|
3
|
+
[CI](https://github.com/AryanPatankar27/dataruff/actions/workflows/ci.yml)
|
|
4
|
+
[Coverage](https://codecov.io/gh/AryanPatankar27/dataruff)
|
|
5
|
+
[PyPI version](https://pypi.org/project/dataruff/)
|
|
6
|
+
[Python versions](https://pypi.org/project/dataruff/)
|
|
7
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
8
|
+
|
|
9
|
+
**The Ruff of datasets.** One command to discover, explain, score, and fix data quality problems in pandas DataFrames and CSV/Excel files. The distribution is installed as `dataruff`; the Python package you import is `datadoctor`.
|
|
10
|
+
|
|
11
|
+
```python
|
|
12
|
+
from datadoctor import audit
|
|
13
|
+
|
|
14
|
+
audit(df)
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
```
|
|
18
|
+
Data Quality Score: 81/100
|
|
19
|
+
|
|
20
|
+
Issues Found (5):
|
|
21
|
+
! 42 duplicate rows
|
|
22
|
+
~ 13 invalid email (column: email)
|
|
23
|
+
! 3 empty columns
|
|
24
|
+
~ 7 outlier (column: salary)
|
|
25
|
+
. 2 inconsistent date format (column: created_at)
|
|
26
|
+
|
|
27
|
+
Rows: 10,000 | Columns: 12
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## Install
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install dataruff
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Optionally install [rich](https://github.com/Textualize/rich) for prettier terminal output:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install dataruff[rich]
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## Quick start
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
import pandas as pd
|
|
50
|
+
from datadoctor import audit, fix, score, validate, detect_pii
|
|
51
|
+
|
|
52
|
+
df = pd.read_csv("customers.csv")
|
|
53
|
+
|
|
54
|
+
# Full health report
|
|
55
|
+
audit(df)
|
|
56
|
+
|
|
57
|
+
# Get numeric score
|
|
58
|
+
s = score(df)
|
|
59
|
+
print(s.overall) # 81
|
|
60
|
+
print(s.to_dict()) # {'overall': 81, 'completeness': 92, ...}
|
|
61
|
+
|
|
62
|
+
# Auto-fix common issues
|
|
63
|
+
clean_df = fix(df)
|
|
64
|
+
|
|
65
|
+
# Validate against a schema
|
|
66
|
+
result = validate(df, schema={
|
|
67
|
+
"email": "email",
|
|
68
|
+
"age": "0-120",
|
|
69
|
+
"id": "unique",
|
|
70
|
+
})
|
|
71
|
+
|
|
72
|
+
# PII detection
|
|
73
|
+
report = detect_pii(df)
|
|
74
|
+
print(report.columns_with_pii)
|
|
75
|
+
# {'email': ['email'], 'phone': ['phone'], 'uid': ['aadhaar']}
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
## API reference
|
|
81
|
+
|
|
82
|
+
| Function | Description | Returns |
|
|
83
|
+
|---|---|---|
|
|
84
|
+
| `audit(df)` | Print full health report | `InvestigationReport` |
|
|
85
|
+
| `investigate(df)` | Structured issue breakdown | `InvestigationReport` |
|
|
86
|
+
| `score(df)` | Data quality score | `ScoreBreakdown` |
|
|
87
|
+
| `fix(df)` | Auto-repair common issues | `pd.DataFrame` |
|
|
88
|
+
| `validate(df, schema)` | Check schema constraints | `dict` |
|
|
89
|
+
| `compare(old, new)` | Diff two datasets | `ComparisonReport` |
|
|
90
|
+
| `detect_pii(df)` | Find PII columns | `PIIReport` |
|
|
91
|
+
| `mask_pii(df)` | Redact PII values | `pd.DataFrame` |
|
|
92
|
+
| `detect_drift(old, new)` | Distribution drift analysis | `DriftReport` |
|
|
93
|
+
| `find_anomalies(df)` | Anomaly / outlier detection | `dict` |
|
|
94
|
+
|
|
95
|
+
All functions accept a **DataFrame, CSV path, or XLSX path** as input.
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## Scoring formula
|
|
100
|
+
|
|
101
|
+
| Dimension | Weight | Measures |
|
|
102
|
+
|---|---|---|
|
|
103
|
+
| Completeness | 25% | Non-null ratio across all cells |
|
|
104
|
+
| Validity | 25% | Format correctness (emails, dates, types) |
|
|
105
|
+
| Consistency | 20% | Uniform types and formats per column |
|
|
106
|
+
| Uniqueness | 20% | Absence of duplicate rows |
|
|
107
|
+
| Schema compliance | 10% | Adherence to user-provided schema |
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## `fix()` — what gets repaired
|
|
112
|
+
|
|
113
|
+
| Issue | Fix applied |
|
|
114
|
+
|---|---|
|
|
115
|
+
| Duplicate rows | Removed |
|
|
116
|
+
| Leading/trailing whitespace | Stripped |
|
|
117
|
+
| Boolean strings (`yes/no/true/false`) | Converted to `bool` |
|
|
118
|
+
| Mixed date formats | Normalized to `YYYY-MM-DD` |
|
|
119
|
+
| Missing numeric values | Filled with column median |
|
|
120
|
+
| Missing string values | Filled with column mode |
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## `validate()` — schema rules
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
validate(df, schema={
|
|
128
|
+
"email": "email", # valid email format
|
|
129
|
+
"age": "0-120", # numeric range
|
|
130
|
+
"user_id": "unique", # no duplicates
|
|
131
|
+
"price": "positive", # > 0
|
|
132
|
+
"code": "not_null", # no missing values
|
|
133
|
+
"ref": "regex:[A-Z]{3}", # custom regex
|
|
134
|
+
})
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
## `detect_pii()` — supported PII types
|
|
140
|
+
|
|
141
|
+
| Type | Example |
|
|
142
|
+
|---|---|
|
|
143
|
+
| `email` | `alice@example.com` |
|
|
144
|
+
| `phone` | `9876543210` |
|
|
145
|
+
| `aadhaar` | `2345 6789 0123` |
|
|
146
|
+
| `pan` | `ABCDE1234F` |
|
|
147
|
+
| `ssn` | `123-45-6789` |
|
|
148
|
+
| `credit_card` | `4111 1111 1111 1111` |
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## CLI
|
|
153
|
+
|
|
154
|
+
```bash
|
|
155
|
+
# Audit a CSV file
|
|
156
|
+
dataruff audit customers.csv
|
|
157
|
+
|
|
158
|
+
# Output as JSON
|
|
159
|
+
dataruff audit customers.csv --json
|
|
160
|
+
|
|
161
|
+
# Fix issues and write cleaned file
|
|
162
|
+
dataruff fix customers.csv
|
|
163
|
+
# -> customers_clean.csv
|
|
164
|
+
|
|
165
|
+
# Compare two datasets
|
|
166
|
+
dataruff compare old.csv new.csv
|
|
167
|
+
|
|
168
|
+
# Data quality score
|
|
169
|
+
dataruff score customers.csv
|
|
170
|
+
|
|
171
|
+
# PII detection
|
|
172
|
+
dataruff detect-pii customers.csv
|
|
173
|
+
|
|
174
|
+
# Mask PII
|
|
175
|
+
dataruff mask-pii customers.csv
|
|
176
|
+
# -> customers_masked.csv
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
---
|
|
180
|
+
|
|
181
|
+
## Architecture
|
|
182
|
+
|
|
183
|
+
```
|
|
184
|
+
datadoctor/
|
|
185
|
+
├── analyzers/ # DuplicateAnalyzer, NullAnalyzer, TypeAnalyzer,
|
|
186
|
+
│ # FormatAnalyzer, OutlierAnalyzer, PIIAnalyzer, DriftAnalyzer
|
|
187
|
+
├── scoring/ # Weighted scoring engine
|
|
188
|
+
├── fixing/ # Auto-remediation rules
|
|
189
|
+
└── reporting/ # Terminal (rich + plain fallback) and JSON output
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
No LLMs. No API calls. Everything deterministic and offline.
|
|
193
|
+
|
|
194
|
+
---
|
|
195
|
+
|
|
196
|
+
## Requirements
|
|
197
|
+
|
|
198
|
+
- Python 3.10+
|
|
199
|
+
- pandas, numpy, scipy, scikit-learn, openpyxl, python-dateutil
|
|
200
|
+
|
|
201
|
+
---
|
|
202
|
+
|
|
203
|
+
## License
|
|
204
|
+
|
|
205
|
+
MIT
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""
|
|
2
|
+
dataruff — One-command dataset health diagnostics.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
from datadoctor import audit, fix, score, detect_pii
|
|
6
|
+
|
|
7
|
+
audit(df) # Print quality report
|
|
8
|
+
fix(df) # Return cleaned DataFrame
|
|
9
|
+
score(df) # Return ScoreBreakdown
|
|
10
|
+
detect_pii(df) # Return PIIReport
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from datadoctor.audit import audit
|
|
14
|
+
from datadoctor.investigate import investigate
|
|
15
|
+
from datadoctor.fix import fix
|
|
16
|
+
from datadoctor.validate import validate
|
|
17
|
+
from datadoctor.compare import compare
|
|
18
|
+
from datadoctor.pii import detect_pii, mask_pii
|
|
19
|
+
from datadoctor.drift import detect_drift
|
|
20
|
+
from datadoctor.anomalies import find_anomalies
|
|
21
|
+
from datadoctor.score import score
|
|
22
|
+
|
|
23
|
+
__version__ = "0.1.0"

# Public API: the names exported by ``from datadoctor import *``.
__all__ = [
    "audit", "investigate", "fix", "validate", "compare",
    "detect_pii", "mask_pii", "detect_drift", "find_anomalies", "score",
]
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pandas 2.x / 3.x dtype compatibility.
|
|
3
|
+
|
|
4
|
+
In pandas 2.x string columns have dtype ``object`` (dtype.name == 'object').
|
|
5
|
+
In pandas 3.x (infer_string=True by default) they have a ``StringDtype``
|
|
6
|
+
instance whose repr shows as ``dtype: str`` (dtype.name may be 'str',
|
|
7
|
+
'string', or 'string[python]' depending on the sub-release).
|
|
8
|
+
|
|
9
|
+
The safest guard is ``isinstance(dtype, pd.StringDtype)`` — it covers every
|
|
10
|
+
StringDtype variant without relying on the `.name` attribute.
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import pandas as pd
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def is_str_col(series: pd.Series) -> bool:
    """
    Report whether *series* holds a string-like dtype.

    Checks are tried in order and the first match wins:

    1. ``object`` dtype (how pandas 2.x stores strings by default);
    2. any ``pd.StringDtype`` instance (the pandas 3.x default);
    3. a dtype whose ``name`` is ``str`` / ``string`` / ``large_string``, or
       whose string form contains ``"string"`` (fallback for other variants).
    """
    col_dtype = series.dtype

    # Classic object-dtype columns.
    if col_dtype == object:
        return True

    # Every StringDtype flavour, whatever its storage backend.
    if hasattr(pd, "StringDtype") and isinstance(col_dtype, pd.StringDtype):
        return True

    # Name-based fallback for string dtypes not covered above.
    dtype_name = getattr(col_dtype, "name", "")
    if dtype_name in ("str", "string", "large_string"):
        return True
    return "string" in str(col_dtype).lower()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def str_columns(df: pd.DataFrame) -> list[str]:
    """Collect, in column order, the labels of *df* that ``is_str_col`` accepts."""
    selected: list[str] = []
    for label in df.columns:
        if is_str_col(df[label]):
            selected.append(label)
    return selected
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from datadoctor.analyzers import (
|
|
2
|
+
duplicate,
|
|
3
|
+
null_analyzer,
|
|
4
|
+
type_analyzer,
|
|
5
|
+
format_analyzer,
|
|
6
|
+
outlier,
|
|
7
|
+
pii_analyzer,
|
|
8
|
+
drift_analyzer,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
# Analyzer submodules re-exported by ``from datadoctor.analyzers import *``.
__all__ = [
    "duplicate", "null_analyzer", "type_analyzer", "format_analyzer",
    "outlier", "pii_analyzer", "drift_analyzer",
]
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from scipy import stats
|
|
8
|
+
|
|
9
|
+
from datadoctor._compat import is_str_col
|
|
10
|
+
|
|
11
|
+
_KS_SIGNIFICANCE = 0.05  # KS-test p-value below which a numeric column counts as drifted
_CATEGORY_DRIFT_THRESHOLD = 0.05  # minimum absolute change in a category's share (0-1 scale)
_MISSING_DRIFT_THRESHOLD_PCT = 5.0  # minimum change in null rate, in percentage points


def _null_rate(series: pd.Series) -> float:
    """Return the fraction of missing values in *series*, or 0.0 if it is empty."""
    # ``isna().mean()`` of a zero-length series is NaN; that NaN would be
    # written into the report and make the threshold comparison silently False.
    if series.empty:
        return 0.0
    return float(series.isna().mean())


def _sorted_labels(labels: set[Any]) -> list[Any]:
    """Sort column labels, falling back to string order for mixed label types."""
    try:
        return sorted(labels)
    except TypeError:
        # e.g. integer and string labels in one frame cannot be compared
        return sorted(labels, key=str)


def _category_changes(old_s: pd.Series, new_s: pd.Series) -> dict[str, Any]:
    """Return the categories whose relative frequency moved past the threshold."""
    old_freq = old_s.value_counts(normalize=True)
    new_freq = new_s.value_counts(normalize=True)
    changes: dict[str, Any] = {}
    # Iterate in string order so the report does not depend on set ordering.
    for cat in sorted(set(old_freq.index) | set(new_freq.index), key=str):
        # A category absent from one side has a share of 0 there.
        old_p = float(old_freq.get(cat, 0.0))
        new_p = float(new_freq.get(cat, 0.0))
        if abs(new_p - old_p) > _CATEGORY_DRIFT_THRESHOLD:
            changes[str(cat)] = {
                "old_pct": round(old_p * 100, 2),
                "new_pct": round(new_p * 100, 2),
            }
    return changes


def analyze(old_df: pd.DataFrame, new_df: pd.DataFrame) -> dict[str, Any]:
    """
    Compare the columns shared by two frames and report per-column drift.

    For every column present in both *old_df* and *new_df*:

    - the absolute change in null rate (percentage points) is always recorded;
    - if both sides are numeric and each has more than one non-null value, the
      two-sample Kolmogorov-Smirnov statistic is recorded;
    - otherwise, if both sides are string-like (per ``is_str_col``), categories
      whose share changed by more than ``_CATEGORY_DRIFT_THRESHOLD`` are recorded.

    Returns a dict with keys ``distribution_drift``, ``category_drift``,
    ``missing_value_drift`` and ``drifted_columns`` (sorted labels of columns
    that crossed any of the three thresholds).
    """
    distribution_drift: dict[str, float] = {}
    category_drift: dict[str, dict[str, Any]] = {}
    missing_value_drift: dict[str, float] = {}
    drifted: set[str] = set()

    common_cols = set(old_df.columns) & set(new_df.columns)

    for col in _sorted_labels(common_cols):
        old_s = old_df[col]
        new_s = new_df[col]

        # Missing-value drift
        mv_change = round(abs(_null_rate(new_s) - _null_rate(old_s)) * 100, 2)
        missing_value_drift[col] = mv_change
        if mv_change > _MISSING_DRIFT_THRESHOLD_PCT:
            drifted.add(col)

        # Numeric distribution drift (KS test)
        if pd.api.types.is_numeric_dtype(old_s) and pd.api.types.is_numeric_dtype(new_s):
            old_clean = old_s.dropna().astype(float)
            new_clean = new_s.dropna().astype(float)
            # Samples with fewer than two values are skipped.
            if len(old_clean) > 1 and len(new_clean) > 1:
                stat, p_value = stats.ks_2samp(old_clean, new_clean)
                distribution_drift[col] = round(float(stat), 4)
                if p_value < _KS_SIGNIFICANCE:
                    drifted.add(col)

        # Categorical distribution drift
        elif is_str_col(old_s) and is_str_col(new_s):
            changes = _category_changes(old_s, new_s)
            if changes:
                category_drift[col] = changes
                drifted.add(col)

    return {
        "distribution_drift": distribution_drift,
        "category_drift": category_drift,
        "missing_value_drift": missing_value_drift,
        "drifted_columns": _sorted_labels(drifted),
    }
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from datadoctor.models import Issue
|
|
6
|
+
|
|
7
|
+
_HIGH_THRESHOLD = 0.10  # >10% duplicates → high severity


def analyze(df: pd.DataFrame) -> list[Issue]:
    """
    Report fully duplicated rows in *df*.

    Returns an empty list when *df* is empty or ``df.duplicated()`` flags no
    rows. Otherwise returns a single ``duplicate_rows`` issue carrying the
    number of flagged rows, their share of all rows as a percentage, and
    their index labels. Severity is ``"high"`` when that share exceeds
    ``_HIGH_THRESHOLD`` and ``"medium"`` otherwise.
    """
    if df.empty:
        return []

    is_repeat = df.duplicated()
    n_repeats = int(is_repeat.sum())
    if not n_repeats:
        return []

    share = n_repeats / len(df)
    if share > _HIGH_THRESHOLD:
        level = "high"
    else:
        level = "medium"

    summary = {
        "percentage": round(share * 100, 2),
        "duplicate_indices": df[is_repeat].index.tolist(),
    }
    return [
        Issue(type="duplicate_rows", severity=level, count=n_repeats, details=summary)
    ]
|