dataruff 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. dataruff-0.1.0/PKG-INFO +235 -0
  2. dataruff-0.1.0/README.md +205 -0
  3. dataruff-0.1.0/datadoctor/__init__.py +36 -0
  4. dataruff-0.1.0/datadoctor/_compat.py +40 -0
  5. dataruff-0.1.0/datadoctor/analyzers/__init__.py +19 -0
  6. dataruff-0.1.0/datadoctor/analyzers/drift_analyzer.py +68 -0
  7. dataruff-0.1.0/datadoctor/analyzers/duplicate.py +33 -0
  8. dataruff-0.1.0/datadoctor/analyzers/format_analyzer.py +91 -0
  9. dataruff-0.1.0/datadoctor/analyzers/null_analyzer.py +52 -0
  10. dataruff-0.1.0/datadoctor/analyzers/outlier.py +73 -0
  11. dataruff-0.1.0/datadoctor/analyzers/pii_analyzer.py +58 -0
  12. dataruff-0.1.0/datadoctor/analyzers/type_analyzer.py +51 -0
  13. dataruff-0.1.0/datadoctor/anomalies.py +52 -0
  14. dataruff-0.1.0/datadoctor/audit.py +19 -0
  15. dataruff-0.1.0/datadoctor/cli.py +115 -0
  16. dataruff-0.1.0/datadoctor/compare.py +50 -0
  17. dataruff-0.1.0/datadoctor/drift.py +20 -0
  18. dataruff-0.1.0/datadoctor/fix.py +14 -0
  19. dataruff-0.1.0/datadoctor/fixing/__init__.py +3 -0
  20. dataruff-0.1.0/datadoctor/fixing/engine.py +83 -0
  21. dataruff-0.1.0/datadoctor/investigate.py +40 -0
  22. dataruff-0.1.0/datadoctor/loader.py +26 -0
  23. dataruff-0.1.0/datadoctor/models.py +89 -0
  24. dataruff-0.1.0/datadoctor/pii.py +69 -0
  25. dataruff-0.1.0/datadoctor/reporting/__init__.py +10 -0
  26. dataruff-0.1.0/datadoctor/reporting/json_reporter.py +29 -0
  27. dataruff-0.1.0/datadoctor/reporting/terminal.py +111 -0
  28. dataruff-0.1.0/datadoctor/score.py +17 -0
  29. dataruff-0.1.0/datadoctor/scoring/__init__.py +3 -0
  30. dataruff-0.1.0/datadoctor/scoring/engine.py +128 -0
  31. dataruff-0.1.0/datadoctor/validate.py +107 -0
  32. dataruff-0.1.0/dataruff.egg-info/PKG-INFO +235 -0
  33. dataruff-0.1.0/dataruff.egg-info/SOURCES.txt +49 -0
  34. dataruff-0.1.0/dataruff.egg-info/dependency_links.txt +1 -0
  35. dataruff-0.1.0/dataruff.egg-info/entry_points.txt +2 -0
  36. dataruff-0.1.0/dataruff.egg-info/requires.txt +14 -0
  37. dataruff-0.1.0/dataruff.egg-info/top_level.txt +1 -0
  38. dataruff-0.1.0/pyproject.toml +59 -0
  39. dataruff-0.1.0/setup.cfg +4 -0
  40. dataruff-0.1.0/tests/test_anomalies.py +97 -0
  41. dataruff-0.1.0/tests/test_audit.py +48 -0
  42. dataruff-0.1.0/tests/test_cli.py +103 -0
  43. dataruff-0.1.0/tests/test_compare.py +108 -0
  44. dataruff-0.1.0/tests/test_drift.py +93 -0
  45. dataruff-0.1.0/tests/test_fix.py +146 -0
  46. dataruff-0.1.0/tests/test_investigate.py +86 -0
  47. dataruff-0.1.0/tests/test_loader.py +46 -0
  48. dataruff-0.1.0/tests/test_pii.py +180 -0
  49. dataruff-0.1.0/tests/test_reporting.py +175 -0
  50. dataruff-0.1.0/tests/test_score.py +95 -0
  51. dataruff-0.1.0/tests/test_validate.py +116 -0
@@ -0,0 +1,235 @@
1
+ Metadata-Version: 2.4
2
+ Name: dataruff
3
+ Version: 0.1.0
4
+ Summary: One-command dataset health diagnostics — the Ruff of datasets.
5
+ Author: dataruff contributors
6
+ License-Expression: MIT
7
+ Keywords: data quality,pandas,csv,data validation,EDA,data science
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
16
+ Requires-Python: >=3.10
17
+ Description-Content-Type: text/markdown
18
+ Requires-Dist: pandas>=2.0
19
+ Requires-Dist: numpy>=1.24
20
+ Requires-Dist: scipy>=1.10
21
+ Requires-Dist: scikit-learn>=1.3
22
+ Requires-Dist: openpyxl>=3.1
23
+ Requires-Dist: python-dateutil>=2.8
24
+ Provides-Extra: rich
25
+ Requires-Dist: rich>=13.0; extra == "rich"
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest>=7.4; extra == "dev"
28
+ Requires-Dist: pytest-cov>=4.1; extra == "dev"
29
+ Requires-Dist: rich>=13.0; extra == "dev"
30
+
31
+ # dataruff
32
+
33
+ [![CI](https://github.com/AryanPatankar27/dataruff/actions/workflows/ci.yml/badge.svg)](https://github.com/AryanPatankar27/dataruff/actions/workflows/ci.yml)
34
+ [![codecov](https://codecov.io/gh/AryanPatankar27/dataruff/branch/main/graph/badge.svg)](https://codecov.io/gh/AryanPatankar27/dataruff)
35
+ [![PyPI version](https://img.shields.io/pypi/v/dataruff)](https://pypi.org/project/dataruff/)
36
+ [![Python](https://img.shields.io/pypi/pyversions/dataruff)](https://pypi.org/project/dataruff/)
37
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
38
+
39
+ **The Ruff of datasets.** One command to discover, explain, score, and fix data quality problems in Pandas DataFrames and CSV/Excel files.
40
+
41
+ ```python
42
+ from datadoctor import audit
43
+
44
+ audit(df)
45
+ ```
46
+
47
+ ```
48
+ Data Quality Score: 81/100
49
+
50
+ Issues Found (5):
51
+ ! 42 duplicate rows
52
+ ~ 13 invalid email (column: email)
53
+ ! 3 empty columns
54
+ ~ 7 outlier (column: salary)
55
+ . 2 inconsistent date format (column: created_at)
56
+
57
+ Rows: 10,000 | Columns: 12
58
+ ```
59
+
60
+ ---
61
+
62
+ ## Install
63
+
64
+ ```bash
65
+ pip install dataruff
66
+ ```
67
+
68
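+ Note: the distribution is installed as `dataruff`, but the importable package is named `datadoctor` (hence `from datadoctor import ...` in the examples). Optionally install [rich](https://github.com/Textualize/rich) for prettier terminal output: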
69
+
70
+ ```bash
71
+ pip install dataruff[rich]
72
+ ```
73
+
74
+ ---
75
+
76
+ ## Quick start
77
+
78
+ ```python
79
+ import pandas as pd
80
+ from datadoctor import audit, fix, score, validate, detect_pii
81
+
82
+ df = pd.read_csv("customers.csv")
83
+
84
+ # Full health report
85
+ audit(df)
86
+
87
+ # Get numeric score
88
+ s = score(df)
89
+ print(s.overall) # 81
90
+ print(s.to_dict()) # {'overall': 81, 'completeness': 92, ...}
91
+
92
+ # Auto-fix common issues
93
+ clean_df = fix(df)
94
+
95
+ # Validate against a schema
96
+ result = validate(df, schema={
97
+ "email": "email",
98
+ "age": "0-120",
99
+ "id": "unique",
100
+ })
101
+
102
+ # PII detection
103
+ report = detect_pii(df)
104
+ print(report.columns_with_pii)
105
+ # {'email': ['email'], 'phone': ['phone'], 'uid': ['aadhaar']}
106
+ ```
107
+
108
+ ---
109
+
110
+ ## API reference
111
+
112
+ | Function | Description | Returns |
113
+ |---|---|---|
114
+ | `audit(df)` | Print full health report | `InvestigationReport` |
115
+ | `investigate(df)` | Structured issue breakdown | `InvestigationReport` |
116
+ | `score(df)` | Data quality score | `ScoreBreakdown` |
117
+ | `fix(df)` | Auto-repair common issues | `pd.DataFrame` |
118
+ | `validate(df, schema)` | Check schema constraints | `dict` |
119
+ | `compare(old, new)` | Diff two datasets | `ComparisonReport` |
120
+ | `detect_pii(df)` | Find PII columns | `PIIReport` |
121
+ | `mask_pii(df)` | Redact PII values | `pd.DataFrame` |
122
+ | `detect_drift(old, new)` | Distribution drift analysis | `DriftReport` |
123
+ | `find_anomalies(df)` | Anomaly / outlier detection | `dict` |
124
+
125
+ All functions accept a **DataFrame, CSV path, or XLSX path** as input.
126
+
127
+ ---
128
+
129
+ ## Scoring formula
130
+
131
+ | Dimension | Weight | Measures |
132
+ |---|---|---|
133
+ | Completeness | 25% | Non-null ratio across all cells |
134
+ | Validity | 25% | Format correctness (emails, dates, types) |
135
+ | Consistency | 20% | Uniform types and formats per column |
136
+ | Uniqueness | 20% | Absence of duplicate rows |
137
+ | Schema compliance | 10% | Adherence to user-provided schema |
138
+
139
+ ---
140
+
141
+ ## `fix()` — what gets repaired
142
+
143
+ | Issue | Fix applied |
144
+ |---|---|
145
+ | Duplicate rows | Removed |
146
+ | Leading/trailing whitespace | Stripped |
147
+ | Boolean strings (`yes/no/true/false`) | Converted to `bool` |
148
+ | Mixed date formats | Normalized to `YYYY-MM-DD` |
149
+ | Missing numeric values | Filled with column median |
150
+ | Missing string values | Filled with column mode |
151
+
152
+ ---
153
+
154
+ ## `validate()` — schema rules
155
+
156
+ ```python
157
+ validate(df, schema={
158
+ "email": "email", # valid email format
159
+ "age": "0-120", # numeric range
160
+ "user_id": "unique", # no duplicates
161
+ "price": "positive", # > 0
162
+ "code": "not_null", # no missing values
163
+ "ref": "regex:[A-Z]{3}", # custom regex
164
+ })
165
+ ```
166
+
167
+ ---
168
+
169
+ ## `detect_pii()` — supported PII types
170
+
171
+ | Type | Example |
172
+ |---|---|
173
+ | `email` | `alice@example.com` |
174
+ | `phone` | `9876543210` |
175
+ | `aadhaar` | `2345 6789 0123` |
176
+ | `pan` | `ABCDE1234F` |
177
+ | `ssn` | `123-45-6789` |
178
+ | `credit_card` | `4111 1111 1111 1111` |
179
+
180
+ ---
181
+
182
+ ## CLI
183
+
184
+ ```bash
185
+ # Audit a CSV file
186
+ dataruff audit customers.csv
187
+
188
+ # Output as JSON
189
+ dataruff audit customers.csv --json
190
+
191
+ # Fix issues and write cleaned file
192
+ dataruff fix customers.csv
193
+ # -> customers_clean.csv
194
+
195
+ # Compare two datasets
196
+ dataruff compare old.csv new.csv
197
+
198
+ # Data quality score
199
+ dataruff score customers.csv
200
+
201
+ # PII detection
202
+ dataruff detect-pii customers.csv
203
+
204
+ # Mask PII
205
+ dataruff mask-pii customers.csv
206
+ # -> customers_masked.csv
207
+ ```
208
+
209
+ ---
210
+
211
+ ## Architecture
212
+
213
+ ```
214
+ datadoctor/
215
+ ├── analyzers/ # DuplicateAnalyzer, NullAnalyzer, TypeAnalyzer,
216
+ │ # FormatAnalyzer, OutlierAnalyzer, PIIAnalyzer, DriftAnalyzer
217
+ ├── scoring/ # Weighted scoring engine
218
+ ├── fixing/ # Auto-remediation rules
219
+ └── reporting/ # Terminal (rich + plain fallback) and JSON output
220
+ ```
221
+
222
+ No LLMs. No API calls. Everything deterministic and offline.
223
+
224
+ ---
225
+
226
+ ## Requirements
227
+
228
+ - Python 3.10+
229
+ - pandas, numpy, scipy, scikit-learn, openpyxl, python-dateutil
230
+
231
+ ---
232
+
233
+ ## License
234
+
235
+ MIT
@@ -0,0 +1,205 @@
1
+ # dataruff
2
+
3
+ [![CI](https://github.com/AryanPatankar27/dataruff/actions/workflows/ci.yml/badge.svg)](https://github.com/AryanPatankar27/dataruff/actions/workflows/ci.yml)
4
+ [![codecov](https://codecov.io/gh/AryanPatankar27/dataruff/branch/main/graph/badge.svg)](https://codecov.io/gh/AryanPatankar27/dataruff)
5
+ [![PyPI version](https://img.shields.io/pypi/v/dataruff)](https://pypi.org/project/dataruff/)
6
+ [![Python](https://img.shields.io/pypi/pyversions/dataruff)](https://pypi.org/project/dataruff/)
7
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
8
+
9
+ **The Ruff of datasets.** One command to discover, explain, score, and fix data quality problems in Pandas DataFrames and CSV/Excel files.
10
+
11
+ ```python
12
+ from datadoctor import audit
13
+
14
+ audit(df)
15
+ ```
16
+
17
+ ```
18
+ Data Quality Score: 81/100
19
+
20
+ Issues Found (5):
21
+ ! 42 duplicate rows
22
+ ~ 13 invalid email (column: email)
23
+ ! 3 empty columns
24
+ ~ 7 outlier (column: salary)
25
+ . 2 inconsistent date format (column: created_at)
26
+
27
+ Rows: 10,000 | Columns: 12
28
+ ```
29
+
30
+ ---
31
+
32
+ ## Install
33
+
34
+ ```bash
35
+ pip install dataruff
36
+ ```
37
+
38
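+ Note: the distribution is installed as `dataruff`, but the importable package is named `datadoctor` (hence `from datadoctor import ...` in the examples). Optionally install [rich](https://github.com/Textualize/rich) for prettier terminal output: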
39
+
40
+ ```bash
41
+ pip install dataruff[rich]
42
+ ```
43
+
44
+ ---
45
+
46
+ ## Quick start
47
+
48
+ ```python
49
+ import pandas as pd
50
+ from datadoctor import audit, fix, score, validate, detect_pii
51
+
52
+ df = pd.read_csv("customers.csv")
53
+
54
+ # Full health report
55
+ audit(df)
56
+
57
+ # Get numeric score
58
+ s = score(df)
59
+ print(s.overall) # 81
60
+ print(s.to_dict()) # {'overall': 81, 'completeness': 92, ...}
61
+
62
+ # Auto-fix common issues
63
+ clean_df = fix(df)
64
+
65
+ # Validate against a schema
66
+ result = validate(df, schema={
67
+ "email": "email",
68
+ "age": "0-120",
69
+ "id": "unique",
70
+ })
71
+
72
+ # PII detection
73
+ report = detect_pii(df)
74
+ print(report.columns_with_pii)
75
+ # {'email': ['email'], 'phone': ['phone'], 'uid': ['aadhaar']}
76
+ ```
77
+
78
+ ---
79
+
80
+ ## API reference
81
+
82
+ | Function | Description | Returns |
83
+ |---|---|---|
84
+ | `audit(df)` | Print full health report | `InvestigationReport` |
85
+ | `investigate(df)` | Structured issue breakdown | `InvestigationReport` |
86
+ | `score(df)` | Data quality score | `ScoreBreakdown` |
87
+ | `fix(df)` | Auto-repair common issues | `pd.DataFrame` |
88
+ | `validate(df, schema)` | Check schema constraints | `dict` |
89
+ | `compare(old, new)` | Diff two datasets | `ComparisonReport` |
90
+ | `detect_pii(df)` | Find PII columns | `PIIReport` |
91
+ | `mask_pii(df)` | Redact PII values | `pd.DataFrame` |
92
+ | `detect_drift(old, new)` | Distribution drift analysis | `DriftReport` |
93
+ | `find_anomalies(df)` | Anomaly / outlier detection | `dict` |
94
+
95
+ All functions accept a **DataFrame, CSV path, or XLSX path** as input.
96
+
97
+ ---
98
+
99
+ ## Scoring formula
100
+
101
+ | Dimension | Weight | Measures |
102
+ |---|---|---|
103
+ | Completeness | 25% | Non-null ratio across all cells |
104
+ | Validity | 25% | Format correctness (emails, dates, types) |
105
+ | Consistency | 20% | Uniform types and formats per column |
106
+ | Uniqueness | 20% | Absence of duplicate rows |
107
+ | Schema compliance | 10% | Adherence to user-provided schema |
108
+
109
+ ---
110
+
111
+ ## `fix()` — what gets repaired
112
+
113
+ | Issue | Fix applied |
114
+ |---|---|
115
+ | Duplicate rows | Removed |
116
+ | Leading/trailing whitespace | Stripped |
117
+ | Boolean strings (`yes/no/true/false`) | Converted to `bool` |
118
+ | Mixed date formats | Normalized to `YYYY-MM-DD` |
119
+ | Missing numeric values | Filled with column median |
120
+ | Missing string values | Filled with column mode |
121
+
122
+ ---
123
+
124
+ ## `validate()` — schema rules
125
+
126
+ ```python
127
+ validate(df, schema={
128
+ "email": "email", # valid email format
129
+ "age": "0-120", # numeric range
130
+ "user_id": "unique", # no duplicates
131
+ "price": "positive", # > 0
132
+ "code": "not_null", # no missing values
133
+ "ref": "regex:[A-Z]{3}", # custom regex
134
+ })
135
+ ```
136
+
137
+ ---
138
+
139
+ ## `detect_pii()` — supported PII types
140
+
141
+ | Type | Example |
142
+ |---|---|
143
+ | `email` | `alice@example.com` |
144
+ | `phone` | `9876543210` |
145
+ | `aadhaar` | `2345 6789 0123` |
146
+ | `pan` | `ABCDE1234F` |
147
+ | `ssn` | `123-45-6789` |
148
+ | `credit_card` | `4111 1111 1111 1111` |
149
+
150
+ ---
151
+
152
+ ## CLI
153
+
154
+ ```bash
155
+ # Audit a CSV file
156
+ dataruff audit customers.csv
157
+
158
+ # Output as JSON
159
+ dataruff audit customers.csv --json
160
+
161
+ # Fix issues and write cleaned file
162
+ dataruff fix customers.csv
163
+ # -> customers_clean.csv
164
+
165
+ # Compare two datasets
166
+ dataruff compare old.csv new.csv
167
+
168
+ # Data quality score
169
+ dataruff score customers.csv
170
+
171
+ # PII detection
172
+ dataruff detect-pii customers.csv
173
+
174
+ # Mask PII
175
+ dataruff mask-pii customers.csv
176
+ # -> customers_masked.csv
177
+ ```
178
+
179
+ ---
180
+
181
+ ## Architecture
182
+
183
+ ```
184
+ datadoctor/
185
+ ├── analyzers/ # DuplicateAnalyzer, NullAnalyzer, TypeAnalyzer,
186
+ │ # FormatAnalyzer, OutlierAnalyzer, PIIAnalyzer, DriftAnalyzer
187
+ ├── scoring/ # Weighted scoring engine
188
+ ├── fixing/ # Auto-remediation rules
189
+ └── reporting/ # Terminal (rich + plain fallback) and JSON output
190
+ ```
191
+
192
+ No LLMs. No API calls. Everything deterministic and offline.
193
+
194
+ ---
195
+
196
+ ## Requirements
197
+
198
+ - Python 3.10+
199
+ - pandas, numpy, scipy, scikit-learn, openpyxl, python-dateutil
200
+
201
+ ---
202
+
203
+ ## License
204
+
205
+ MIT
@@ -0,0 +1,36 @@
1
+ """
2
+ dataruff — One-command dataset health diagnostics.
3
+
4
+ Usage:
5
+ from datadoctor import audit, fix, score, detect_pii
6
+
7
+ audit(df) # Print quality report
8
+ fix(df) # Return cleaned DataFrame
9
+ score(df) # Return ScoreBreakdown
10
+ detect_pii(df) # Return PIIReport
11
+ """
12
+
13
+ from datadoctor.audit import audit
14
+ from datadoctor.investigate import investigate
15
+ from datadoctor.fix import fix
16
+ from datadoctor.validate import validate
17
+ from datadoctor.compare import compare
18
+ from datadoctor.pii import detect_pii, mask_pii
19
+ from datadoctor.drift import detect_drift
20
+ from datadoctor.anomalies import find_anomalies
21
+ from datadoctor.score import score
22
+
23
+ __version__ = "0.1.0"
24
+
25
+ __all__ = [
26
+ "audit",
27
+ "investigate",
28
+ "fix",
29
+ "validate",
30
+ "compare",
31
+ "detect_pii",
32
+ "mask_pii",
33
+ "detect_drift",
34
+ "find_anomalies",
35
+ "score",
36
+ ]
@@ -0,0 +1,40 @@
1
+ """
2
+ Pandas 2.x / 3.x dtype compatibility.
3
+
4
+ In pandas 2.x string columns have dtype ``object`` (dtype.name == 'object').
5
+ In pandas 3.x (infer_string=True by default) they have a ``StringDtype``
6
+ instance whose repr shows as ``dtype: str`` (dtype.name may be 'str',
7
+ 'string', or 'string[python]' depending on the sub-release).
8
+
9
+ The safest guard is ``isinstance(dtype, pd.StringDtype)`` — it covers every
10
+ StringDtype variant without relying on the `.name` attribute.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import pandas as pd
15
+
16
+
17
def is_str_col(series: pd.Series) -> bool:
    """
    Report whether *series* holds string-like data on pandas 2.x or 3.x.

    A column is considered string-like when any of these holds:

    - its dtype equals ``object`` (the pandas 2.x default for text);
    - its dtype is an instance of ``pd.StringDtype`` (any storage backend);
    - its dtype name is ``str``, ``string`` or ``large_string``, or the
      dtype's string form contains ``string`` (case-insensitive).
    """
    col_dtype = series.dtype

    # Classic object dtype, the pandas 2.x representation of text columns.
    if col_dtype == object:
        return True

    # Every StringDtype variant; the attribute lookup keeps this safe on a
    # pandas build that does not expose StringDtype at all.
    string_dtype_cls = getattr(pd, "StringDtype", None)
    if string_dtype_cls is not None and isinstance(col_dtype, string_dtype_cls):
        return True

    # Last resort: recognise other string dtypes purely by their name.
    if getattr(col_dtype, "name", "") in ("str", "string", "large_string"):
        return True
    return "string" in str(col_dtype).lower()
36
+
37
+
38
def str_columns(df: pd.DataFrame) -> list[str]:
    """
    Return the labels of all string-like columns in *df*, in column order.

    Columns are visited through ``df.items()`` so that each column is
    inspected as its own Series. Looking columns up by label (``df[col]``)
    returns a DataFrame when labels are duplicated, which has no single
    ``dtype`` and would make the check fail. A duplicated label appears
    once per matching column.
    """
    return [label for label, column in df.items() if is_str_col(column)]
@@ -0,0 +1,19 @@
1
+ from datadoctor.analyzers import (
2
+ duplicate,
3
+ null_analyzer,
4
+ type_analyzer,
5
+ format_analyzer,
6
+ outlier,
7
+ pii_analyzer,
8
+ drift_analyzer,
9
+ )
10
+
11
+ __all__ = [
12
+ "duplicate",
13
+ "null_analyzer",
14
+ "type_analyzer",
15
+ "format_analyzer",
16
+ "outlier",
17
+ "pii_analyzer",
18
+ "drift_analyzer",
19
+ ]
@@ -0,0 +1,68 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from scipy import stats
8
+
9
+ from datadoctor._compat import is_str_col
10
+
11
# Two-sample KS p-value below which a numeric column is flagged as drifted.
_KS_SIGNIFICANCE = 0.05
# Minimum absolute change in a category's relative frequency (as a fraction)
# for that category to be reported.
_CATEGORY_DRIFT_THRESHOLD = 0.05
# Minimum absolute change in a column's null rate, in percentage points,
# for the column to be flagged as drifted.
_MISSING_VALUE_DRIFT_THRESHOLD = 5.0


def _ordered(labels: set[Any]) -> list[Any]:
    """Sort *labels*; fall back to ordering by ``str()`` when they are not mutually comparable."""
    try:
        return sorted(labels)
    except TypeError:
        return sorted(labels, key=str)


def _null_rate(series: pd.Series) -> float:
    """Return the fraction of missing values in *series* (0.0 for an empty series)."""
    if series.empty:
        # mean() of an empty series is NaN, which would leak into the report.
        return 0.0
    return float(series.isna().mean())


def analyze(old_df: pd.DataFrame, new_df: pd.DataFrame) -> dict[str, Any]:
    """
    Compare the columns shared by *old_df* and *new_df* and report drift.

    Three checks are run per shared column:

    - missing-value drift: absolute change in null rate, in percentage
      points; the column is flagged when it exceeds
      ``_MISSING_VALUE_DRIFT_THRESHOLD``;
    - numeric drift (both columns numeric, more than one non-null value on
      each side): two-sample KS statistic; the column is flagged when the
      p-value is below ``_KS_SIGNIFICANCE``;
    - categorical drift (both columns string-like): categories whose
      relative frequency changed by more than
      ``_CATEGORY_DRIFT_THRESHOLD``; the column is flagged when any exist.

    Returns a dict with keys ``distribution_drift`` (column -> KS statistic),
    ``category_drift`` (column -> {category: {"old_pct", "new_pct"}}),
    ``missing_value_drift`` (column -> percentage-point change) and
    ``drifted_columns`` (sorted list of flagged columns).
    """
    distribution_drift: dict[str, float] = {}
    category_drift: dict[str, dict[str, Any]] = {}
    missing_value_drift: dict[str, float] = {}
    drifted: set[str] = set()

    common_cols = set(old_df.columns) & set(new_df.columns)

    for col in _ordered(common_cols):
        old_s = old_df[col]
        new_s = new_df[col]

        # Missing-value drift
        mv_change = round(abs(_null_rate(new_s) - _null_rate(old_s)) * 100, 2)
        missing_value_drift[col] = mv_change
        if mv_change > _MISSING_VALUE_DRIFT_THRESHOLD:
            drifted.add(col)

        # Numeric distribution drift (KS test)
        if pd.api.types.is_numeric_dtype(old_s) and pd.api.types.is_numeric_dtype(new_s):
            old_clean = old_s.dropna().astype(float)
            new_clean = new_s.dropna().astype(float)
            if len(old_clean) > 1 and len(new_clean) > 1:
                stat, p_value = stats.ks_2samp(old_clean, new_clean)
                distribution_drift[col] = round(float(stat), 4)
                if p_value < _KS_SIGNIFICANCE:
                    drifted.add(col)

        # Categorical distribution drift
        elif is_str_col(old_s) and is_str_col(new_s):
            old_freq = old_s.value_counts(normalize=True)
            new_freq = new_s.value_counts(normalize=True)
            all_cats = set(old_freq.index) | set(new_freq.index)
            changes: dict[str, Any] = {}
            # Iterate in sorted order so the report's key order is the same
            # on every run (plain set iteration order is not).
            for cat in _ordered(all_cats):
                old_p = float(old_freq.get(cat, 0.0))
                new_p = float(new_freq.get(cat, 0.0))
                if abs(new_p - old_p) > _CATEGORY_DRIFT_THRESHOLD:
                    changes[str(cat)] = {
                        "old_pct": round(old_p * 100, 2),
                        "new_pct": round(new_p * 100, 2),
                    }
            if changes:
                category_drift[col] = changes
                drifted.add(col)

    return {
        "distribution_drift": distribution_drift,
        "category_drift": category_drift,
        "missing_value_drift": missing_value_drift,
        "drifted_columns": _ordered(drifted),
    }
@@ -0,0 +1,33 @@
1
+ from __future__ import annotations
2
+
3
+ import pandas as pd
4
+
5
+ from datadoctor.models import Issue
6
+
7
# Share of duplicated rows above which the issue is rated "high" (more than 10%).
_HIGH_THRESHOLD = 0.10


def analyze(df: pd.DataFrame) -> list[Issue]:
    """
    Detect fully duplicated rows in *df*.

    Returns an empty list when the frame is empty or has no duplicates.
    Otherwise returns a single ``duplicate_rows`` issue whose count is the
    number of rows flagged by ``df.duplicated()``, rated "high" when their
    share of all rows exceeds ``_HIGH_THRESHOLD`` and "medium" otherwise.
    The issue details carry the percentage and the index labels of the
    flagged rows.
    """
    if df.empty:
        return []

    is_repeat = df.duplicated()
    n_repeats = int(is_repeat.sum())
    if not n_repeats:
        return []

    fraction = n_repeats / len(df)
    if fraction > _HIGH_THRESHOLD:
        level = "high"
    else:
        level = "medium"

    details = {
        "percentage": round(fraction * 100, 2),
        "duplicate_indices": df.loc[is_repeat].index.tolist(),
    }
    return [
        Issue(
            type="duplicate_rows",
            severity=level,
            count=n_repeats,
            details=details,
        )
    ]