dataruff 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datadoctor/__init__.py ADDED
@@ -0,0 +1,36 @@
1
+ """
2
+ dataruff — One-command dataset health diagnostics.
3
+
4
+ Usage:
5
+ from datadoctor import audit, fix, score, detect_pii
6
+
7
+ audit(df) # Print quality report
8
+ fix(df) # Return cleaned DataFrame
9
+ score(df) # Return ScoreBreakdown
10
+ detect_pii(df) # Return PIIReport
11
+ """
12
+
13
+ from datadoctor.audit import audit
14
+ from datadoctor.investigate import investigate
15
+ from datadoctor.fix import fix
16
+ from datadoctor.validate import validate
17
+ from datadoctor.compare import compare
18
+ from datadoctor.pii import detect_pii, mask_pii
19
+ from datadoctor.drift import detect_drift
20
+ from datadoctor.anomalies import find_anomalies
21
+ from datadoctor.score import score
22
+
23
+ __version__ = "0.1.0"
24
+
25
+ __all__ = [
26
+ "audit",
27
+ "investigate",
28
+ "fix",
29
+ "validate",
30
+ "compare",
31
+ "detect_pii",
32
+ "mask_pii",
33
+ "detect_drift",
34
+ "find_anomalies",
35
+ "score",
36
+ ]
datadoctor/_compat.py ADDED
@@ -0,0 +1,40 @@
1
+ """
2
+ Pandas 2.x / 3.x dtype compatibility.
3
+
4
+ In pandas 2.x string columns have dtype ``object`` (dtype.name == 'object').
5
+ In pandas 3.x (infer_string=True by default) they have a ``StringDtype``
6
+ instance whose repr shows as ``dtype: str`` (dtype.name may be 'str',
7
+ 'string', or 'string[python]' depending on the sub-release).
8
+
9
+ The safest guard is ``isinstance(dtype, pd.StringDtype)`` — it covers every
10
+ StringDtype variant without relying on the `.name` attribute.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import pandas as pd
15
+
16
+
17
def is_str_col(series: pd.Series) -> bool:
    """
    Report whether *series* holds string-like data (pandas 2.x and 3.x).

    A column is treated as string-like when any of these holds:

    - its dtype compares equal to ``object``;
    - its dtype is an instance of ``pd.StringDtype`` (when pandas exposes
      that class);
    - its dtype ``name`` is ``str``, ``string`` or ``large_string``;
    - the lowercased ``str()`` of the dtype contains ``string``.
    """
    col_dtype = series.dtype
    string_dtype_cls = getattr(pd, "StringDtype", None)
    dtype_name = getattr(col_dtype, "name", "")

    return bool(
        # Classic object dtype (pandas 2.x default for text).
        col_dtype == object
        # Any StringDtype variant, whatever the storage backend.
        or (string_dtype_cls is not None and isinstance(col_dtype, string_dtype_cls))
        # Name-based safety net for other string dtypes.
        or dtype_name in ("str", "string", "large_string")
        or "string" in str(col_dtype).lower()
    )
36
+
37
+
38
def str_columns(df: pd.DataFrame) -> list[str]:
    """Collect, in column order, the labels of *df* whose column passes ``is_str_col``."""
    selected = []
    for label in df.columns:
        if is_str_col(df[label]):
            selected.append(label)
    return selected
@@ -0,0 +1,19 @@
1
+ from datadoctor.analyzers import (
2
+ duplicate,
3
+ null_analyzer,
4
+ type_analyzer,
5
+ format_analyzer,
6
+ outlier,
7
+ pii_analyzer,
8
+ drift_analyzer,
9
+ )
10
+
11
+ __all__ = [
12
+ "duplicate",
13
+ "null_analyzer",
14
+ "type_analyzer",
15
+ "format_analyzer",
16
+ "outlier",
17
+ "pii_analyzer",
18
+ "drift_analyzer",
19
+ ]
@@ -0,0 +1,68 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from scipy import stats
8
+
9
+ from datadoctor._compat import is_str_col
10
+
11
+ _KS_SIGNIFICANCE = 0.05
12
+ _CATEGORY_DRIFT_THRESHOLD = 0.05
13
+
14
+
15
def analyze(old_df: pd.DataFrame, new_df: pd.DataFrame) -> dict[str, Any]:
    """
    Compare the columns shared by *old_df* and *new_df* and report drift.

    For every shared column (visited in sorted order):

    - the absolute change in null fraction, in percentage points, is
      recorded; a change above 5.0 marks the column as drifted;
    - if both sides are numeric and each has more than one non-null value,
      the two-sample KS statistic is recorded and a p-value below
      ``_KS_SIGNIFICANCE`` marks the column as drifted;
    - otherwise, if both sides are string-like, every category whose
      relative frequency moved by more than ``_CATEGORY_DRIFT_THRESHOLD``
      is recorded and the column is marked as drifted.

    Returns a dict with keys ``distribution_drift``, ``category_drift``,
    ``missing_value_drift`` and ``drifted_columns`` (sorted list).
    """
    ks_by_col: dict[str, float] = {}
    shifts_by_col: dict[str, dict[str, Any]] = {}
    null_delta_by_col: dict[str, float] = {}
    flagged: set[str] = set()

    shared = sorted(set(old_df.columns) & set(new_df.columns))

    for name in shared:
        before = old_df[name]
        after = new_df[name]

        # Missing-value drift, expressed in percentage points.
        null_before = before.isna().mean()
        null_after = after.isna().mean()
        null_delta = round(abs(null_after - null_before) * 100, 2)
        null_delta_by_col[name] = null_delta
        if null_delta > 5.0:
            flagged.add(name)

        # Numeric columns: two-sample Kolmogorov-Smirnov test.
        if pd.api.types.is_numeric_dtype(before) and pd.api.types.is_numeric_dtype(after):
            before_vals = before.dropna().astype(float)
            after_vals = after.dropna().astype(float)
            if len(before_vals) <= 1 or len(after_vals) <= 1:
                continue
            ks_stat, ks_p = stats.ks_2samp(before_vals, after_vals)
            ks_by_col[name] = round(float(ks_stat), 4)
            if ks_p < _KS_SIGNIFICANCE:
                flagged.add(name)
            continue

        # Only string-like pairs get the categorical comparison.
        if not (is_str_col(before) and is_str_col(after)):
            continue

        share_before = before.value_counts(normalize=True)
        share_after = after.value_counts(normalize=True)
        shifted: dict[str, Any] = {}
        for label in set(share_before.index) | set(share_after.index):
            p_before = float(share_before.get(label, 0.0))
            p_after = float(share_after.get(label, 0.0))
            if abs(p_after - p_before) > _CATEGORY_DRIFT_THRESHOLD:
                shifted[str(label)] = {
                    "old_pct": round(p_before * 100, 2),
                    "new_pct": round(p_after * 100, 2),
                }
        if shifted:
            shifts_by_col[name] = shifted
            flagged.add(name)

    return {
        "distribution_drift": ks_by_col,
        "category_drift": shifts_by_col,
        "missing_value_drift": null_delta_by_col,
        "drifted_columns": sorted(flagged),
    }
@@ -0,0 +1,33 @@
1
+ from __future__ import annotations
2
+
3
+ import pandas as pd
4
+
5
+ from datadoctor.models import Issue
6
+
7
+ _HIGH_THRESHOLD = 0.10 # >10% duplicates → high severity
8
+
9
+
10
def analyze(df: pd.DataFrame) -> list[Issue]:
    """
    Report fully duplicated rows in *df*.

    Returns an empty list when *df* is empty or has no duplicates;
    otherwise a single ``duplicate_rows`` issue whose severity is "high"
    when the duplicate share exceeds ``_HIGH_THRESHOLD`` and "medium"
    otherwise. The issue details carry the percentage and the index labels
    of the rows flagged by ``DataFrame.duplicated()``.
    """
    if df.empty:
        return []

    dup_flags = df.duplicated()
    n_dups = int(dup_flags.sum())
    if not n_dups:
        return []

    share = n_dups / len(df)
    if share > _HIGH_THRESHOLD:
        level = "high"
    else:
        level = "medium"

    flagged_labels = df.index[dup_flags.to_numpy()].tolist()
    issue = Issue(
        type="duplicate_rows",
        severity=level,
        count=n_dups,
        details={
            "percentage": round(share * 100, 2),
            "duplicate_indices": flagged_labels,
        },
    )
    return [issue]
@@ -0,0 +1,91 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+ import pandas as pd
6
+ from dateutil.parser import ParserError
7
+ from dateutil.parser import parse as parse_date
8
+
9
+ from datadoctor._compat import is_str_col
10
+ from datadoctor.models import Issue
11
+
12
+ _EMAIL_RE = re.compile(r"^[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}$")
13
+ _EMAIL_HINTS = ("email", "mail", "e-mail", "e_mail")
14
+ _DATE_HINTS = ("date", "time", "dt", "created", "updated", "modified", "timestamp")
15
+
16
+
17
def _is_email_col(name: str) -> bool:
    """
    Return True when the column label contains one of ``_EMAIL_HINTS``.

    The match is a case-insensitive substring test. The label is coerced
    with ``str()`` first: pandas column labels are not required to be
    strings (integer labels are common), and calling ``.lower()`` on such
    a label would raise ``AttributeError``.
    """
    low = str(name).lower()
    return any(h in low for h in _EMAIL_HINTS)
20
+
21
+
22
def _is_date_col(name: str) -> bool:
    """
    Return True when the column label contains one of ``_DATE_HINTS``.

    The match is a case-insensitive substring test. The label is coerced
    with ``str()`` first: pandas column labels are not required to be
    strings (integer labels are common), and calling ``.lower()`` on such
    a label would raise ``AttributeError``.
    """
    low = str(name).lower()
    return any(h in low for h in _DATE_HINTS)
25
+
26
+
27
def analyze(df: pd.DataFrame) -> list[Issue]:
    """
    Check string-like columns of *df* for malformed e-mails and dates.

    Only columns accepted by ``is_str_col`` with at least one non-null
    value are inspected. Values are compared as strings.

    - Columns whose name hints at e-mail: values not matching ``_EMAIL_RE``
      produce an ``invalid_email`` issue with up to three examples.
    - Columns whose name hints at dates: values that ``dateutil`` cannot
      parse produce an ``invalid_date`` issue; parseable values are
      bucketed by leading layout, and more than one bucket produces an
      ``inconsistent_date_format`` issue.
    """
    # (leading pattern, label) pairs tried in order; first match wins.
    layout_table = (
        (r"^\d{4}-\d{2}-\d{2}", "YYYY-MM-DD"),
        (r"^\d{1,2}/\d{1,2}/\d{2,4}", "MM/DD/YYYY"),
        (r"^\d{1,2}-\d{1,2}-\d{2,4}", "DD-MM-YYYY"),
    )
    found: list[Issue] = []

    for col in df.columns:
        column = df[col]
        if not is_str_col(column):
            continue

        values = column.dropna().astype(str)
        if len(values) == 0:
            continue

        if _is_email_col(col):
            bad_emails = values[~values.str.match(_EMAIL_RE)]
            if len(bad_emails) > 0:
                found.append(
                    Issue(
                        type="invalid_email",
                        severity="medium",
                        count=len(bad_emails),
                        column=col,
                        details={"examples": bad_emails.head(3).tolist()},
                    )
                )

        if not _is_date_col(col):
            continue

        layouts: set[str] = set()
        failures = 0
        for text in values:
            try:
                parse_date(text, fuzzy=False)
            except (ParserError, ValueError, OverflowError):
                failures += 1
                continue
            # Only successfully parsed values contribute a layout label.
            for pattern, label in layout_table:
                if re.match(pattern, text):
                    layouts.add(label)
                    break
            else:
                layouts.add("other")

        if failures > 0:
            found.append(
                Issue(
                    type="invalid_date",
                    severity="medium",
                    count=failures,
                    column=col,
                    details={"unparseable_count": failures},
                )
            )

        if len(layouts) > 1:
            found.append(
                Issue(
                    type="inconsistent_date_format",
                    severity="low",
                    count=len(values),
                    column=col,
                    details={"formats_detected": sorted(layouts)},
                )
            )

    return found
@@ -0,0 +1,52 @@
1
+ from __future__ import annotations
2
+
3
+ import pandas as pd
4
+
5
+ from datadoctor.models import Issue
6
+
7
+ _HIGH_NULL_THRESHOLD = 0.30
8
+ _MEDIUM_NULL_THRESHOLD = 0.05
9
+
10
+
11
def analyze(df: pd.DataFrame) -> list[Issue]:
    """
    Report missing values in *df*.

    Returns an empty list for an empty frame. Columns that are entirely
    null are grouped into one high-severity ``empty_columns`` issue. Every
    other column with at least one null gets its own ``null_values``
    issue, graded by null share: above ``_HIGH_NULL_THRESHOLD`` is "high",
    above ``_MEDIUM_NULL_THRESHOLD`` is "medium", anything else is "low".
    """
    if df.empty:
        return []

    found: list[Issue] = []

    all_null = [name for name in df.columns if df[name].isna().all()]
    if all_null:
        found.append(
            Issue(
                type="empty_columns",
                severity="high",
                count=len(all_null),
                details={"columns": all_null},
            )
        )

    for name in df.columns:
        # Fully-null columns were already reported as a group above.
        if name in all_null:
            continue

        missing = int(df[name].isna().sum())
        if missing == 0:
            continue

        share = missing / len(df)
        level = "low"
        if share > _HIGH_NULL_THRESHOLD:
            level = "high"
        elif share > _MEDIUM_NULL_THRESHOLD:
            level = "medium"

        found.append(
            Issue(
                type="null_values",
                severity=level,
                count=missing,
                column=name,
                details={"percentage": round(share * 100, 2)},
            )
        )

    return found
@@ -0,0 +1,73 @@
1
+ from __future__ import annotations
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+
6
+ from datadoctor.models import Issue
7
+
8
+ _MIN_ROWS = 4
9
+ _HIGH_OUTLIER_THRESHOLD = 0.10
10
+
11
+
12
def analyze(df: pd.DataFrame, method: str = "iqr") -> list[Issue]:
    """
    Flag outliers in each numeric column of *df*.

    Columns with fewer than ``_MIN_ROWS`` non-null values are skipped.
    ``method="zscore"`` uses ``_zscore_mask``; any other value uses
    ``_iqr_mask``. A column with at least one flagged value yields an
    ``outlier`` issue whose severity is "high" when the flagged share
    exceeds ``_HIGH_OUTLIER_THRESHOLD`` and "medium" otherwise. The details
    include the method, percentage, min, max, mean and std of the non-null
    values.
    """
    found: list[Issue] = []

    for name in df.select_dtypes(include=[np.number]).columns:
        values = df[name].dropna()
        if len(values) < _MIN_ROWS:
            continue

        flags = _zscore_mask(values) if method == "zscore" else _iqr_mask(values)
        n_flagged = int(flags.sum())
        if not n_flagged:
            continue

        share = n_flagged / len(values)
        level = "high" if share > _HIGH_OUTLIER_THRESHOLD else "medium"
        summary = {
            "method": method,
            "percentage": round(share * 100, 2),
            "min": float(values.min()),
            "max": float(values.max()),
            "mean": round(float(values.mean()), 4),
            "std": round(float(values.std()), 4),
        }
        found.append(
            Issue(
                type="outlier",
                severity=level,
                count=n_flagged,
                column=name,
                details=summary,
            )
        )

    return found
51
+
52
+
53
def get_outlier_mask(series: pd.Series, method: str = "iqr") -> pd.Series:
    """Return the outlier mask for *series*: z-score based when *method* is "zscore", IQR based otherwise."""
    detector = _zscore_mask if method == "zscore" else _iqr_mask
    return detector(series)
57
+
58
+
59
+ def _iqr_mask(series: pd.Series) -> pd.Series:
60
+ q1 = series.quantile(0.25)
61
+ q3 = series.quantile(0.75)
62
+ iqr = q3 - q1
63
+ lower = q1 - 1.5 * iqr
64
+ upper = q3 + 1.5 * iqr
65
+ return (series < lower) | (series > upper)
66
+
67
+
68
+ def _zscore_mask(series: pd.Series, threshold: float = 3.0) -> pd.Series:
69
+ std = series.std()
70
+ if std == 0:
71
+ return pd.Series(False, index=series.index)
72
+ z = (series - series.mean()) / std
73
+ return z.abs() > threshold
@@ -0,0 +1,58 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+ import pandas as pd
6
+
7
+ from datadoctor._compat import is_str_col
8
+
9
+ _PATTERNS: dict[str, re.Pattern[str]] = {
10
+ "email": re.compile(
11
+ r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b"
12
+ ),
13
+ "phone": re.compile(
14
+ r"\b(?:\+?91[-.\s]?)?[6-9]\d{9}\b"
15
+ r"|\b\+?1?[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
16
+ ),
17
+ "aadhaar": re.compile(r"\b[2-9]\d{3}[\s-]?\d{4}[\s-]?\d{4}\b"),
18
+ "pan": re.compile(r"\b[A-Z]{5}[0-9]{4}[A-Z]\b"),
19
+ "ssn": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
20
+ "credit_card": re.compile(
21
+ r"\b(?:4\d{3}|5[1-5]\d{2}|6011|3[47]\d{2})[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b"
22
+ ),
23
+ }
24
+
25
+ _COLUMN_HINTS: dict[str, list[str]] = {
26
+ "email": ["email", "mail", "e-mail", "e_mail"],
27
+ "phone": ["phone", "mobile", "cell", "tel", "contact", "phoneno", "phone_no"],
28
+ "aadhaar": ["aadhaar", "aadhar", "uid", "uidai"],
29
+ "pan": ["pan", "pan_number", "panno", "pan_no"],
30
+ "ssn": ["ssn", "social_security", "social_security_number"],
31
+ "credit_card": ["card", "cc_number", "credit_card", "creditcard", "cardno"],
32
+ }
33
+
34
+ _CONTENT_SCAN_SAMPLE = 200
35
+
36
+
37
def analyze(df: pd.DataFrame) -> dict[str, list[str]]:
    """
    Detect likely PII columns in *df* by column name and by content.

    For each column:

    - every PII type whose ``_COLUMN_HINTS`` entry has a hint contained in
      the lowercased column label is recorded;
    - if the column is string-like, the first ``_CONTENT_SCAN_SAMPLE``
      non-null values (as strings) are searched with each remaining
      ``_PATTERNS`` regex, and a type is recorded on the first hit.

    Returns:
        Mapping of column label to the sorted list of PII types found.
        Columns with no findings are omitted.
    """
    results: dict[str, list[str]] = {}

    for col in df.columns:
        pii_found: set[str] = set()
        # Column labels need not be strings (e.g. integer labels); coerce
        # so .lower() cannot raise AttributeError.
        col_lower = str(col).lower()

        for pii_type, hints in _COLUMN_HINTS.items():
            if any(h in col_lower for h in hints):
                pii_found.add(pii_type)

        if is_str_col(df[col]):
            sample = df[col].dropna().astype(str).head(_CONTENT_SCAN_SAMPLE)
            for pii_type, pattern in _PATTERNS.items():
                # Types already flagged by name need no content scan.
                if pii_type in pii_found:
                    continue
                if any(pattern.search(val) for val in sample):
                    pii_found.add(pii_type)

        if pii_found:
            results[col] = sorted(pii_found)

    return results
@@ -0,0 +1,51 @@
1
+ from __future__ import annotations
2
+
3
+ import pandas as pd
4
+
5
+ from datadoctor._compat import is_str_col
6
+ from datadoctor.models import Issue
7
+
8
+
9
def analyze(df: pd.DataFrame) -> list[Issue]:
    """
    Look for type problems in the string-like columns of *df*.

    Only columns accepted by ``is_str_col`` with at least one non-null
    value are inspected.

    - ``mixed_types``: the non-null values have more than one Python type
      and the set of types is not limited to ``str``/``bytes``.
    - ``numeric_as_string``: checked only for non-mixed columns; every
      non-null value, with commas removed, converts via ``pd.to_numeric``.
    """
    found: list[Issue] = []

    for col in df.columns:
        if not is_str_col(df[col]):
            continue

        present = df[col].dropna()
        if len(present) == 0:
            continue

        # Distinct Python type names, in first-seen order.
        type_names = list(dict.fromkeys(type(item).__name__ for item in present))
        only_text = set(type_names) <= {"str", "bytes"}

        if len(type_names) > 1 and not only_text:
            found.append(
                Issue(
                    type="mixed_types",
                    severity="medium",
                    count=len(present),
                    column=col,
                    details={"types_found": type_names},
                )
            )
            # Mixed columns are not checked for numeric-as-string.
            continue

        without_commas = present.astype(str).str.replace(",", "", regex=False)
        converts = pd.to_numeric(without_commas, errors="coerce").notna()
        if converts.all() and len(present) > 0:
            found.append(
                Issue(
                    type="numeric_as_string",
                    severity="low",
                    count=len(present),
                    column=col,
                    details={"suggestion": f"Column '{col}' contains numeric values stored as strings."},
                )
            )

    return found
@@ -0,0 +1,52 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Any, Union
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+ from datadoctor.analyzers.outlier import analyze as _outlier_analyze
10
+ from datadoctor.analyzers.outlier import get_outlier_mask
11
+ from datadoctor.loader import load
12
+
13
+
14
def find_anomalies(
    source: Union[str, Path, pd.DataFrame],
    method: str = "iqr",
) -> dict[str, Any]:
    """
    Summarise numeric outliers found in *source*.

    Args:
        source: Passed to ``load`` to obtain the DataFrame to inspect.
        method: Outlier detection method, ``"iqr"`` or ``"zscore"``.

    Returns:
        A dict with:

        - ``total_anomalous_records``: number of rows flagged in at least
          one numeric column;
        - ``method``: the method used;
        - ``by_column``: one entry per outlier issue (column, count,
          percentage, severity, min, max, mean);
        - ``anomalous_indices``: index labels of the flagged rows.

    Raises:
        ValueError: If *method* is neither ``"iqr"`` nor ``"zscore"``.
    """
    if method not in ("iqr", "zscore"):
        raise ValueError(f"Unknown method '{method}'. Choose 'iqr' or 'zscore'.")

    df = load(source)
    issues = _outlier_analyze(df, method=method)

    # Track flagged rows by position rather than by label. Aligning
    # per-column masks with reindex() raises ValueError as soon as the
    # frame's index contains duplicate labels.
    flagged = np.zeros(len(df), dtype=bool)
    for col in df.select_dtypes(include=[np.number]).columns:
        present = df[col].notna().to_numpy()
        # Same minimum of 4 non-null values as the original per-column check.
        if int(present.sum()) < 4:
            continue
        col_mask = get_outlier_mask(df[col].dropna(), method=method)
        # Null rows stay unflagged; non-null rows take the mask values.
        flagged[present] |= col_mask.to_numpy(dtype=bool)

    return {
        "total_anomalous_records": int(flagged.sum()),
        "method": method,
        "by_column": [
            {
                "column": i.column,
                "count": i.count,
                "percentage": i.details.get("percentage"),
                "severity": i.severity,
                "min": i.details.get("min"),
                "max": i.details.get("max"),
                "mean": i.details.get("mean"),
            }
            for i in issues
        ],
        "anomalous_indices": df.index[flagged].tolist(),
    }
datadoctor/audit.py ADDED
@@ -0,0 +1,19 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Union
4
+ from pathlib import Path
5
+
6
+ import pandas as pd
7
+
8
+ from datadoctor.investigate import investigate
9
+ from datadoctor.models import InvestigationReport
10
+ from datadoctor.reporting.terminal import print_audit_report
11
+
12
+
13
def audit(
    source: Union[str, Path, pd.DataFrame],
    schema: dict | None = None,
) -> InvestigationReport:
    """
    Investigate *source*, print the resulting report, and return it.

    *source* and *schema* are forwarded unchanged to ``investigate``; the
    report it produces is handed to ``print_audit_report`` before being
    returned to the caller.
    """
    findings = investigate(source, schema=schema)
    print_audit_report(findings)
    return findings