dataruff 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datadoctor/__init__.py +36 -0
- datadoctor/_compat.py +40 -0
- datadoctor/analyzers/__init__.py +19 -0
- datadoctor/analyzers/drift_analyzer.py +68 -0
- datadoctor/analyzers/duplicate.py +33 -0
- datadoctor/analyzers/format_analyzer.py +91 -0
- datadoctor/analyzers/null_analyzer.py +52 -0
- datadoctor/analyzers/outlier.py +73 -0
- datadoctor/analyzers/pii_analyzer.py +58 -0
- datadoctor/analyzers/type_analyzer.py +51 -0
- datadoctor/anomalies.py +52 -0
- datadoctor/audit.py +19 -0
- datadoctor/cli.py +115 -0
- datadoctor/compare.py +50 -0
- datadoctor/drift.py +20 -0
- datadoctor/fix.py +14 -0
- datadoctor/fixing/__init__.py +3 -0
- datadoctor/fixing/engine.py +83 -0
- datadoctor/investigate.py +40 -0
- datadoctor/loader.py +26 -0
- datadoctor/models.py +89 -0
- datadoctor/pii.py +69 -0
- datadoctor/reporting/__init__.py +10 -0
- datadoctor/reporting/json_reporter.py +29 -0
- datadoctor/reporting/terminal.py +111 -0
- datadoctor/score.py +17 -0
- datadoctor/scoring/__init__.py +3 -0
- datadoctor/scoring/engine.py +128 -0
- datadoctor/validate.py +107 -0
- dataruff-0.1.0.dist-info/METADATA +235 -0
- dataruff-0.1.0.dist-info/RECORD +34 -0
- dataruff-0.1.0.dist-info/WHEEL +5 -0
- dataruff-0.1.0.dist-info/entry_points.txt +2 -0
- dataruff-0.1.0.dist-info/top_level.txt +1 -0
datadoctor/__init__.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""
|
|
2
|
+
dataruff — One-command dataset health diagnostics.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
from datadoctor import audit, fix, score, detect_pii
|
|
6
|
+
|
|
7
|
+
audit(df) # Print quality report
|
|
8
|
+
fix(df) # Return cleaned DataFrame
|
|
9
|
+
score(df) # Return ScoreBreakdown
|
|
10
|
+
detect_pii(df) # Return PIIReport
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from datadoctor.audit import audit
|
|
14
|
+
from datadoctor.investigate import investigate
|
|
15
|
+
from datadoctor.fix import fix
|
|
16
|
+
from datadoctor.validate import validate
|
|
17
|
+
from datadoctor.compare import compare
|
|
18
|
+
from datadoctor.pii import detect_pii, mask_pii
|
|
19
|
+
from datadoctor.drift import detect_drift
|
|
20
|
+
from datadoctor.anomalies import find_anomalies
|
|
21
|
+
from datadoctor.score import score
|
|
22
|
+
|
|
23
|
+
__version__ = "0.1.0"
|
|
24
|
+
|
|
25
|
+
__all__ = [
|
|
26
|
+
"audit",
|
|
27
|
+
"investigate",
|
|
28
|
+
"fix",
|
|
29
|
+
"validate",
|
|
30
|
+
"compare",
|
|
31
|
+
"detect_pii",
|
|
32
|
+
"mask_pii",
|
|
33
|
+
"detect_drift",
|
|
34
|
+
"find_anomalies",
|
|
35
|
+
"score",
|
|
36
|
+
]
|
datadoctor/_compat.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pandas 2.x / 3.x dtype compatibility.
|
|
3
|
+
|
|
4
|
+
In pandas 2.x string columns have dtype ``object`` (dtype.name == 'object').
|
|
5
|
+
In pandas 3.x (infer_string=True by default) they have a ``StringDtype``
|
|
6
|
+
instance whose repr shows as ``dtype: str`` (dtype.name may be 'str',
|
|
7
|
+
'string', or 'string[python]' depending on the sub-release).
|
|
8
|
+
|
|
9
|
+
The safest guard is ``isinstance(dtype, pd.StringDtype)`` — it covers every
|
|
10
|
+
StringDtype variant without relying on the `.name` attribute.
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import pandas as pd
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def is_str_col(series: pd.Series) -> bool:
    """
    Report whether *series* holds string-like data.

    The checks run from most to least specific:

    1. plain ``object`` dtype,
    2. any ``pd.StringDtype`` instance,
    3. a dtype whose ``name`` is ``'str'``, ``'string'`` or ``'large_string'``,
    4. a dtype whose string form contains ``"string"`` (case-insensitive).
    """
    col_dtype = series.dtype

    # Classic object columns are treated as string-like without further checks.
    if col_dtype == object:
        return True

    # Look the class up defensively so a pandas build without StringDtype
    # simply skips this check instead of raising AttributeError.
    string_dtype_cls = getattr(pd, "StringDtype", None)
    if string_dtype_cls is not None and isinstance(col_dtype, string_dtype_cls):
        return True

    # Name-based fallbacks for string dtypes not covered above.
    if getattr(col_dtype, "name", "") in ("str", "string", "large_string"):
        return True
    return "string" in str(col_dtype).lower()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def str_columns(df: pd.DataFrame) -> list[str]:
    """Collect, in column order, the labels of *df* whose data is string-like."""
    string_labels: list[str] = []
    for label in df.columns:
        if is_str_col(df[label]):
            string_labels.append(label)
    return string_labels
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from datadoctor.analyzers import (
|
|
2
|
+
duplicate,
|
|
3
|
+
null_analyzer,
|
|
4
|
+
type_analyzer,
|
|
5
|
+
format_analyzer,
|
|
6
|
+
outlier,
|
|
7
|
+
pii_analyzer,
|
|
8
|
+
drift_analyzer,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"duplicate",
|
|
13
|
+
"null_analyzer",
|
|
14
|
+
"type_analyzer",
|
|
15
|
+
"format_analyzer",
|
|
16
|
+
"outlier",
|
|
17
|
+
"pii_analyzer",
|
|
18
|
+
"drift_analyzer",
|
|
19
|
+
]
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from scipy import stats
|
|
8
|
+
|
|
9
|
+
from datadoctor._compat import is_str_col
|
|
10
|
+
|
|
11
|
+
_KS_SIGNIFICANCE = 0.05
|
|
12
|
+
_CATEGORY_DRIFT_THRESHOLD = 0.05
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def analyze(old_df: pd.DataFrame, new_df: pd.DataFrame) -> dict[str, Any]:
    """
    Compare the columns shared by *old_df* and *new_df* and summarise drift.

    Returns a dict with four entries:

    - ``distribution_drift``: KS statistic per numeric column (only when both
      sides have more than one non-null value).
    - ``category_drift``: per string column, the categories whose relative
      frequency moved by more than ``_CATEGORY_DRIFT_THRESHOLD``.
    - ``missing_value_drift``: absolute change in null share, in percentage
      points, for every shared column.
    - ``drifted_columns``: sorted names of columns flagged by any check.
    """
    ks_by_column: dict[str, float] = {}
    categories_by_column: dict[str, dict[str, Any]] = {}
    null_shift_by_column: dict[str, float] = {}
    flagged: set[str] = set()

    shared = set(old_df.columns).intersection(new_df.columns)

    for col in sorted(shared):
        before, after = old_df[col], new_df[col]

        # --- change in the share of missing values (percentage points) ---
        null_before = before.isna().mean()
        null_after = after.isna().mean()
        shift = round(abs(null_after - null_before) * 100, 2)
        null_shift_by_column[col] = shift
        if shift > 5.0:
            flagged.add(col)

        # --- numeric columns: two-sample Kolmogorov-Smirnov test ---
        if pd.api.types.is_numeric_dtype(before) and pd.api.types.is_numeric_dtype(after):
            before_vals = before.dropna().astype(float)
            after_vals = after.dropna().astype(float)
            if min(len(before_vals), len(after_vals)) > 1:
                ks_stat, p_value = stats.ks_2samp(before_vals, after_vals)
                ks_by_column[col] = round(float(ks_stat), 4)
                if p_value < _KS_SIGNIFICANCE:
                    flagged.add(col)
            continue

        # --- string columns: per-category frequency comparison ---
        if not (is_str_col(before) and is_str_col(after)):
            continue

        freq_before = before.value_counts(normalize=True)
        freq_after = after.value_counts(normalize=True)
        moved: dict[str, Any] = {}
        for cat in set(freq_before.index) | set(freq_after.index):
            # A category absent on one side counts as frequency 0 there.
            share_before = float(freq_before.get(cat, 0.0))
            share_after = float(freq_after.get(cat, 0.0))
            if abs(share_after - share_before) > _CATEGORY_DRIFT_THRESHOLD:
                moved[str(cat)] = {
                    "old_pct": round(share_before * 100, 2),
                    "new_pct": round(share_after * 100, 2),
                }
        if moved:
            categories_by_column[col] = moved
            flagged.add(col)

    return {
        "distribution_drift": ks_by_column,
        "category_drift": categories_by_column,
        "missing_value_drift": null_shift_by_column,
        "drifted_columns": sorted(flagged),
    }
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from datadoctor.models import Issue
|
|
6
|
+
|
|
7
|
+
_HIGH_THRESHOLD = 0.10 # >10% duplicates → high severity
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def analyze(df: pd.DataFrame) -> list[Issue]:
    """
    Report fully duplicated rows of *df* as a single ``duplicate_rows`` issue.

    Returns an empty list when the frame is empty or has no duplicates.
    Severity is ``"high"`` when the duplicate share exceeds
    ``_HIGH_THRESHOLD`` and ``"medium"`` otherwise.
    """
    if df.empty:
        return []

    # With pandas' default keep="first", only the repeats are flagged,
    # not the first occurrence of each row.
    is_repeat = df.duplicated()
    repeat_count = int(is_repeat.sum())
    if not repeat_count:
        return []

    share = repeat_count / len(df)
    if share > _HIGH_THRESHOLD:
        level = "high"
    else:
        level = "medium"

    issue = Issue(
        type="duplicate_rows",
        severity=level,
        count=repeat_count,
        details={
            "percentage": round(share * 100, 2),
            "duplicate_indices": df.index[is_repeat.to_numpy()].tolist(),
        },
    )
    return [issue]
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from dateutil.parser import ParserError
|
|
7
|
+
from dateutil.parser import parse as parse_date
|
|
8
|
+
|
|
9
|
+
from datadoctor._compat import is_str_col
|
|
10
|
+
from datadoctor.models import Issue
|
|
11
|
+
|
|
12
|
+
# Whole-value pattern for a syntactically plausible e-mail address.
_EMAIL_RE = re.compile(r"^[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}$")
# Substrings that mark a column name as e-mail-like / date-like.
_EMAIL_HINTS = ("email", "mail", "e-mail", "e_mail")
_DATE_HINTS = ("date", "time", "dt", "created", "updated", "modified", "timestamp")


def _is_email_col(name: object) -> bool:
    """
    Return True when the column label contains one of ``_EMAIL_HINTS``.

    The label is converted with ``str()`` first because DataFrame column
    labels are not guaranteed to be strings (e.g. integer labels); calling
    ``.lower()`` on such a label directly would raise AttributeError.
    Matching is a case-insensitive substring test.
    """
    low = str(name).lower()
    return any(h in low for h in _EMAIL_HINTS)


def _is_date_col(name: object) -> bool:
    """
    Return True when the column label contains one of ``_DATE_HINTS``.

    Non-string labels are converted with ``str()`` before the
    case-insensitive substring test, so they never raise.
    """
    low = str(name).lower()
    return any(h in low for h in _DATE_HINTS)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def analyze(df: pd.DataFrame) -> list[Issue]:
    """
    Check string columns of *df* for malformed e-mails and dates.

    Only columns accepted by ``is_str_col`` are inspected, and nulls are
    ignored. Which checks run depends on the column name:

    - e-mail-like names: values not matching ``_EMAIL_RE`` produce an
      ``invalid_email`` issue with up to three examples.
    - date-like names: values rejected by the date parser produce an
      ``invalid_date`` issue; more than one detected layout among the
      parseable values produces an ``inconsistent_date_format`` issue.
    """
    found: list[Issue] = []

    for col in df.columns:
        if not is_str_col(df[col]):
            continue

        values = df[col].dropna().astype(str)
        if values.empty:
            continue

        if _is_email_col(col):
            bad_emails = values[~values.str.match(_EMAIL_RE)]
            if not bad_emails.empty:
                found.append(
                    Issue(
                        type="invalid_email",
                        severity="medium",
                        count=len(bad_emails),
                        column=col,
                        details={"examples": bad_emails.head(3).tolist()},
                    )
                )

        if not _is_date_col(col):
            continue

        layouts: set[str] = set()
        unparseable = 0

        for text in values:
            try:
                parse_date(text, fuzzy=False)
            except (ParserError, ValueError, OverflowError):
                unparseable += 1
                continue

            # Only values the parser accepted are classified by layout.
            if re.match(r"^\d{4}-\d{2}-\d{2}", text):
                layout = "YYYY-MM-DD"
            elif re.match(r"^\d{1,2}/\d{1,2}/\d{2,4}", text):
                layout = "MM/DD/YYYY"
            elif re.match(r"^\d{1,2}-\d{1,2}-\d{2,4}", text):
                layout = "DD-MM-YYYY"
            else:
                layout = "other"
            layouts.add(layout)

        if unparseable > 0:
            found.append(
                Issue(
                    type="invalid_date",
                    severity="medium",
                    count=unparseable,
                    column=col,
                    details={"unparseable_count": unparseable},
                )
            )

        if len(layouts) > 1:
            found.append(
                Issue(
                    type="inconsistent_date_format",
                    severity="low",
                    count=len(values),
                    column=col,
                    details={"formats_detected": sorted(layouts)},
                )
            )

    return found
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from datadoctor.models import Issue
|
|
6
|
+
|
|
7
|
+
_HIGH_NULL_THRESHOLD = 0.30
|
|
8
|
+
_MEDIUM_NULL_THRESHOLD = 0.05
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def analyze(df: pd.DataFrame) -> list[Issue]:
    """
    Report missing data in *df*.

    Entirely-null columns are grouped into one high-severity
    ``empty_columns`` issue. Every other column containing nulls gets its own
    ``null_values`` issue, graded by its null share against
    ``_HIGH_NULL_THRESHOLD`` and ``_MEDIUM_NULL_THRESHOLD``.
    An empty frame yields no issues.
    """
    if df.empty:
        return []

    found: list[Issue] = []

    all_null = [name for name in df.columns if df[name].isna().all()]
    if all_null:
        found.append(
            Issue(
                type="empty_columns",
                severity="high",
                count=len(all_null),
                details={"columns": all_null},
            )
        )

    row_total = len(df)
    for name in df.columns:
        # Fully empty columns were already reported above.
        if name in all_null:
            continue

        missing = int(df[name].isna().sum())
        if not missing:
            continue

        share = missing / row_total
        if share > _HIGH_NULL_THRESHOLD:
            level = "high"
        elif share > _MEDIUM_NULL_THRESHOLD:
            level = "medium"
        else:
            level = "low"

        found.append(
            Issue(
                type="null_values",
                severity=level,
                count=missing,
                column=name,
                details={"percentage": round(share * 100, 2)},
            )
        )

    return found
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
from datadoctor.models import Issue
|
|
7
|
+
|
|
8
|
+
_MIN_ROWS = 4
|
|
9
|
+
_HIGH_OUTLIER_THRESHOLD = 0.10
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def analyze(df: pd.DataFrame, method: str = "iqr") -> list[Issue]:
    """
    Flag outliers in every numeric column of *df*.

    Nulls are dropped first, and columns with fewer than ``_MIN_ROWS``
    remaining values are skipped. ``method="zscore"`` selects the z-score
    rule; any other value uses the IQR rule. Each column with at least one
    outlier yields an ``outlier`` issue carrying summary statistics of the
    non-null values.
    """
    found: list[Issue] = []

    for col in df.select_dtypes(include=[np.number]).columns:
        values = df[col].dropna()
        if len(values) < _MIN_ROWS:
            continue

        flagged = _zscore_mask(values) if method == "zscore" else _iqr_mask(values)
        flagged_count = int(flagged.sum())
        if not flagged_count:
            continue

        share = flagged_count / len(values)
        if share > _HIGH_OUTLIER_THRESHOLD:
            level = "high"
        else:
            level = "medium"

        stats_summary = {
            "method": method,
            "percentage": round(share * 100, 2),
            "min": float(values.min()),
            "max": float(values.max()),
            "mean": round(float(values.mean()), 4),
            "std": round(float(values.std()), 4),
        }
        found.append(
            Issue(
                type="outlier",
                severity=level,
                count=flagged_count,
                column=col,
                details=stats_summary,
            )
        )

    return found
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def get_outlier_mask(series: pd.Series, method: str = "iqr") -> pd.Series:
    """
    Return a boolean mask over *series* marking its outliers.

    ``method="zscore"`` uses the z-score rule; every other value falls back
    to the IQR rule.
    """
    detector = _zscore_mask if method == "zscore" else _iqr_mask
    return detector(series)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _iqr_mask(series: pd.Series) -> pd.Series:
    """
    Mark values lying more than 1.5 x IQR outside the quartiles.

    A value is flagged when it is strictly below ``Q1 - 1.5 * IQR`` or
    strictly above ``Q3 + 1.5 * IQR``.
    """
    first_q, third_q = (series.quantile(q) for q in (0.25, 0.75))
    whisker = 1.5 * (third_q - first_q)
    too_low = series < first_q - whisker
    too_high = series > third_q + whisker
    return too_low | too_high
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _zscore_mask(series: pd.Series, threshold: float = 3.0) -> pd.Series:
    """
    Mark values whose absolute z-score exceeds *threshold*.

    When the standard deviation is zero, nothing is flagged and an all-False
    mask aligned to the input index is returned.
    """
    spread = series.std()
    if spread == 0:
        # Every value equals the mean, so the z-score is undefined.
        return pd.Series(False, index=series.index)
    deviations = series - series.mean()
    return (deviations / spread).abs() > threshold
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from datadoctor._compat import is_str_col
|
|
8
|
+
|
|
9
|
+
_PATTERNS: dict[str, re.Pattern[str]] = {
|
|
10
|
+
"email": re.compile(
|
|
11
|
+
r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b"
|
|
12
|
+
),
|
|
13
|
+
"phone": re.compile(
|
|
14
|
+
r"\b(?:\+?91[-.\s]?)?[6-9]\d{9}\b"
|
|
15
|
+
r"|\b\+?1?[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
|
|
16
|
+
),
|
|
17
|
+
"aadhaar": re.compile(r"\b[2-9]\d{3}[\s-]?\d{4}[\s-]?\d{4}\b"),
|
|
18
|
+
"pan": re.compile(r"\b[A-Z]{5}[0-9]{4}[A-Z]\b"),
|
|
19
|
+
"ssn": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
|
|
20
|
+
"credit_card": re.compile(
|
|
21
|
+
r"\b(?:4\d{3}|5[1-5]\d{2}|6011|3[47]\d{2})[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b"
|
|
22
|
+
),
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
_COLUMN_HINTS: dict[str, list[str]] = {
|
|
26
|
+
"email": ["email", "mail", "e-mail", "e_mail"],
|
|
27
|
+
"phone": ["phone", "mobile", "cell", "tel", "contact", "phoneno", "phone_no"],
|
|
28
|
+
"aadhaar": ["aadhaar", "aadhar", "uid", "uidai"],
|
|
29
|
+
"pan": ["pan", "pan_number", "panno", "pan_no"],
|
|
30
|
+
"ssn": ["ssn", "social_security", "social_security_number"],
|
|
31
|
+
"credit_card": ["card", "cc_number", "credit_card", "creditcard", "cardno"],
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
_CONTENT_SCAN_SAMPLE = 200
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def analyze(df: pd.DataFrame) -> dict[str, list[str]]:
    """
    Detect likely PII columns in *df*.

    Two signals are combined per column:

    1. Name hints: the lower-cased column label is tested for the substrings
       listed in ``_COLUMN_HINTS``.
    2. Content scan: for string-like columns, the first
       ``_CONTENT_SCAN_SAMPLE`` non-null values are searched with each
       regex in ``_PATTERNS`` (skipping types already found by name).

    Returns a mapping from the original column label to the sorted list of
    PII types detected; columns without findings are omitted.
    """
    results: dict[str, list[str]] = {}

    for col in df.columns:
        # Column labels are not guaranteed to be strings (e.g. integer
        # labels), so coerce before lower-casing instead of calling
        # ``col.lower()`` directly, which would raise AttributeError.
        col_lower = str(col).lower()

        # NOTE(review): hints are matched as plain substrings, so a short
        # hint such as "pan" also matches a label like "company".
        pii_found: set[str] = {
            pii_type
            for pii_type, hints in _COLUMN_HINTS.items()
            if any(h in col_lower for h in hints)
        }

        if is_str_col(df[col]):
            # Materialise the sample once; it is rescanned for each pattern.
            sample = df[col].dropna().astype(str).head(_CONTENT_SCAN_SAMPLE).tolist()
            for pii_type, pattern in _PATTERNS.items():
                if pii_type in pii_found:
                    continue
                if any(pattern.search(val) for val in sample):
                    pii_found.add(pii_type)

        if pii_found:
            results[col] = sorted(pii_found)

    return results
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from datadoctor._compat import is_str_col
|
|
6
|
+
from datadoctor.models import Issue
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def analyze(df: pd.DataFrame) -> list[Issue]:
    """
    Find type problems in the string-like columns of *df*.

    For each such column (nulls ignored) exactly one of two checks can fire:

    - ``mixed_types``: more than one Python type is present and they are not
      all ``str``/``bytes``.
    - ``numeric_as_string``: every value parses as a number once thousands
      commas are removed.
    """
    found: list[Issue] = []

    for col in df.columns:
        if not is_str_col(df[col]):
            continue

        values = df[col].dropna()
        if values.empty:
            continue

        # Distinct Python type names, in first-seen order.
        type_names = list(dict.fromkeys(type(val).__name__ for val in values))

        only_text = set(type_names).issubset({"str", "bytes"})
        if len(type_names) > 1 and not only_text:
            found.append(
                Issue(
                    type="mixed_types",
                    severity="medium",
                    count=len(values),
                    column=col,
                    details={"types_found": type_names},
                )
            )
            # Mixed columns are not checked for numeric-as-string.
            continue

        without_commas = values.astype(str).str.replace(",", "", regex=False)
        parses_as_number = pd.to_numeric(without_commas, errors="coerce").notna()
        # ``values`` is known to be non-empty here, so ``all()`` is not vacuous.
        if parses_as_number.all():
            found.append(
                Issue(
                    type="numeric_as_string",
                    severity="low",
                    count=len(values),
                    column=col,
                    details={"suggestion": f"Column '{col}' contains numeric values stored as strings."},
                )
            )

    return found
|
datadoctor/anomalies.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, Union
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from datadoctor.analyzers.outlier import analyze as _outlier_analyze
|
|
10
|
+
from datadoctor.analyzers.outlier import get_outlier_mask
|
|
11
|
+
from datadoctor.loader import load
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def find_anomalies(
    source: Union[str, Path, pd.DataFrame],
    method: str = "iqr",
) -> dict[str, Any]:
    """
    Locate rows of a dataset that contain numeric outliers.

    Args:
        source: Anything accepted by ``load`` (a path or a DataFrame).
        method: ``"iqr"`` or ``"zscore"``.

    Returns:
        A dict with the number of anomalous rows, the method used, a
        per-column summary taken from the outlier analyzer's issues, and the
        index labels of the anomalous rows.

    Raises:
        ValueError: If *method* is not ``"iqr"`` or ``"zscore"``.
    """
    if method not in ("iqr", "zscore"):
        raise ValueError(f"Unknown method '{method}'. Choose 'iqr' or 'zscore'.")

    df = load(source)
    issues = _outlier_analyze(df, method=method)

    # Track anomalous rows by position rather than by index label: aligning
    # per-column masks with ``reindex`` raises ValueError when the frame's
    # index contains duplicate labels.
    row_flags = np.zeros(len(df), dtype=bool)
    for col in df.select_dtypes(include=[np.number]).columns:
        present = df[col].notna().to_numpy(dtype=bool)
        # Same minimum sample size (4) the outlier analyzer applies.
        if int(present.sum()) >= 4:
            col_mask = get_outlier_mask(df[col].dropna(), method=method)
            # The mask covers exactly the non-null positions, in order.
            row_flags[present] |= col_mask.to_numpy(dtype=bool)

    return {
        "total_anomalous_records": int(row_flags.sum()),
        "method": method,
        "by_column": [
            {
                "column": i.column,
                "count": i.count,
                "percentage": i.details.get("percentage"),
                "severity": i.severity,
                "min": i.details.get("min"),
                "max": i.details.get("max"),
                "mean": i.details.get("mean"),
            }
            for i in issues
        ],
        "anomalous_indices": df.index[row_flags].tolist(),
    }
|
datadoctor/audit.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Union
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from datadoctor.investigate import investigate
|
|
9
|
+
from datadoctor.models import InvestigationReport
|
|
10
|
+
from datadoctor.reporting.terminal import print_audit_report
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def audit(
    source: Union[str, Path, pd.DataFrame],
    schema: dict | None = None,
) -> InvestigationReport:
    """
    Investigate *source*, print the audit report, and return the findings.

    *schema* is forwarded unchanged to ``investigate``.
    """
    findings = investigate(source, schema=schema)
    print_audit_report(findings)
    return findings
|