datawash-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datawash/__init__.py +9 -0
- datawash/adapters/__init__.py +12 -0
- datawash/adapters/base.py +66 -0
- datawash/adapters/csv_adapter.py +23 -0
- datawash/adapters/excel_adapter.py +36 -0
- datawash/adapters/json_adapter.py +21 -0
- datawash/adapters/parquet_adapter.py +34 -0
- datawash/cli/__init__.py +0 -0
- datawash/cli/formatters.py +110 -0
- datawash/cli/main.py +168 -0
- datawash/codegen/__init__.py +1 -0
- datawash/codegen/generator.py +72 -0
- datawash/core/__init__.py +1 -0
- datawash/core/cache.py +64 -0
- datawash/core/config.py +56 -0
- datawash/core/dtypes.py +24 -0
- datawash/core/exceptions.py +21 -0
- datawash/core/models.py +78 -0
- datawash/core/report.py +430 -0
- datawash/core/sampling.py +84 -0
- datawash/detectors/__init__.py +13 -0
- datawash/detectors/base.py +27 -0
- datawash/detectors/duplicate_detector.py +56 -0
- datawash/detectors/format_detector.py +130 -0
- datawash/detectors/missing_detector.py +78 -0
- datawash/detectors/outlier_detector.py +93 -0
- datawash/detectors/registry.py +64 -0
- datawash/detectors/similarity_detector.py +294 -0
- datawash/detectors/type_detector.py +100 -0
- datawash/profiler/__init__.py +1 -0
- datawash/profiler/engine.py +88 -0
- datawash/profiler/parallel.py +122 -0
- datawash/profiler/patterns.py +80 -0
- datawash/profiler/statistics.py +41 -0
- datawash/suggestors/__init__.py +1 -0
- datawash/suggestors/base.py +15 -0
- datawash/suggestors/engine.py +327 -0
- datawash/suggestors/prioritizer.py +23 -0
- datawash/transformers/__init__.py +13 -0
- datawash/transformers/base.py +27 -0
- datawash/transformers/categories.py +64 -0
- datawash/transformers/columns.py +72 -0
- datawash/transformers/duplicates.py +43 -0
- datawash/transformers/formats.py +95 -0
- datawash/transformers/missing.py +201 -0
- datawash/transformers/registry.py +30 -0
- datawash/transformers/types.py +95 -0
- datawash-0.2.0.dist-info/METADATA +353 -0
- datawash-0.2.0.dist-info/RECORD +53 -0
- datawash-0.2.0.dist-info/WHEEL +5 -0
- datawash-0.2.0.dist-info/entry_points.txt +2 -0
- datawash-0.2.0.dist-info/licenses/LICENSE +21 -0
- datawash-0.2.0.dist-info/top_level.txt +1 -0
datawash/detectors/duplicate_detector.py
@@ -0,0 +1,56 @@
"""Detect duplicate rows."""

from __future__ import annotations

import pandas as pd

from datawash.core.models import DatasetProfile, Finding, Severity
from datawash.detectors.base import BaseDetector
from datawash.detectors.registry import register_detector


class DuplicateDetector(BaseDetector):
    @property
    def name(self) -> str:
        return "duplicates"

    @property
    def description(self) -> str:
        return "Detects exact duplicate rows"

    def detect(self, df: pd.DataFrame, profile: DatasetProfile) -> list[Finding]:
        findings: list[Finding] = []
        dup_count = profile.duplicate_row_count
        if dup_count == 0:
            return findings

        ratio = dup_count / profile.row_count if profile.row_count > 0 else 0
        if ratio > 0.1:
            severity = Severity.HIGH
        elif ratio > 0.01:
            severity = Severity.MEDIUM
        else:
            severity = Severity.LOW

        dup_mask = df.duplicated(keep="first")
        dup_indices = df.index[dup_mask].tolist()[:100]  # Cap for memory

        findings.append(
            Finding(
                detector=self.name,
                issue_type="duplicate_rows",
                severity=severity,
                columns=list(df.columns),
                rows=dup_indices,
                details={
                    "duplicate_count": dup_count,
                    "duplicate_ratio": round(ratio, 4),
                },
                message=f"Found {dup_count} exact duplicate rows ({ratio:.1%} of data)",
                confidence=1.0,
            )
        )
        return findings


register_detector(DuplicateDetector())
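
As a quick sanity check of the masking logic above, here is a minimal standalone pandas sketch (the frame is made up, and the DatasetProfile wiring that supplies duplicate_row_count is omitted):

import pandas as pd

df = pd.DataFrame({"id": [1, 2, 2, 3, 3], "name": ["a", "b", "b", "c", "c"]})

# keep="first" marks every occurrence after the first as a duplicate,
# which is how the detector collects the row indices for its Finding.
dup_mask = df.duplicated(keep="first")
print(df.index[dup_mask].tolist())        # [2, 4]
print(f"{dup_mask.sum() / len(df):.1%}")  # 40.0% -- above 10%, so HIGH severity
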
datawash/detectors/format_detector.py
@@ -0,0 +1,130 @@
"""Detect inconsistent formats within columns."""

from __future__ import annotations

import pandas as pd

from datawash.core.models import DatasetProfile, Finding, Severity
from datawash.detectors.base import BaseDetector
from datawash.detectors.registry import register_detector


class FormatDetector(BaseDetector):
    @property
    def name(self) -> str:
        return "formats"

    @property
    def description(self) -> str:
        return "Detects inconsistent formats within columns"

    def detect(self, df: pd.DataFrame, profile: DatasetProfile) -> list[Finding]:
        findings: list[Finding] = []
        for col_name in df.columns:
            if not pd.api.types.is_string_dtype(df[col_name]):
                continue
            clean = df[col_name].dropna().astype(str)
            if clean.empty or len(clean) < 5:
                continue

            # Check for mixed case patterns
            case_finding = self._check_case_inconsistency(col_name, clean)
            if case_finding:
                findings.append(case_finding)

            # Check for mixed date formats
            date_finding = self._check_date_formats(col_name, clean)
            if date_finding:
                findings.append(date_finding)

            # Check for mixed whitespace/padding
            ws_finding = self._check_whitespace(col_name, clean)
            if ws_finding:
                findings.append(ws_finding)

        return findings

    def _check_case_inconsistency(
        self, col_name: str, series: pd.Series
    ) -> Finding | None:
        has_upper = series.str.isupper().any()
        has_lower = series.str.islower().any()
        has_title = series.str.istitle().any()
        case_types = sum([has_upper, has_lower, has_title])
        if case_types >= 2:
            return Finding(
                detector=self.name,
                issue_type="inconsistent_case",
                severity=Severity.LOW,
                columns=[col_name],
                details={
                    "has_upper": bool(has_upper),
                    "has_lower": bool(has_lower),
                    "has_title": bool(has_title),
                },
                message=(
                    f"Column '{col_name}' has inconsistent "
                    "casing (mixed upper/lower/title case)"
                ),
                confidence=0.8,
            )
        return None

    _DATE_PATTERNS: list[tuple[str, str]] = [
        ("iso", r"^\d{4}-\d{2}-\d{2}"),
        ("slash_mdy", r"^\d{1,2}/\d{1,2}/\d{2,4}$"),
        ("dash_dmy", r"^\d{1,2}-[A-Za-z]{3}-\d{2,4}$"),
        ("named_mdy", r"^[A-Za-z]{3,9}\s+\d{1,2},?\s+\d{4}$"),
        ("named_dmy", r"^\d{1,2}\s+[A-Za-z]{3,9}\s+\d{4}$"),
    ]

    def _check_date_formats(self, col_name: str, series: pd.Series) -> Finding | None:
        # Count how many distinct format patterns appear
        format_counts: dict[str, int] = {}
        for fmt_name, pattern in self._DATE_PATTERNS:
            count = int(series.str.match(pattern).sum())
            if count > 0:
                format_counts[fmt_name] = count

        total_matched = sum(format_counts.values())
        if len(format_counts) >= 2 and total_matched / len(series) >= 0.5:
            detail_str = ", ".join(
                f"{count} {name}" for name, count in format_counts.items()
            )
            return Finding(
                detector=self.name,
                issue_type="inconsistent_date_format",
                severity=Severity.MEDIUM,
                columns=[col_name],
                details={
                    "format_counts": format_counts,
                    "total_matched": total_matched,
                },
                message=(
                    f"Column '{col_name}' has mixed date formats ({detail_str})"
                ),
                confidence=0.85,
            )
        return None

    def _check_whitespace(self, col_name: str, series: pd.Series) -> Finding | None:
        leading = (series != series.str.lstrip()).sum()
        trailing = (series != series.str.rstrip()).sum()
        total = int(leading + trailing)
        if total > 0:
            return Finding(
                detector=self.name,
                issue_type="whitespace_padding",
                severity=Severity.LOW,
                columns=[col_name],
                details={
                    "leading_spaces": int(leading),
                    "trailing_spaces": int(trailing),
                },
                message=f"Column '{col_name}' has {total} values with leading/trailing whitespace",
                confidence=1.0,
            )
        return None


register_detector(FormatDetector())
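
To see how the pattern counting behaves, here is a small self-contained sketch using two of the patterns above against a hypothetical mixed-format column:

import pandas as pd

_DATE_PATTERNS = [
    ("iso", r"^\d{4}-\d{2}-\d{2}"),
    ("slash_mdy", r"^\d{1,2}/\d{1,2}/\d{2,4}$"),
]

# Hypothetical column mixing ISO and slash-style dates.
dates = pd.Series(["2024-01-05", "2024-02-10", "3/14/2024", "4/1/24"])

format_counts = {}
for fmt_name, pattern in _DATE_PATTERNS:
    count = int(dates.str.match(pattern).sum())
    if count > 0:
        format_counts[fmt_name] = count

print(format_counts)  # {'iso': 2, 'slash_mdy': 2}
# Two distinct formats covering >= 50% of the values is exactly the
# condition that triggers the inconsistent_date_format finding above.
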
datawash/detectors/missing_detector.py
@@ -0,0 +1,78 @@
"""Detect missing value patterns."""

from __future__ import annotations

import pandas as pd

from datawash.core.models import DatasetProfile, Finding, Severity
from datawash.detectors.base import BaseDetector
from datawash.detectors.registry import register_detector


class MissingDetector(BaseDetector):
    @property
    def name(self) -> str:
        return "missing"

    @property
    def description(self) -> str:
        return "Detects missing values and null patterns"

    def detect(self, df: pd.DataFrame, profile: DatasetProfile) -> list[Finding]:
        findings: list[Finding] = []
        for col_name, col_profile in profile.columns.items():
            if col_profile.null_count == 0:
                continue
            ratio = col_profile.null_ratio
            if ratio > 0.5:
                severity = Severity.HIGH
            elif ratio > 0.1:
                severity = Severity.MEDIUM
            else:
                severity = Severity.LOW

            findings.append(
                Finding(
                    detector=self.name,
                    issue_type="missing_values",
                    severity=severity,
                    columns=[col_name],
                    details={
                        "null_count": col_profile.null_count,
                        "null_ratio": col_profile.null_ratio,
                        "dtype": col_profile.dtype,
                    },
                    message=(
                        f"Column '{col_name}' has "
                        f"{col_profile.null_count} missing "
                        f"values ({col_profile.null_ratio:.1%})"
                    ),
                    confidence=1.0,
                )
            )

        # Detect columns with empty or whitespace-only strings
        for col_name in df.columns:
            if pd.api.types.is_string_dtype(df[col_name]):
                stripped = df[col_name].dropna().astype(str).str.strip()
                empty_count = int((stripped == "").sum())
                if empty_count > 0:
                    findings.append(
                        Finding(
                            detector=self.name,
                            issue_type="empty_strings",
                            severity=Severity.MEDIUM,
                            columns=[col_name],
                            details={"empty_string_count": empty_count},
                            message=(
                                f"Column '{col_name}' has "
                                f"{empty_count} empty strings "
                                "that may represent missing values"
                            ),
                            confidence=0.9,
                        )
                    )
        return findings


register_detector(MissingDetector())
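
A minimal sketch of the empty-string check above, on a hypothetical column. Note that true nulls are dropped first, so only blank and whitespace-only strings are counted:

import pandas as pd

# Hypothetical column where blanks stand in for missing values.
col = pd.Series(["alice", "", "   ", None, "bob"])

stripped = col.dropna().astype(str).str.strip()
empty_count = int((stripped == "").sum())
print(empty_count)  # 2 -- "" and "   " both count; None is already a null
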
datawash/detectors/outlier_detector.py
@@ -0,0 +1,93 @@
"""Detect statistical outliers in numeric columns."""

from __future__ import annotations

import numpy as np
import pandas as pd

from datawash.core.models import DatasetProfile, Finding, Severity
from datawash.detectors.base import BaseDetector
from datawash.detectors.registry import register_detector


class OutlierDetector(BaseDetector):
    def __init__(self, method: str = "iqr", threshold: float = 1.5) -> None:
        self._method = method
        self._threshold = threshold

    @property
    def name(self) -> str:
        return "outliers"

    @property
    def description(self) -> str:
        return "Detects statistical outliers in numeric columns"

    def detect(self, df: pd.DataFrame, profile: DatasetProfile) -> list[Finding]:
        findings: list[Finding] = []
        for col_name in df.select_dtypes(include=[np.number]).columns:
            series = df[col_name].dropna()
            if len(series) < 10:
                continue

            if self._method == "iqr":
                outlier_indices = self._iqr_outliers(series)
            else:
                outlier_indices = self._zscore_outliers(series)

            if len(outlier_indices) == 0:
                continue

            ratio = len(outlier_indices) / len(series)
            severity = (
                Severity.HIGH
                if ratio > 0.05
                else Severity.MEDIUM if ratio > 0.01 else Severity.LOW
            )

            findings.append(
                Finding(
                    detector=self.name,
                    issue_type="outliers",
                    severity=severity,
                    columns=[col_name],
                    rows=outlier_indices[:100],
                    details={
                        "outlier_count": len(outlier_indices),
                        "outlier_ratio": round(ratio, 4),
                        "method": self._method,
                        "threshold": self._threshold,
                    },
                    message=(
                        f"Column '{col_name}' has "
                        f"{len(outlier_indices)} outliers "
                        f"({ratio:.1%}) detected by "
                        f"{self._method.upper()}"
                    ),
                    confidence=0.85,
                )
            )
        return findings

    def _iqr_outliers(self, series: pd.Series) -> list[int]:
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        if iqr == 0:
            return []
        lower = q1 - self._threshold * iqr
        upper = q3 + self._threshold * iqr
        mask = (series < lower) | (series > upper)
        return series.index[mask].tolist()

    def _zscore_outliers(self, series: pd.Series) -> list[int]:
        mean = series.mean()
        std = series.std()
        if std == 0:
            return []
        z_scores = ((series - mean) / std).abs()
        mask = z_scores > self._threshold
        return series.index[mask].tolist()


register_detector(OutlierDetector())
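
A standalone sketch of the IQR rule above with the default threshold of 1.5, on a made-up series of at least 10 non-null values so the detector would actually consider it:

import pandas as pd

series = pd.Series([10, 11, 12, 11, 10, 12, 11, 10, 12, 11, 100])

q1, q3 = series.quantile(0.25), series.quantile(0.75)   # 10.5 and 12.0
iqr = q3 - q1                                           # 1.5
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr           # 8.25 and 14.25
mask = (series < lower) | (series > upper)
print(series.index[mask].tolist())  # [10] -- only the value 100 is flagged
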
datawash/detectors/registry.py
@@ -0,0 +1,64 @@
"""Detector registration and orchestration."""

from __future__ import annotations

import logging
import sys
from typing import Optional

import pandas as pd

from datawash.core.models import DatasetProfile, Finding
from datawash.detectors.base import BaseDetector

logger = logging.getLogger(__name__)

_DETECTORS: dict[str, BaseDetector] = {}


def register_detector(detector: BaseDetector) -> None:
    _DETECTORS[detector.name] = detector


def get_all_detectors() -> dict[str, BaseDetector]:
    return dict(_DETECTORS)


def run_all_detectors(
    df: pd.DataFrame,
    profile: DatasetProfile,
    enabled: Optional[list[str]] = None,
) -> list[Finding]:
    """Run enabled detectors and return all findings."""
    findings: list[Finding] = []
    active_detectors = {
        n: d for n, d in _DETECTORS.items() if enabled is None or n in enabled
    }
    use_progress = len(df) > 10000 and sys.stderr.isatty()

    if use_progress:
        from rich.progress import Progress

        with Progress() as progress:
            task = progress.add_task(
                "Running detectors...", total=len(active_detectors)
            )
            for name, detector in active_detectors.items():
                try:
                    logger.info("Running detector: %s", name)
                    results = detector.detect(df, profile)
                    findings.extend(results)
                    logger.info("Detector %s found %d issues", name, len(results))
                except Exception:
                    logger.exception("Detector %s failed", name)
                progress.update(task, advance=1)
    else:
        for name, detector in active_detectors.items():
            try:
                logger.info("Running detector: %s", name)
                results = detector.detect(df, profile)
                findings.extend(results)
                logger.info("Detector %s found %d issues", name, len(results))
            except Exception:
                logger.exception("Detector %s failed", name)
    return findings
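
Because each detector module ends with a module-level register_detector(...) call, importing the module is enough to make the detector discoverable. A hedged sketch of how an external detector might plug into this registry; the ConstantColumnDetector here is hypothetical, and it assumes BaseDetector expects exactly the name/description properties and detect method seen in the built-in detectors, with Finding fields mirroring their usage:

from __future__ import annotations

import pandas as pd

from datawash.core.models import DatasetProfile, Finding, Severity
from datawash.detectors.base import BaseDetector
from datawash.detectors.registry import register_detector


class ConstantColumnDetector(BaseDetector):
    """Hypothetical third-party detector: flags single-valued columns."""

    @property
    def name(self) -> str:
        return "constant_columns"

    @property
    def description(self) -> str:
        return "Detects columns that contain only one distinct value"

    def detect(self, df: pd.DataFrame, profile: DatasetProfile) -> list[Finding]:
        findings: list[Finding] = []
        for col_name in df.columns:
            if df[col_name].nunique(dropna=True) == 1:
                findings.append(
                    Finding(
                        detector=self.name,
                        issue_type="constant_column",
                        severity=Severity.LOW,
                        columns=[col_name],
                        details={},
                        message=f"Column '{col_name}' has a single distinct value",
                        confidence=1.0,
                    )
                )
        return findings


register_detector(ConstantColumnDetector())
# run_all_detectors(df, profile) will now include it; pass
# enabled=["constant_columns"] to run it in isolation.
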