datawash-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. datawash/__init__.py +9 -0
  2. datawash/adapters/__init__.py +12 -0
  3. datawash/adapters/base.py +66 -0
  4. datawash/adapters/csv_adapter.py +23 -0
  5. datawash/adapters/excel_adapter.py +36 -0
  6. datawash/adapters/json_adapter.py +21 -0
  7. datawash/adapters/parquet_adapter.py +34 -0
  8. datawash/cli/__init__.py +0 -0
  9. datawash/cli/formatters.py +110 -0
  10. datawash/cli/main.py +168 -0
  11. datawash/codegen/__init__.py +1 -0
  12. datawash/codegen/generator.py +72 -0
  13. datawash/core/__init__.py +1 -0
  14. datawash/core/cache.py +64 -0
  15. datawash/core/config.py +56 -0
  16. datawash/core/dtypes.py +24 -0
  17. datawash/core/exceptions.py +21 -0
  18. datawash/core/models.py +78 -0
  19. datawash/core/report.py +430 -0
  20. datawash/core/sampling.py +84 -0
  21. datawash/detectors/__init__.py +13 -0
  22. datawash/detectors/base.py +27 -0
  23. datawash/detectors/duplicate_detector.py +56 -0
  24. datawash/detectors/format_detector.py +130 -0
  25. datawash/detectors/missing_detector.py +78 -0
  26. datawash/detectors/outlier_detector.py +93 -0
  27. datawash/detectors/registry.py +64 -0
  28. datawash/detectors/similarity_detector.py +294 -0
  29. datawash/detectors/type_detector.py +100 -0
  30. datawash/profiler/__init__.py +1 -0
  31. datawash/profiler/engine.py +88 -0
  32. datawash/profiler/parallel.py +122 -0
  33. datawash/profiler/patterns.py +80 -0
  34. datawash/profiler/statistics.py +41 -0
  35. datawash/suggestors/__init__.py +1 -0
  36. datawash/suggestors/base.py +15 -0
  37. datawash/suggestors/engine.py +327 -0
  38. datawash/suggestors/prioritizer.py +23 -0
  39. datawash/transformers/__init__.py +13 -0
  40. datawash/transformers/base.py +27 -0
  41. datawash/transformers/categories.py +64 -0
  42. datawash/transformers/columns.py +72 -0
  43. datawash/transformers/duplicates.py +43 -0
  44. datawash/transformers/formats.py +95 -0
  45. datawash/transformers/missing.py +201 -0
  46. datawash/transformers/registry.py +30 -0
  47. datawash/transformers/types.py +95 -0
  48. datawash-0.2.0.dist-info/METADATA +353 -0
  49. datawash-0.2.0.dist-info/RECORD +53 -0
  50. datawash-0.2.0.dist-info/WHEEL +5 -0
  51. datawash-0.2.0.dist-info/entry_points.txt +2 -0
  52. datawash-0.2.0.dist-info/licenses/LICENSE +21 -0
  53. datawash-0.2.0.dist-info/top_level.txt +1 -0
datawash/detectors/duplicate_detector.py
@@ -0,0 +1,56 @@
+ """Detect duplicate rows."""
+
+ from __future__ import annotations
+
+ import pandas as pd
+
+ from datawash.core.models import DatasetProfile, Finding, Severity
+ from datawash.detectors.base import BaseDetector
+ from datawash.detectors.registry import register_detector
+
+
+ class DuplicateDetector(BaseDetector):
+     @property
+     def name(self) -> str:
+         return "duplicates"
+
+     @property
+     def description(self) -> str:
+         return "Detects exact duplicate rows"
+
+     def detect(self, df: pd.DataFrame, profile: DatasetProfile) -> list[Finding]:
+         findings: list[Finding] = []
+         dup_count = profile.duplicate_row_count
+         if dup_count == 0:
+             return findings
+
+         ratio = dup_count / profile.row_count if profile.row_count > 0 else 0
+         if ratio > 0.1:
+             severity = Severity.HIGH
+         elif ratio > 0.01:
+             severity = Severity.MEDIUM
+         else:
+             severity = Severity.LOW
+
+         dup_mask = df.duplicated(keep="first")
+         dup_indices = df.index[dup_mask].tolist()[:100]  # Cap for memory
+
+         findings.append(
+             Finding(
+                 detector=self.name,
+                 issue_type="duplicate_rows",
+                 severity=severity,
+                 columns=list(df.columns),
+                 rows=dup_indices,
+                 details={
+                     "duplicate_count": dup_count,
+                     "duplicate_ratio": round(ratio, 4),
+                 },
+                 message=f"Found {dup_count} exact duplicate rows ({ratio:.1%} of data)",
+                 confidence=1.0,
+             )
+         )
+         return findings
+
+
+ register_detector(DuplicateDetector())
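
The detection itself reduces to pandas' built-in duplicate check. A minimal standalone sketch of the same masking logic, using toy data (the DataFrame below is illustrative, not from the package):

import pandas as pd

df = pd.DataFrame({"a": [1, 1, 2], "b": ["x", "x", "y"]})

# keep="first" marks every repeat after the first occurrence
dup_mask = df.duplicated(keep="first")
dup_count = int(dup_mask.sum())            # 1
dup_indices = df.index[dup_mask].tolist()  # [1]
ratio = dup_count / len(df)                # ~0.33
print(dup_count, dup_indices, f"{ratio:.1%}")
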
datawash/detectors/format_detector.py
@@ -0,0 +1,130 @@
+ """Detect inconsistent formats within columns."""
+
+ from __future__ import annotations
+
+ import pandas as pd
+
+ from datawash.core.models import DatasetProfile, Finding, Severity
+ from datawash.detectors.base import BaseDetector
+ from datawash.detectors.registry import register_detector
+
+
+ class FormatDetector(BaseDetector):
+     @property
+     def name(self) -> str:
+         return "formats"
+
+     @property
+     def description(self) -> str:
+         return "Detects inconsistent formats within columns"
+
+     def detect(self, df: pd.DataFrame, profile: DatasetProfile) -> list[Finding]:
+         findings: list[Finding] = []
+         for col_name in df.columns:
+             if not pd.api.types.is_string_dtype(df[col_name]):
+                 continue
+             clean = df[col_name].dropna().astype(str)
+             if clean.empty or len(clean) < 5:
+                 continue
+
+             # Check for mixed case patterns
+             case_finding = self._check_case_inconsistency(col_name, clean)
+             if case_finding:
+                 findings.append(case_finding)
+
+             # Check for mixed date formats
+             date_finding = self._check_date_formats(col_name, clean)
+             if date_finding:
+                 findings.append(date_finding)
+
+             # Check for mixed whitespace/padding
+             ws_finding = self._check_whitespace(col_name, clean)
+             if ws_finding:
+                 findings.append(ws_finding)
+
+         return findings
+
+     def _check_case_inconsistency(
+         self, col_name: str, series: pd.Series
+     ) -> Finding | None:
+         has_upper = series.str.isupper().any()
+         has_lower = series.str.islower().any()
+         has_title = series.str.istitle().any()
+         case_types = sum([has_upper, has_lower, has_title])
+         if case_types >= 2:
+             return Finding(
+                 detector=self.name,
+                 issue_type="inconsistent_case",
+                 severity=Severity.LOW,
+                 columns=[col_name],
+                 details={
+                     "has_upper": bool(has_upper),
+                     "has_lower": bool(has_lower),
+                     "has_title": bool(has_title),
+                 },
+                 message=(
+                     f"Column '{col_name}' has inconsistent "
+                     f"casing (mixed upper/lower/title case)"
+                 ),
+                 confidence=0.8,
+             )
+         return None
+
+     _DATE_PATTERNS: list[tuple[str, str]] = [
+         ("iso", r"^\d{4}-\d{2}-\d{2}"),
+         ("slash_mdy", r"^\d{1,2}/\d{1,2}/\d{2,4}$"),
+         ("dash_dmy", r"^\d{1,2}-[A-Za-z]{3}-\d{2,4}$"),
+         ("named_mdy", r"^[A-Za-z]{3,9}\s+\d{1,2},?\s+\d{4}$"),
+         ("named_dmy", r"^\d{1,2}\s+[A-Za-z]{3,9}\s+\d{4}$"),
+     ]
+
+     def _check_date_formats(self, col_name: str, series: pd.Series) -> Finding | None:
+         # Count how many distinct format patterns appear
+         format_counts: dict[str, int] = {}
+         for fmt_name, pattern in self._DATE_PATTERNS:
+             count = int(series.str.match(pattern).sum())
+             if count > 0:
+                 format_counts[fmt_name] = count
+
+         total_matched = sum(format_counts.values())
+         if len(format_counts) >= 2 and total_matched / len(series) >= 0.5:
+             detail_str = ", ".join(
+                 f"{count} {name}" for name, count in format_counts.items()
+             )
+             return Finding(
+                 detector=self.name,
+                 issue_type="inconsistent_date_format",
+                 severity=Severity.MEDIUM,
+                 columns=[col_name],
+                 details={
+                     "format_counts": format_counts,
+                     "total_matched": total_matched,
+                 },
+                 message=f"Column '{col_name}' has mixed date formats ({detail_str})",
+                 confidence=0.85,
+             )
+         return None
+
+     def _check_whitespace(self, col_name: str, series: pd.Series) -> Finding | None:
+         leading = (series != series.str.lstrip()).sum()
+         trailing = (series != series.str.rstrip()).sum()
+         total = int(leading + trailing)
+         if total > 0:
+             return Finding(
+                 detector=self.name,
+                 issue_type="whitespace_padding",
+                 severity=Severity.LOW,
+                 columns=[col_name],
+                 details={
+                     "leading_spaces": int(leading),
+                     "trailing_spaces": int(trailing),
+                 },
+                 message=f"Column '{col_name}' has {total} values with leading/trailing whitespace",
+                 confidence=1.0,
+             )
+         return None
+
+
+ register_detector(FormatDetector())
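
The mixed-date-format check hinges on Series.str.match, which anchors each regex at the start of the string. A standalone sketch of the counting step, with toy values (illustrative only, not from the package):

import pandas as pd

dates = pd.Series(["2024-01-05", "1/5/2024", "2024-02-10", "3/14/24"])

patterns = {
    "iso": r"^\d{4}-\d{2}-\d{2}",
    "slash_mdy": r"^\d{1,2}/\d{1,2}/\d{2,4}$",
}
# Count how many values match each candidate format
counts = {name: int(dates.str.match(p).sum()) for name, p in patterns.items()}
print(counts)  # {'iso': 2, 'slash_mdy': 2}
# Two formats each covering part of the column -> inconsistent_date_format
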
datawash/detectors/missing_detector.py
@@ -0,0 +1,78 @@
+ """Detect missing value patterns."""
+
+ from __future__ import annotations
+
+ import pandas as pd
+
+ from datawash.core.models import DatasetProfile, Finding, Severity
+ from datawash.detectors.base import BaseDetector
+ from datawash.detectors.registry import register_detector
+
+
+ class MissingDetector(BaseDetector):
+     @property
+     def name(self) -> str:
+         return "missing"
+
+     @property
+     def description(self) -> str:
+         return "Detects missing values and null patterns"
+
+     def detect(self, df: pd.DataFrame, profile: DatasetProfile) -> list[Finding]:
+         findings: list[Finding] = []
+         for col_name, col_profile in profile.columns.items():
+             if col_profile.null_count == 0:
+                 continue
+             ratio = col_profile.null_ratio
+             if ratio > 0.5:
+                 severity = Severity.HIGH
+             elif ratio > 0.1:
+                 severity = Severity.MEDIUM
+             else:
+                 severity = Severity.LOW
+
+             findings.append(
+                 Finding(
+                     detector=self.name,
+                     issue_type="missing_values",
+                     severity=severity,
+                     columns=[col_name],
+                     details={
+                         "null_count": col_profile.null_count,
+                         "null_ratio": col_profile.null_ratio,
+                         "dtype": col_profile.dtype,
+                     },
+                     message=(
+                         f"Column '{col_name}' has "
+                         f"{col_profile.null_count} missing "
+                         f"values ({col_profile.null_ratio:.1%})"
+                     ),
+                     confidence=1.0,
+                 )
+             )
+
+         # Detect columns with empty or whitespace-only strings
+         for col_name in df.columns:
+             if pd.api.types.is_string_dtype(df[col_name]):
+                 stripped = df[col_name].dropna().astype(str).str.strip()
+                 empty_count = int((stripped == "").sum())
+                 if empty_count > 0:
+                     findings.append(
+                         Finding(
+                             detector=self.name,
+                             issue_type="empty_strings",
+                             severity=Severity.MEDIUM,
+                             columns=[col_name],
+                             details={"empty_string_count": empty_count},
+                             message=(
+                                 f"Column '{col_name}' has "
+                                 f"{empty_count} empty strings "
+                                 f"that may represent missing values"
+                             ),
+                             confidence=0.9,
+                         )
+                     )
+         return findings
+
+
+ register_detector(MissingDetector())
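
The empty-string pass is plain pandas string handling: values that strip down to "" are counted as probable missing values even though they are not true nulls. A minimal sketch with a toy column (illustrative only):

import pandas as pd

col = pd.Series(["alice", "  ", "", None, "bob"])

# dropna() removes true nulls; strip() exposes whitespace-only values
stripped = col.dropna().astype(str).str.strip()
empty_count = int((stripped == "").sum())
print(empty_count)  # 2 -- "  " and "" both strip to the empty string
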
datawash/detectors/outlier_detector.py
@@ -0,0 +1,93 @@
+ """Detect statistical outliers in numeric columns."""
+
+ from __future__ import annotations
+
+ import numpy as np
+ import pandas as pd
+
+ from datawash.core.models import DatasetProfile, Finding, Severity
+ from datawash.detectors.base import BaseDetector
+ from datawash.detectors.registry import register_detector
+
+
+ class OutlierDetector(BaseDetector):
+     def __init__(self, method: str = "iqr", threshold: float = 1.5) -> None:
+         self._method = method
+         self._threshold = threshold
+
+     @property
+     def name(self) -> str:
+         return "outliers"
+
+     @property
+     def description(self) -> str:
+         return "Detects statistical outliers in numeric columns"
+
+     def detect(self, df: pd.DataFrame, profile: DatasetProfile) -> list[Finding]:
+         findings: list[Finding] = []
+         for col_name in df.select_dtypes(include=[np.number]).columns:
+             series = df[col_name].dropna()
+             if len(series) < 10:
+                 continue
+
+             if self._method == "iqr":
+                 outlier_indices = self._iqr_outliers(series)
+             else:
+                 outlier_indices = self._zscore_outliers(series)
+
+             if len(outlier_indices) == 0:
+                 continue
+
+             ratio = len(outlier_indices) / len(series)
+             severity = (
+                 Severity.HIGH
+                 if ratio > 0.05
+                 else Severity.MEDIUM if ratio > 0.01 else Severity.LOW
+             )
+
+             findings.append(
+                 Finding(
+                     detector=self.name,
+                     issue_type="outliers",
+                     severity=severity,
+                     columns=[col_name],
+                     rows=outlier_indices[:100],
+                     details={
+                         "outlier_count": len(outlier_indices),
+                         "outlier_ratio": round(ratio, 4),
+                         "method": self._method,
+                         "threshold": self._threshold,
+                     },
+                     message=(
+                         f"Column '{col_name}' has "
+                         f"{len(outlier_indices)} outliers "
+                         f"({ratio:.1%}) detected by "
+                         f"{self._method.upper()}"
+                     ),
+                     confidence=0.85,
+                 )
+             )
+         return findings
+
+     def _iqr_outliers(self, series: pd.Series) -> list[int]:
+         q1 = series.quantile(0.25)
+         q3 = series.quantile(0.75)
+         iqr = q3 - q1
+         if iqr == 0:
+             return []
+         lower = q1 - self._threshold * iqr
+         upper = q3 + self._threshold * iqr
+         mask = (series < lower) | (series > upper)
+         return series.index[mask].tolist()
+
+     def _zscore_outliers(self, series: pd.Series) -> list[int]:
+         mean = series.mean()
+         std = series.std()
+         if std == 0:
+             return []
+         z_scores = ((series - mean) / std).abs()
+         mask = z_scores > self._threshold
+         return series.index[mask].tolist()
+
+
+ register_detector(OutlierDetector())
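
The IQR rule flags values outside [Q1 - k*IQR, Q3 + k*IQR], with k = 1.5 by default here. A standalone sketch of the same fence computation on toy data (illustrative only):

import pandas as pd

s = pd.Series([10, 11, 12, 11, 10, 12, 11, 10, 11, 200])

q1, q3 = s.quantile(0.25), s.quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr   # fences: 8.0 and 14.0
outliers = s.index[(s < lower) | (s > upper)].tolist()
print(outliers)  # [9] -- the 200 falls far outside the fences
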
datawash/detectors/registry.py
@@ -0,0 +1,64 @@
+ """Detector registration and orchestration."""
+
+ from __future__ import annotations
+
+ import logging
+ import sys
+ from typing import Optional
+
+ import pandas as pd
+
+ from datawash.core.models import DatasetProfile, Finding
+ from datawash.detectors.base import BaseDetector
+
+ logger = logging.getLogger(__name__)
+
+ _DETECTORS: dict[str, BaseDetector] = {}
+
+
+ def register_detector(detector: BaseDetector) -> None:
+     _DETECTORS[detector.name] = detector
+
+
+ def get_all_detectors() -> dict[str, BaseDetector]:
+     return dict(_DETECTORS)
+
+
+ def run_all_detectors(
+     df: pd.DataFrame,
+     profile: DatasetProfile,
+     enabled: Optional[list[str]] = None,
+ ) -> list[Finding]:
+     """Run enabled detectors and return all findings."""
+     findings: list[Finding] = []
+     active_detectors = {
+         n: d for n, d in _DETECTORS.items() if enabled is None or n in enabled
+     }
+     use_progress = len(df) > 10000 and sys.stderr.isatty()
+
+     if use_progress:
+         from rich.progress import Progress
+
+         with Progress() as progress:
+             task = progress.add_task(
+                 "Running detectors...", total=len(active_detectors)
+             )
+             for name, detector in active_detectors.items():
+                 try:
+                     logger.info("Running detector: %s", name)
+                     results = detector.detect(df, profile)
+                     findings.extend(results)
+                     logger.info("Detector %s found %d issues", name, len(results))
+                 except Exception:
+                     logger.exception("Detector %s failed", name)
+                 progress.update(task, advance=1)
+     else:
+         for name, detector in active_detectors.items():
+             try:
+                 logger.info("Running detector: %s", name)
+                 results = detector.detect(df, profile)
+                 findings.extend(results)
+                 logger.info("Detector %s found %d issues", name, len(results))
+             except Exception:
+                 logger.exception("Detector %s failed", name)
+     return findings
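
Since _DETECTORS is keyed by detector.name, registration is a single call at import time, which is why each detector module ends with register_detector(...). A hedged sketch of how a third-party detector might plug in; the ConstantColumnDetector below is hypothetical, not part of the package:

import pandas as pd

from datawash.core.models import DatasetProfile, Finding, Severity
from datawash.detectors.base import BaseDetector
from datawash.detectors.registry import register_detector


class ConstantColumnDetector(BaseDetector):  # hypothetical example detector
    @property
    def name(self) -> str:
        return "constant_columns"

    @property
    def description(self) -> str:
        return "Detects columns with a single repeated value"

    def detect(self, df: pd.DataFrame, profile: DatasetProfile) -> list[Finding]:
        return [
            Finding(
                detector=self.name,
                issue_type="constant_column",
                severity=Severity.LOW,
                columns=[col],
                details={},
                message=f"Column '{col}' has a single unique value",
                confidence=1.0,
            )
            for col in df.columns
            if df[col].nunique(dropna=True) <= 1
        ]


register_detector(ConstantColumnDetector())
# run_all_detectors(df, profile, enabled=["constant_columns"]) would now pick it up
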