datawash 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datawash/__init__.py +9 -0
- datawash/adapters/__init__.py +12 -0
- datawash/adapters/base.py +66 -0
- datawash/adapters/csv_adapter.py +23 -0
- datawash/adapters/excel_adapter.py +36 -0
- datawash/adapters/json_adapter.py +21 -0
- datawash/adapters/parquet_adapter.py +34 -0
- datawash/cli/__init__.py +0 -0
- datawash/cli/formatters.py +110 -0
- datawash/cli/main.py +168 -0
- datawash/codegen/__init__.py +1 -0
- datawash/codegen/generator.py +72 -0
- datawash/core/__init__.py +1 -0
- datawash/core/cache.py +64 -0
- datawash/core/config.py +56 -0
- datawash/core/dtypes.py +24 -0
- datawash/core/exceptions.py +21 -0
- datawash/core/models.py +78 -0
- datawash/core/report.py +430 -0
- datawash/core/sampling.py +84 -0
- datawash/detectors/__init__.py +13 -0
- datawash/detectors/base.py +27 -0
- datawash/detectors/duplicate_detector.py +56 -0
- datawash/detectors/format_detector.py +130 -0
- datawash/detectors/missing_detector.py +78 -0
- datawash/detectors/outlier_detector.py +93 -0
- datawash/detectors/registry.py +64 -0
- datawash/detectors/similarity_detector.py +294 -0
- datawash/detectors/type_detector.py +100 -0
- datawash/profiler/__init__.py +1 -0
- datawash/profiler/engine.py +88 -0
- datawash/profiler/parallel.py +122 -0
- datawash/profiler/patterns.py +80 -0
- datawash/profiler/statistics.py +41 -0
- datawash/suggestors/__init__.py +1 -0
- datawash/suggestors/base.py +15 -0
- datawash/suggestors/engine.py +327 -0
- datawash/suggestors/prioritizer.py +23 -0
- datawash/transformers/__init__.py +13 -0
- datawash/transformers/base.py +27 -0
- datawash/transformers/categories.py +64 -0
- datawash/transformers/columns.py +72 -0
- datawash/transformers/duplicates.py +43 -0
- datawash/transformers/formats.py +95 -0
- datawash/transformers/missing.py +201 -0
- datawash/transformers/registry.py +30 -0
- datawash/transformers/types.py +95 -0
- datawash-0.2.0.dist-info/METADATA +353 -0
- datawash-0.2.0.dist-info/RECORD +53 -0
- datawash-0.2.0.dist-info/WHEEL +5 -0
- datawash-0.2.0.dist-info/entry_points.txt +2 -0
- datawash-0.2.0.dist-info/licenses/LICENSE +21 -0
- datawash-0.2.0.dist-info/top_level.txt +1 -0
datawash/core/dtypes.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""DataFrame dtype optimization for performance."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def optimize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with numeric dtypes downcast for speed/memory.

    Integer columns are downcast to the smallest integer dtype that fits,
    float columns to the smallest float dtype. Object/string columns are
    deliberately left untouched so downstream detectors see raw values.
    The input frame is never modified.
    """
    result = df.copy()

    is_int = pd.api.types.is_integer_dtype
    is_float = pd.api.types.is_float_dtype

    for name in result.columns:
        series = result[name]
        if is_int(series.dtype):
            result[name] = pd.to_numeric(series, downcast="integer")
        elif is_float(series.dtype):
            result[name] = pd.to_numeric(series, downcast="float")

    return result
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Custom exceptions for datawash."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class DataWashError(Exception):
    """Root of the datawash exception hierarchy.

    Catch this to handle any error raised by the package.
    """


class AdapterError(DataWashError):
    """Raised when a dataset cannot be loaded or saved."""


class DetectionError(DataWashError):
    """Raised when an issue detector fails while scanning data."""


class TransformationError(DataWashError):
    """Raised when a cleaning transformation cannot be applied."""


class ConfigError(DataWashError):
    """Raised for invalid configuration."""
|
datawash/core/models.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Data models for datawash."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import enum
|
|
6
|
+
from typing import Any, Optional
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Severity(str, enum.Enum):
    """Severity of a finding; also reused as suggestion priority."""

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ColumnProfile(BaseModel):
    """Profile for a single column.

    Holds type information, null/uniqueness counts and ratios, plus
    representative values and detector-facing metadata.
    """

    name: str  # column name as it appears in the DataFrame
    dtype: str  # pandas dtype string (e.g. "int64")
    semantic_type: Optional[str] = None  # inferred meaning, if detected
    null_count: int = 0  # number of missing values
    null_ratio: float = 0.0  # fraction of values that are missing
    unique_count: int = 0  # number of distinct values
    unique_ratio: float = 0.0  # fraction of values that are distinct
    sample_values: list[Any] = Field(default_factory=list)  # a few example values
    statistics: dict[str, Any] = Field(default_factory=dict)  # summary statistics
    patterns: dict[str, Any] = Field(default_factory=dict)  # detected value patterns
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class DatasetProfile(BaseModel):
    """Profile for an entire dataset."""

    row_count: int  # rows in the full dataset (restored even when sampled)
    column_count: int  # number of columns
    memory_bytes: int  # deep memory usage of the DataFrame
    columns: dict[str, ColumnProfile] = Field(default_factory=dict)  # per-column profiles, keyed by column name
    duplicate_row_count: int = 0  # extrapolated to the full dataset when sampled
    sampled: bool = False  # True when statistics were computed on a sample
    sample_size: Optional[int] = None  # number of rows in that sample, if sampled
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class Finding(BaseModel):
    """A detected data quality issue."""

    detector: str  # name of the detector that produced this finding
    issue_type: str  # machine-readable issue category
    severity: Severity  # drives quality-score penalty weight
    columns: list[str] = Field(default_factory=list)  # affected column names
    rows: Optional[list[int]] = None  # affected row indices, when known
    details: dict[str, Any] = Field(default_factory=dict)  # detector-specific extras
    message: str  # human-readable description shown in summaries
    confidence: float = 1.0  # detector confidence (1.0 = certain); scales score penalties
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class Suggestion(BaseModel):
    """A suggested fix for a finding."""

    id: int  # identifier used to select suggestions in Report.apply()
    finding: Finding  # the issue this suggestion addresses
    action: str  # short human-readable action label
    transformer: str  # registry name of the transformer to run
    params: dict[str, Any] = Field(default_factory=dict)  # kwargs passed to the transformer
    priority: Severity  # display/ordering priority
    impact: str  # expected effect, for display
    rationale: str  # why this fix is suggested
    preview: Optional[str] = None  # optional preview text, when available
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class TransformationResult(BaseModel):
    """Result of applying a transformation."""

    transformer: str  # registry name of the transformer that ran
    params: dict[str, Any] = Field(default_factory=dict)  # parameters it ran with
    rows_affected: int  # number of rows the transformation touched
    columns_affected: list[str] = Field(default_factory=list)  # columns it touched
    code: str = ""  # equivalent code snippet, consumed by the code generator
|
datawash/core/report.py
ADDED
|
@@ -0,0 +1,430 @@
|
|
|
1
|
+
"""Report class - main user-facing interface."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Optional
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
from rich.console import Console
|
|
11
|
+
from rich.table import Table
|
|
12
|
+
|
|
13
|
+
from datawash.adapters import load_dataframe
|
|
14
|
+
from datawash.codegen import generate_code as _generate_code
|
|
15
|
+
from datawash.core.cache import ComputationCache
|
|
16
|
+
from datawash.core.config import Config
|
|
17
|
+
from datawash.core.dtypes import optimize_dataframe
|
|
18
|
+
from datawash.core.models import (
|
|
19
|
+
DatasetProfile,
|
|
20
|
+
Finding,
|
|
21
|
+
Severity,
|
|
22
|
+
Suggestion,
|
|
23
|
+
TransformationResult,
|
|
24
|
+
)
|
|
25
|
+
from datawash.core.sampling import SmartSampler
|
|
26
|
+
from datawash.detectors import run_all_detectors
|
|
27
|
+
from datawash.detectors.registry import get_all_detectors
|
|
28
|
+
from datawash.profiler import profile_dataset
|
|
29
|
+
from datawash.profiler.parallel import (
|
|
30
|
+
profile_dataset_parallel,
|
|
31
|
+
run_detectors_parallel,
|
|
32
|
+
)
|
|
33
|
+
from datawash.suggestors import generate_suggestions
|
|
34
|
+
from datawash.suggestors.engine import _sort_by_execution_order
|
|
35
|
+
from datawash.transformers import run_transformer
|
|
36
|
+
|
|
37
|
+
logger = logging.getLogger(__name__)

# Threshold below which parallel overhead isn't worth it
_PARALLEL_THRESHOLD = 5000


class Report:
    """Main interface for data analysis and cleaning.

    Profiles the dataset, runs issue detectors, and generates prioritized
    cleaning suggestions at construction time; the ``apply*`` methods then
    execute selected suggestions and return a cleaned DataFrame.

    Args:
        data: A DataFrame or path to a data file.
        config: Optional configuration. Uses defaults if None.
        use_case: Context for suggestion prioritization.
        sample: Enable smart sampling for large datasets (default True).
        parallel: Enable parallel profiling and detection (default True).
    """

    def __init__(
        self,
        data: pd.DataFrame | str | Path,
        config: Optional[Config | dict[str, Any]] = None,
        use_case: str = "general",
        sample: bool = True,
        parallel: bool = True,
    ) -> None:
        # Resolve config: defaults, a plain dict, or a ready-made Config.
        if config is None:
            self._config = Config(use_case=use_case)
        elif isinstance(config, dict):
            config.setdefault("use_case", use_case)
            self._config = Config.from_dict(config)
        else:
            self._config = config

        # Load data from disk when given a path; remember the source path.
        if isinstance(data, (str, Path)):
            self._df = load_dataframe(data)
            self._source_path = str(data)
        else:
            self._df = data
            self._source_path = None

        # Optimize dtypes for faster analysis (best-effort; never fatal).
        try:
            optimized_df = optimize_dataframe(self._df)
        except Exception:
            logger.debug("Dtype optimization skipped", exc_info=True)
            optimized_df = self._df

        # Smart sampling for large datasets.
        self._sampler: SmartSampler | None = None
        if sample and len(optimized_df) >= 50_000:
            self._sampler = SmartSampler(optimized_df)
            analysis_df = self._sampler.sample_df
        else:
            analysis_df = optimized_df

        # Parallel execution only pays off with enough rows or columns.
        use_parallel = parallel and (
            len(analysis_df) > _PARALLEL_THRESHOLD or len(analysis_df.columns) > 20
        )

        # Profile and detect.
        if use_parallel:
            cache = ComputationCache(analysis_df)
            self._profile = profile_dataset_parallel(analysis_df, cache=cache)
            # Honor the detector enable-list from config, if one is set.
            active = {
                n: d
                for n, d in get_all_detectors().items()
                if self._config.detectors.enabled is None
                or n in self._config.detectors.enabled
            }
            self._findings = run_detectors_parallel(analysis_df, self._profile, active)
        else:
            self._profile = profile_dataset(analysis_df)
            self._findings = run_all_detectors(
                analysis_df,
                self._profile,
                enabled=self._config.detectors.enabled,
            )

        # Restore original row/column counts in profile when sampled.
        if self._sampler and self._sampler.is_sampled:
            self._profile = DatasetProfile(
                row_count=len(self._df),
                column_count=len(self._df.columns),
                memory_bytes=int(self._df.memory_usage(deep=True).sum()),
                columns=self._profile.columns,
                duplicate_row_count=self._sampler.extrapolate_count(
                    self._profile.duplicate_row_count
                ),
                sampled=True,
                sample_size=len(self._sampler.sample_df),
            )

        self._suggestions = generate_suggestions(
            self._findings,
            max_suggestions=self._config.suggestions.max_suggestions,
            use_case=self._config.use_case,
        )
        self._applied: list[TransformationResult] = []

    @property
    def df(self) -> pd.DataFrame:
        """The original DataFrame (read-only copy)."""
        return self._df.copy()

    @property
    def profile(self) -> DatasetProfile:
        """Dataset profile with statistics."""
        return self._profile

    @property
    def issues(self) -> list[Finding]:
        """All detected data quality issues."""
        return list(self._findings)

    @property
    def suggestions(self) -> list[Suggestion]:
        """Prioritized list of suggestions."""
        return list(self._suggestions)

    @staticmethod
    def _score_findings(row_count: int, findings: list[Finding]) -> int:
        """Fold findings into a 0-100 quality score.

        Single source of truth for the penalty weights; shared by
        ``quality_score`` and ``_compute_quality_score`` (previously the
        same logic was duplicated in both).
        """
        if row_count == 0:
            return 100
        # Per-issue cost by severity, scaled by detector confidence.
        penalties = {Severity.HIGH: 10.0, Severity.MEDIUM: 5.0}
        score = 100.0
        for finding in findings:
            score -= penalties.get(finding.severity, 2.0) * finding.confidence
        return max(0, min(100, int(score)))

    @property
    def quality_score(self) -> int:
        """Data quality score from 0 to 100."""
        return self._score_findings(self._profile.row_count, self._findings)

    def suggest(self, use_case: Optional[str] = None) -> list[Suggestion]:
        """Get filtered suggestions, optionally for a specific use case."""
        # For now, return all suggestions. Use-case filtering is Phase 2.
        return list(self._suggestions)

    def _compute_quality_score(self, df: pd.DataFrame) -> int:
        """Compute quality score for an arbitrary DataFrame."""
        from datawash.detectors import run_all_detectors as _detect
        from datawash.profiler import profile_dataset as _profile

        prof = _profile(df)
        findings = _detect(df, prof, enabled=self._config.detectors.enabled)
        return self._score_findings(prof.row_count, findings)

    def apply(self, suggestion_ids: list[int]) -> pd.DataFrame:
        """Apply selected suggestions by ID, return cleaned DataFrame."""
        score_before = self.quality_score
        result_df = self._df.copy()
        id_map = {s.id: s for s in self._suggestions}
        self._applied = []

        # Collect the requested suggestions, warning about unknown IDs.
        suggestions_to_apply = []
        for sid in suggestion_ids:
            suggestion = id_map.get(sid)
            if suggestion is None:
                logger.warning("Suggestion ID %d not found, skipping", sid)
                continue
            suggestions_to_apply.append(suggestion)

        # Sort by transformation execution order to prevent conflicts
        suggestions_to_apply = _sort_by_execution_order(suggestions_to_apply)

        for suggestion in suggestions_to_apply:
            result_df, tx_result = run_transformer(
                suggestion.transformer, result_df, **suggestion.params
            )
            self._applied.append(tx_result)
            logger.info(
                "Applied suggestion %d (%s): %d rows affected",
                suggestion.id,
                suggestion.action,
                tx_result.rows_affected,
            )

        score_after = self._compute_quality_score(result_df)
        diff = score_after - score_before
        sign = "+" if diff >= 0 else ""
        logger.info(
            "Quality score: %d → %d (%s%d)", score_before, score_after, sign, diff
        )
        self._last_score_before = score_before
        self._last_score_after = score_after
        return result_df

    def apply_all(self) -> pd.DataFrame:
        """Apply all suggestions and return cleaned DataFrame."""
        return self.apply([s.id for s in self._suggestions])

    def apply_interactive(
        self, input_fn: Any = None, console: Optional[Console] = None
    ) -> pd.DataFrame:
        """Interactively apply suggestions with user prompts.

        Args:
            input_fn: Callable for getting user input (default: builtin input).
                Useful for testing with monkeypatch.
            console: Optional Rich Console for output.
        """
        if input_fn is None:
            input_fn = input
        if console is None:
            console = Console()

        score_before = self.quality_score
        result_df = self._df.copy()
        self._applied = []
        apply_all = False

        # Sort suggestions by execution order to prevent conflicts
        sorted_suggestions = _sort_by_execution_order(list(self._suggestions))

        for suggestion in sorted_suggestions:
            if not apply_all:
                table = Table(title=f"Suggestion #{suggestion.id}", show_lines=True)
                table.add_column("Field", style="bold")
                table.add_column("Value")
                table.add_row("Action", suggestion.action)
                table.add_row("Priority", suggestion.priority.value)
                table.add_row("Impact", suggestion.impact)
                table.add_row("Rationale", suggestion.rationale)
                cols_list = suggestion.params.get("columns", [])
                if cols_list:
                    table.add_row("Columns", ", ".join(cols_list))
                else:
                    table.add_row("Columns", "all")
                console.print(table)

                # Show preview of affected rows
                cols = suggestion.params.get("columns", [])
                valid_cols = [c for c in cols if c in result_df.columns]
                if valid_cols:
                    affected_preview = result_df[valid_cols].head(5)
                    console.print("[dim]Preview (first 5 rows):[/dim]")
                    console.print(affected_preview.to_string())

                raw_choice = input_fn(
                    "\n[a]pply / [s]kip / apply [A]ll / [q]uit: "
                ).strip()
                choice = raw_choice.lower()
                if choice == "q":
                    break
                elif choice == "s":
                    continue
                elif choice == "all" or raw_choice == "A":
                    apply_all = True
                elif choice not in ("a", "apply", ""):
                    # Unrecognized input is treated as skip.
                    continue

            result_df, tx_result = run_transformer(
                suggestion.transformer, result_df, **suggestion.params
            )
            self._applied.append(tx_result)
            console.print(
                f"[green]✓ Applied:[/green] {suggestion.action} "
                f"({tx_result.rows_affected} rows affected)"
            )

        score_after = self._compute_quality_score(result_df)
        diff = score_after - score_before
        sign = "+" if diff >= 0 else ""
        msg = f"\n[bold]Quality score: {score_before} → {score_after} ({sign}{diff})[/bold]"
        console.print(msg)
        self._last_score_before = score_before
        self._last_score_after = score_after
        return result_df

    def generate_code(self, style: str = "function") -> str:
        """Generate Python code for applied transformations.

        Must call apply() or apply_all() first.
        """
        if not self._applied:
            # Auto-apply all if nothing applied yet
            self.apply_all()
        return _generate_code(
            self._applied,
            style=style,
            include_comments=self._config.codegen.include_comments,
        )

    def summary(self) -> str:
        """Human-readable analysis summary."""
        lines = [
            f"Dataset: {self._profile.row_count} rows "
            f"x {self._profile.column_count} columns",
            f"Memory: {self._profile.memory_bytes / 1024 / 1024:.1f} MB",
            f"Duplicate rows: {self._profile.duplicate_row_count}",
            f"Data Quality Score: {self.quality_score}/100",
            f"Issues found: {len(self._findings)}",
            f"Suggestions: {len(self._suggestions)}",
            "",
        ]

        # Group issues by severity
        for severity in ["high", "medium", "low"]:
            issues = [f for f in self._findings if f.severity.value == severity]
            if issues:
                lines.append(f" [{severity.upper()}] {len(issues)} issue(s)")
                for issue in issues[:5]:
                    lines.append(f" - {issue.message}")
                if len(issues) > 5:
                    lines.append(f" ... and {len(issues) - 5} more")

        return "\n".join(lines)

    def __repr__(self) -> str:
        return (
            f"Report(rows={self._profile.row_count}, "
            f"cols={self._profile.column_count}, "
            f"issues={len(self._findings)}, "
            f"suggestions={len(self._suggestions)})"
        )

    def _repr_html_(self) -> str:
        """Rich display for Jupyter notebooks."""
        html = [
            "<div style='font-family: monospace;'>",
            "<h3>DataWash Report</h3>",
            f"<p><b>Dataset:</b> {self._profile.row_count} rows "
            f"x {self._profile.column_count} columns</p>",
            f"<p><b>Issues:</b> {len(self._findings)} | "
            f"<b>Suggestions:</b> {len(self._suggestions)}</p>",
        ]
        if self._suggestions:
            html.append("<table border='1' style='border-collapse: collapse;'>")
            html.append(
                "<tr><th>#</th><th>Priority</th><th>Action</th><th>Impact</th></tr>"
            )
            for s in self._suggestions[:10]:
                color = {"high": "red", "medium": "orange", "low": "green"}.get(
                    s.priority.value, "gray"
                )
                html.append(
                    f"<tr><td>{s.id}</td>"
                    f"<td style='color:{color};'>{s.priority.value}</td>"
                    f"<td>{s.action}</td>"
                    f"<td>{s.impact}</td></tr>"
                )
            if len(self._suggestions) > 10:
                html.append(
                    f"<tr><td colspan='4'>... and {len(self._suggestions) - 10} more</td></tr>"
                )
            html.append("</table>")
        html.append("</div>")
        return "\n".join(html)
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def analyze(
    data: pd.DataFrame | str | Path,
    config: Optional[Config | dict[str, Any]] = None,
    use_case: str = "general",
    sample: bool = True,
    parallel: bool = True,
) -> Report:
    """Analyze a dataset and return a :class:`Report`.

    This is the main entry point for datawash.

    Args:
        data: DataFrame or path to a data file.
        config: Optional configuration dict or Config object.
        use_case: One of "general", "ml", "analytics", "export".
        sample: Enable smart sampling for large datasets.
        parallel: Enable parallel profiling and detection.

    Returns:
        A Report object with issues, suggestions, and cleaning methods.
    """
    # Thin convenience wrapper: all the work happens in Report.__init__.
    options = {
        "config": config,
        "use_case": use_case,
        "sample": sample,
        "parallel": parallel,
    }
    return Report(data, **options)
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Smart sampling for large datasets."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
SAMPLE_THRESHOLD = 50_000
|
|
8
|
+
SAMPLE_SIZE = 10_000
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SmartSampler:
|
|
12
|
+
"""Intelligent sampling for large datasets.
|
|
13
|
+
|
|
14
|
+
For datasets above ``SAMPLE_THRESHOLD`` rows, creates a representative
|
|
15
|
+
sample that preserves data distribution and edge cases (nulls, outliers).
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
def __init__(self, df: pd.DataFrame) -> None:
|
|
19
|
+
self.original_df = df
|
|
20
|
+
self.original_size = len(df)
|
|
21
|
+
self.is_sampled = len(df) > SAMPLE_THRESHOLD
|
|
22
|
+
self.sample_df = self._create_sample() if self.is_sampled else df
|
|
23
|
+
self.scale_factor = (
|
|
24
|
+
self.original_size / len(self.sample_df) if self.is_sampled else 1.0
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
def _create_sample(self) -> pd.DataFrame:
|
|
28
|
+
"""Create representative sample preserving data distribution."""
|
|
29
|
+
df = self.original_df
|
|
30
|
+
|
|
31
|
+
# Always include rows with nulls (up to 10% of sample)
|
|
32
|
+
null_rows = df[df.isna().any(axis=1)]
|
|
33
|
+
max_null_rows = SAMPLE_SIZE // 10
|
|
34
|
+
if len(null_rows) > max_null_rows:
|
|
35
|
+
null_sample = null_rows.sample(max_null_rows, random_state=42)
|
|
36
|
+
elif len(null_rows) > 0:
|
|
37
|
+
null_sample = null_rows
|
|
38
|
+
else:
|
|
39
|
+
null_sample = df.iloc[:0]
|
|
40
|
+
|
|
41
|
+
# Sample remaining rows
|
|
42
|
+
remaining_size = SAMPLE_SIZE - len(null_sample)
|
|
43
|
+
non_null_rows = df.drop(null_sample.index, errors="ignore")
|
|
44
|
+
|
|
45
|
+
# Try stratified sampling on first low-cardinality column
|
|
46
|
+
strat_col = self._find_stratification_column(non_null_rows)
|
|
47
|
+
if strat_col:
|
|
48
|
+
main_sample = self._stratified_sample(
|
|
49
|
+
non_null_rows, strat_col, remaining_size
|
|
50
|
+
)
|
|
51
|
+
else:
|
|
52
|
+
actual_size = min(remaining_size, len(non_null_rows))
|
|
53
|
+
main_sample = non_null_rows.sample(actual_size, random_state=42)
|
|
54
|
+
|
|
55
|
+
return pd.concat([null_sample, main_sample]).reset_index(drop=True)
|
|
56
|
+
|
|
57
|
+
def _find_stratification_column(self, df: pd.DataFrame) -> str | None:
|
|
58
|
+
"""Find a good column for stratified sampling."""
|
|
59
|
+
for col in df.columns:
|
|
60
|
+
if df[col].isna().all():
|
|
61
|
+
continue
|
|
62
|
+
nunique = df[col].nunique()
|
|
63
|
+
if 2 <= nunique <= 20:
|
|
64
|
+
return col
|
|
65
|
+
return None
|
|
66
|
+
|
|
67
|
+
def _stratified_sample(
|
|
68
|
+
self, df: pd.DataFrame, strat_col: str, n: int
|
|
69
|
+
) -> pd.DataFrame:
|
|
70
|
+
"""Stratified sampling proportional to group sizes."""
|
|
71
|
+
groups = df.groupby(strat_col, group_keys=False, observed=True)
|
|
72
|
+
total = len(df)
|
|
73
|
+
parts = []
|
|
74
|
+
for _name, group in groups:
|
|
75
|
+
group_n = max(1, int(n * len(group) / total))
|
|
76
|
+
group_n = min(group_n, len(group))
|
|
77
|
+
parts.append(group.sample(group_n, random_state=42))
|
|
78
|
+
return pd.concat(parts)
|
|
79
|
+
|
|
80
|
+
def extrapolate_count(self, sample_count: int) -> int:
|
|
81
|
+
"""Scale sample count to full dataset estimate."""
|
|
82
|
+
if not self.is_sampled:
|
|
83
|
+
return sample_count
|
|
84
|
+
return int(sample_count * self.scale_factor)
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Detectors for data quality issues."""
|
|
2
|
+
|
|
3
|
+
# Import detectors to trigger registration
|
|
4
|
+
from . import ( # noqa: F401
|
|
5
|
+
duplicate_detector,
|
|
6
|
+
format_detector,
|
|
7
|
+
missing_detector,
|
|
8
|
+
outlier_detector,
|
|
9
|
+
similarity_detector,
|
|
10
|
+
type_detector,
|
|
11
|
+
)
|
|
12
|
+
from .registry import get_all_detectors as get_all_detectors
|
|
13
|
+
from .registry import run_all_detectors as run_all_detectors
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Base detector interface."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from datawash.core.models import DatasetProfile, Finding
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class BaseDetector(ABC):
    """Abstract base class for all detectors.

    Subclasses must provide a unique ``name``, a human-readable
    ``description``, and a ``detect`` implementation returning zero or
    more findings for the given DataFrame and its profile.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Unique detector name."""

    @property
    @abstractmethod
    def description(self) -> str:
        """Human-readable description."""

    @abstractmethod
    def detect(self, df: pd.DataFrame, profile: DatasetProfile) -> list[Finding]:
        """Run detection and return findings."""
|