datawash-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. datawash/__init__.py +9 -0
  2. datawash/adapters/__init__.py +12 -0
  3. datawash/adapters/base.py +66 -0
  4. datawash/adapters/csv_adapter.py +23 -0
  5. datawash/adapters/excel_adapter.py +36 -0
  6. datawash/adapters/json_adapter.py +21 -0
  7. datawash/adapters/parquet_adapter.py +34 -0
  8. datawash/cli/__init__.py +0 -0
  9. datawash/cli/formatters.py +110 -0
  10. datawash/cli/main.py +168 -0
  11. datawash/codegen/__init__.py +1 -0
  12. datawash/codegen/generator.py +72 -0
  13. datawash/core/__init__.py +1 -0
  14. datawash/core/cache.py +64 -0
  15. datawash/core/config.py +56 -0
  16. datawash/core/dtypes.py +24 -0
  17. datawash/core/exceptions.py +21 -0
  18. datawash/core/models.py +78 -0
  19. datawash/core/report.py +430 -0
  20. datawash/core/sampling.py +84 -0
  21. datawash/detectors/__init__.py +13 -0
  22. datawash/detectors/base.py +27 -0
  23. datawash/detectors/duplicate_detector.py +56 -0
  24. datawash/detectors/format_detector.py +130 -0
  25. datawash/detectors/missing_detector.py +78 -0
  26. datawash/detectors/outlier_detector.py +93 -0
  27. datawash/detectors/registry.py +64 -0
  28. datawash/detectors/similarity_detector.py +294 -0
  29. datawash/detectors/type_detector.py +100 -0
  30. datawash/profiler/__init__.py +1 -0
  31. datawash/profiler/engine.py +88 -0
  32. datawash/profiler/parallel.py +122 -0
  33. datawash/profiler/patterns.py +80 -0
  34. datawash/profiler/statistics.py +41 -0
  35. datawash/suggestors/__init__.py +1 -0
  36. datawash/suggestors/base.py +15 -0
  37. datawash/suggestors/engine.py +327 -0
  38. datawash/suggestors/prioritizer.py +23 -0
  39. datawash/transformers/__init__.py +13 -0
  40. datawash/transformers/base.py +27 -0
  41. datawash/transformers/categories.py +64 -0
  42. datawash/transformers/columns.py +72 -0
  43. datawash/transformers/duplicates.py +43 -0
  44. datawash/transformers/formats.py +95 -0
  45. datawash/transformers/missing.py +201 -0
  46. datawash/transformers/registry.py +30 -0
  47. datawash/transformers/types.py +95 -0
  48. datawash-0.2.0.dist-info/METADATA +353 -0
  49. datawash-0.2.0.dist-info/RECORD +53 -0
  50. datawash-0.2.0.dist-info/WHEEL +5 -0
  51. datawash-0.2.0.dist-info/entry_points.txt +2 -0
  52. datawash-0.2.0.dist-info/licenses/LICENSE +21 -0
  53. datawash-0.2.0.dist-info/top_level.txt +1 -0
datawash/core/dtypes.py
@@ -0,0 +1,24 @@
+ """DataFrame dtype optimization for performance."""
+
+ from __future__ import annotations
+
+ import pandas as pd
+
+
+ def optimize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
+     """Optimize DataFrame dtypes for faster analysis.
+
+     Downcasts numeric types to reduce memory usage.
+     Object/string columns are left unchanged to preserve detector compatibility.
+     """
+     df = df.copy()
+
+     for col in df.columns:
+         dtype = df[col].dtype
+
+         if pd.api.types.is_integer_dtype(dtype):
+             df[col] = pd.to_numeric(df[col], downcast="integer")
+         elif pd.api.types.is_float_dtype(dtype):
+             df[col] = pd.to_numeric(df[col], downcast="float")
+
+     return df
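For context, a minimal sketch of what optimize_dataframe does to a frame; the toy data and column names below are illustrative, not from the package:

    import numpy as np
    import pandas as pd

    from datawash.core.dtypes import optimize_dataframe

    # Hypothetical toy frame: int64/float64 columns plus one object column.
    df = pd.DataFrame({
        "id": np.arange(1_000, dtype="int64"),   # values fit in int16
        "score": np.random.rand(1_000),          # float64 -> float32
        "label": ["a", "b"] * 500,               # object: left unchanged
    })

    optimized = optimize_dataframe(df)
    print(optimized.dtypes)  # id: int16, score: float32, label: object
    print(df.memory_usage(deep=True).sum(), "->",
          optimized.memory_usage(deep=True).sum())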
datawash/core/exceptions.py
@@ -0,0 +1,21 @@
+ """Custom exceptions for datawash."""
+
+
+ class DataWashError(Exception):
+     """Base exception for datawash."""
+
+
+ class AdapterError(DataWashError):
+     """Error loading or saving data."""
+
+
+ class DetectionError(DataWashError):
+     """Error during issue detection."""
+
+
+ class TransformationError(DataWashError):
+     """Error applying a transformation."""
+
+
+ class ConfigError(DataWashError):
+     """Error in configuration."""
datawash/core/models.py
@@ -0,0 +1,78 @@
+ """Data models for datawash."""
+
+ from __future__ import annotations
+
+ import enum
+ from typing import Any, Optional
+
+ from pydantic import BaseModel, Field
+
+
+ class Severity(str, enum.Enum):
+     HIGH = "high"
+     MEDIUM = "medium"
+     LOW = "low"
+
+
+ class ColumnProfile(BaseModel):
+     """Profile for a single column."""
+
+     name: str
+     dtype: str
+     semantic_type: Optional[str] = None
+     null_count: int = 0
+     null_ratio: float = 0.0
+     unique_count: int = 0
+     unique_ratio: float = 0.0
+     sample_values: list[Any] = Field(default_factory=list)
+     statistics: dict[str, Any] = Field(default_factory=dict)
+     patterns: dict[str, Any] = Field(default_factory=dict)
+
+
+ class DatasetProfile(BaseModel):
+     """Profile for an entire dataset."""
+
+     row_count: int
+     column_count: int
+     memory_bytes: int
+     columns: dict[str, ColumnProfile] = Field(default_factory=dict)
+     duplicate_row_count: int = 0
+     sampled: bool = False
+     sample_size: Optional[int] = None
+
+
+ class Finding(BaseModel):
+     """A detected data quality issue."""
+
+     detector: str
+     issue_type: str
+     severity: Severity
+     columns: list[str] = Field(default_factory=list)
+     rows: Optional[list[int]] = None
+     details: dict[str, Any] = Field(default_factory=dict)
+     message: str
+     confidence: float = 1.0
+
+
+ class Suggestion(BaseModel):
+     """A suggested fix for a finding."""
+
+     id: int
+     finding: Finding
+     action: str
+     transformer: str
+     params: dict[str, Any] = Field(default_factory=dict)
+     priority: Severity
+     impact: str
+     rationale: str
+     preview: Optional[str] = None
+
+
+ class TransformationResult(BaseModel):
+     """Result of applying a transformation."""
+
+     transformer: str
+     params: dict[str, Any] = Field(default_factory=dict)
+     rows_affected: int
+     columns_affected: list[str] = Field(default_factory=list)
+     code: str = ""
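A sketch of these models in use, with made-up field values, to show the shape of the data that detectors and suggestors exchange:

    from datawash.core.models import Finding, Severity, Suggestion

    # All values here are illustrative; the detector and transformer names
    # are not taken from the package's registries.
    finding = Finding(
        detector="missing",
        issue_type="high_null_ratio",
        severity=Severity.HIGH,
        columns=["email"],
        message="Column 'email' is 42% null",
        confidence=0.9,
    )

    suggestion = Suggestion(
        id=1,
        finding=finding,
        action="Fill missing emails",
        transformer="fill_missing",
        params={"columns": ["email"]},
        priority=Severity.HIGH,
        impact="Affects 42% of rows",
        rationale="High null ratio breaks downstream joins",
    )
    print(suggestion.priority.value)  # "high"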
datawash/core/report.py
@@ -0,0 +1,430 @@
+ """Report class - main user-facing interface."""
+
+ from __future__ import annotations
+
+ import logging
+ from pathlib import Path
+ from typing import Any, Optional
+
+ import pandas as pd
+ from rich.console import Console
+ from rich.table import Table
+
+ from datawash.adapters import load_dataframe
+ from datawash.codegen import generate_code as _generate_code
+ from datawash.core.cache import ComputationCache
+ from datawash.core.config import Config
+ from datawash.core.dtypes import optimize_dataframe
+ from datawash.core.models import (
+     DatasetProfile,
+     Finding,
+     Severity,
+     Suggestion,
+     TransformationResult,
+ )
+ from datawash.core.sampling import SmartSampler
+ from datawash.detectors import run_all_detectors
+ from datawash.detectors.registry import get_all_detectors
+ from datawash.profiler import profile_dataset
+ from datawash.profiler.parallel import (
+     profile_dataset_parallel,
+     run_detectors_parallel,
+ )
+ from datawash.suggestors import generate_suggestions
+ from datawash.suggestors.engine import _sort_by_execution_order
+ from datawash.transformers import run_transformer
+
+ logger = logging.getLogger(__name__)
+
+ # Threshold below which parallel overhead isn't worth it
+ _PARALLEL_THRESHOLD = 5000
+
+
+ class Report:
+     """Main interface for data analysis and cleaning.
+
+     Args:
+         data: A DataFrame or path to a data file.
+         config: Optional configuration. Uses defaults if None.
+         use_case: Context for suggestion prioritization.
+         sample: Enable smart sampling for large datasets (default True).
+         parallel: Enable parallel profiling and detection (default True).
+     """
+
+     def __init__(
+         self,
+         data: pd.DataFrame | str | Path,
+         config: Optional[Config | dict[str, Any]] = None,
+         use_case: str = "general",
+         sample: bool = True,
+         parallel: bool = True,
+     ) -> None:
+         # Resolve config
+         if config is None:
+             self._config = Config(use_case=use_case)
+         elif isinstance(config, dict):
+             config.setdefault("use_case", use_case)
+             self._config = Config.from_dict(config)
+         else:
+             self._config = config
+
+         # Load data
+         if isinstance(data, (str, Path)):
+             self._df = load_dataframe(data)
+             self._source_path = str(data)
+         else:
+             self._df = data
+             self._source_path = None
+
+         # Optimize dtypes for faster analysis
+         try:
+             optimized_df = optimize_dataframe(self._df)
+         except Exception:
+             logger.debug("Dtype optimization skipped", exc_info=True)
+             optimized_df = self._df
+
+         # Smart sampling for large datasets
+         self._sampler: SmartSampler | None = None
+         if sample and len(optimized_df) >= 50_000:
+             self._sampler = SmartSampler(optimized_df)
+             analysis_df = self._sampler.sample_df
+         else:
+             analysis_df = optimized_df
+
+         # Decide whether to use parallel execution
+         use_parallel = parallel and (
+             len(analysis_df) > _PARALLEL_THRESHOLD or len(analysis_df.columns) > 20
+         )
+
+         # Profile and detect
+         if use_parallel:
+             cache = ComputationCache(analysis_df)
+             self._profile = profile_dataset_parallel(analysis_df, cache=cache)
+             active = {
+                 n: d
+                 for n, d in get_all_detectors().items()
+                 if self._config.detectors.enabled is None
+                 or n in self._config.detectors.enabled
+             }
+             self._findings = run_detectors_parallel(analysis_df, self._profile, active)
+         else:
+             self._profile = profile_dataset(analysis_df)
+             self._findings = run_all_detectors(
+                 analysis_df,
+                 self._profile,
+                 enabled=self._config.detectors.enabled,
+             )
+
+         # Restore original row/column counts in profile when sampled
+         if self._sampler and self._sampler.is_sampled:
+             self._profile = DatasetProfile(
+                 row_count=len(self._df),
+                 column_count=len(self._df.columns),
+                 memory_bytes=int(self._df.memory_usage(deep=True).sum()),
+                 columns=self._profile.columns,
+                 duplicate_row_count=self._sampler.extrapolate_count(
+                     self._profile.duplicate_row_count
+                 ),
+                 sampled=True,
+                 sample_size=len(self._sampler.sample_df),
+             )
+
+         self._suggestions = generate_suggestions(
+             self._findings,
+             max_suggestions=self._config.suggestions.max_suggestions,
+             use_case=self._config.use_case,
+         )
+         self._applied: list[TransformationResult] = []
+
+     @property
+     def df(self) -> pd.DataFrame:
+         """The original DataFrame (read-only copy)."""
+         return self._df.copy()
+
+     @property
+     def profile(self) -> DatasetProfile:
+         """Dataset profile with statistics."""
+         return self._profile
+
+     @property
+     def issues(self) -> list[Finding]:
+         """All detected data quality issues."""
+         return list(self._findings)
+
+     @property
+     def suggestions(self) -> list[Suggestion]:
+         """Prioritized list of suggestions."""
+         return list(self._suggestions)
+
+     @property
+     def quality_score(self) -> int:
+         """Data quality score from 0 to 100."""
+         score = 100.0
+         if self._profile.row_count == 0:
+             return 100
+         for finding in self._findings:
+             if finding.severity == Severity.HIGH:
+                 penalty = 10.0
+             elif finding.severity == Severity.MEDIUM:
+                 penalty = 5.0
+             else:
+                 penalty = 2.0
+             # Scale by confidence
+             penalty *= finding.confidence
+             score -= penalty
+         return max(0, min(100, int(score)))
+
+     def suggest(self, use_case: Optional[str] = None) -> list[Suggestion]:
+         """Get filtered suggestions, optionally for a specific use case."""
+         # For now, return all suggestions. Use-case filtering is Phase 2.
+         return list(self._suggestions)
+
+     def _compute_quality_score(self, df: pd.DataFrame) -> int:
+         """Compute quality score for an arbitrary DataFrame."""
+         from datawash.detectors import run_all_detectors as _detect
+         from datawash.profiler import profile_dataset as _profile
+
+         prof = _profile(df)
+         findings = _detect(df, prof, enabled=self._config.detectors.enabled)
+         score = 100.0
+         if prof.row_count == 0:
+             return 100
+         for f in findings:
+             if f.severity == Severity.HIGH:
+                 penalty = 10.0
+             elif f.severity == Severity.MEDIUM:
+                 penalty = 5.0
+             else:
+                 penalty = 2.0
+             penalty *= f.confidence
+             score -= penalty
+         return max(0, min(100, int(score)))
+
+     def apply(self, suggestion_ids: list[int]) -> pd.DataFrame:
+         """Apply selected suggestions by ID, return cleaned DataFrame."""
+         score_before = self.quality_score
+         result_df = self._df.copy()
+         id_map = {s.id: s for s in self._suggestions}
+         self._applied = []
+
+         # Collect and sort suggestions by execution order
+         suggestions_to_apply = []
+         for sid in suggestion_ids:
+             suggestion = id_map.get(sid)
+             if suggestion is None:
+                 logger.warning("Suggestion ID %d not found, skipping", sid)
+                 continue
+             suggestions_to_apply.append(suggestion)
+
+         # Sort by transformation execution order to prevent conflicts
+         suggestions_to_apply = _sort_by_execution_order(suggestions_to_apply)
+
+         for suggestion in suggestions_to_apply:
+             result_df, tx_result = run_transformer(
+                 suggestion.transformer, result_df, **suggestion.params
+             )
+             self._applied.append(tx_result)
+             logger.info(
+                 "Applied suggestion %d (%s): %d rows affected",
+                 suggestion.id,
+                 suggestion.action,
+                 tx_result.rows_affected,
+             )
+
+         score_after = self._compute_quality_score(result_df)
+         diff = score_after - score_before
+         sign = "+" if diff >= 0 else ""
+         logger.info(
+             "Quality score: %d → %d (%s%d)", score_before, score_after, sign, diff
+         )
+         self._last_score_before = score_before
+         self._last_score_after = score_after
+         return result_df
+
+     def apply_all(self) -> pd.DataFrame:
+         """Apply all suggestions and return cleaned DataFrame."""
+         return self.apply([s.id for s in self._suggestions])
+
+     def apply_interactive(
+         self, input_fn: Any = None, console: Optional[Console] = None
+     ) -> pd.DataFrame:
+         """Interactively apply suggestions with user prompts.
+
+         Args:
+             input_fn: Callable for getting user input (default: builtin input).
+                 Useful for testing with monkeypatch.
+             console: Optional Rich Console for output.
+         """
+         if input_fn is None:
+             input_fn = input
+         if console is None:
+             console = Console()
+
+         score_before = self.quality_score
+         result_df = self._df.copy()
+         self._applied = []
+         apply_all = False
+
+         # Sort suggestions by execution order to prevent conflicts
+         sorted_suggestions = _sort_by_execution_order(list(self._suggestions))
+
+         for suggestion in sorted_suggestions:
+             if not apply_all:
+                 table = Table(title=f"Suggestion #{suggestion.id}", show_lines=True)
+                 table.add_column("Field", style="bold")
+                 table.add_column("Value")
+                 table.add_row("Action", suggestion.action)
+                 table.add_row("Priority", suggestion.priority.value)
+                 table.add_row("Impact", suggestion.impact)
+                 table.add_row("Rationale", suggestion.rationale)
+                 cols_list = suggestion.params.get("columns", [])
+                 if cols_list:
+                     table.add_row("Columns", ", ".join(cols_list))
+                 else:
+                     table.add_row("Columns", "all")
+                 console.print(table)
+
+                 # Show preview of affected rows
+                 cols = suggestion.params.get("columns", [])
+                 valid_cols = [c for c in cols if c in result_df.columns]
+                 if valid_cols:
+                     affected_preview = result_df[valid_cols].head(5)
+                     console.print("[dim]Preview (first 5 rows):[/dim]")
+                     console.print(affected_preview.to_string())
+
+                 raw_choice = input_fn(
+                     "\n[a]pply / [s]kip / apply [A]ll / [q]uit: "
+                 ).strip()
+                 choice = raw_choice.lower()
+                 if choice == "q":
+                     break
+                 elif choice == "s":
+                     continue
+                 elif choice == "all" or raw_choice == "A":
+                     apply_all = True
+                 elif choice not in ("a", "apply", ""):
+                     continue
+
+             result_df, tx_result = run_transformer(
+                 suggestion.transformer, result_df, **suggestion.params
+             )
+             self._applied.append(tx_result)
+             console.print(
+                 f"[green]✓ Applied:[/green] {suggestion.action} "
+                 f"({tx_result.rows_affected} rows affected)"
+             )
+
+         score_after = self._compute_quality_score(result_df)
+         diff = score_after - score_before
+         sign = "+" if diff >= 0 else ""
+         msg = f"\n[bold]Quality score: {score_before} → {score_after} ({sign}{diff})[/bold]"
+         console.print(msg)
+         self._last_score_before = score_before
+         self._last_score_after = score_after
+         return result_df
+
+     def generate_code(self, style: str = "function") -> str:
+         """Generate Python code for applied transformations.
+
+         Must call apply() or apply_all() first.
+         """
+         if not self._applied:
+             # Auto-apply all if nothing applied yet
+             self.apply_all()
+         return _generate_code(
+             self._applied,
+             style=style,
+             include_comments=self._config.codegen.include_comments,
+         )
+
+     def summary(self) -> str:
+         """Human-readable analysis summary."""
+         lines = [
+             f"Dataset: {self._profile.row_count} rows "
+             f"x {self._profile.column_count} columns",
+             f"Memory: {self._profile.memory_bytes / 1024 / 1024:.1f} MB",
+             f"Duplicate rows: {self._profile.duplicate_row_count}",
+             f"Data Quality Score: {self.quality_score}/100",
+             f"Issues found: {len(self._findings)}",
+             f"Suggestions: {len(self._suggestions)}",
+             "",
+         ]
+
+         # Group issues by severity
+         for severity in ["high", "medium", "low"]:
+             issues = [f for f in self._findings if f.severity.value == severity]
+             if issues:
+                 lines.append(f"  [{severity.upper()}] {len(issues)} issue(s)")
+                 for issue in issues[:5]:
+                     lines.append(f"    - {issue.message}")
+                 if len(issues) > 5:
+                     lines.append(f"    ... and {len(issues) - 5} more")
+
+         return "\n".join(lines)
+
+     def __repr__(self) -> str:
+         return (
+             f"Report(rows={self._profile.row_count}, "
+             f"cols={self._profile.column_count}, "
+             f"issues={len(self._findings)}, "
+             f"suggestions={len(self._suggestions)})"
+         )
+
+     def _repr_html_(self) -> str:
+         """Rich display for Jupyter notebooks."""
+         html = [
+             "<div style='font-family: monospace;'>",
+             "<h3>DataWash Report</h3>",
+             f"<p><b>Dataset:</b> {self._profile.row_count} rows "
+             f"x {self._profile.column_count} columns</p>",
+             f"<p><b>Issues:</b> {len(self._findings)} | "
+             f"<b>Suggestions:</b> {len(self._suggestions)}</p>",
+         ]
+         if self._suggestions:
+             html.append("<table border='1' style='border-collapse: collapse;'>")
+             html.append(
+                 "<tr><th>#</th><th>Priority</th><th>Action</th><th>Impact</th></tr>"
+             )
+             for s in self._suggestions[:10]:
+                 color = {"high": "red", "medium": "orange", "low": "green"}.get(
+                     s.priority.value, "gray"
+                 )
+                 html.append(
+                     f"<tr><td>{s.id}</td>"
+                     f"<td style='color:{color};'>{s.priority.value}</td>"
+                     f"<td>{s.action}</td>"
+                     f"<td>{s.impact}</td></tr>"
+                 )
+             if len(self._suggestions) > 10:
+                 html.append(
+                     f"<tr><td colspan='4'>... and {len(self._suggestions) - 10} more</td></tr>"
+                 )
+             html.append("</table>")
+         html.append("</div>")
+         return "\n".join(html)
+
+
+ def analyze(
+     data: pd.DataFrame | str | Path,
+     config: Optional[Config | dict[str, Any]] = None,
+     use_case: str = "general",
+     sample: bool = True,
+     parallel: bool = True,
+ ) -> Report:
+     """Analyze a dataset and return a Report.
+
+     This is the main entry point for datawash.
+
+     Args:
+         data: DataFrame or path to a data file.
+         config: Optional configuration dict or Config object.
+         use_case: One of "general", "ml", "analytics", "export".
+         sample: Enable smart sampling for large datasets.
+         parallel: Enable parallel profiling and detection.
+
+     Returns:
+         A Report object with issues, suggestions, and cleaning methods.
+     """
+     return Report(
+         data, config=config, use_case=use_case, sample=sample, parallel=parallel
+     )
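Taken together, a hedged end-to-end sketch of the Report API defined above (the CSV path is hypothetical, and the suggestion IDs depend on the data):

    from datawash.core.report import analyze

    report = analyze("customers.csv", use_case="ml")  # hypothetical file
    print(report.summary())
    print(report.quality_score)          # 0-100

    for s in report.suggestions:         # prioritized Suggestion objects
        print(s.id, s.priority.value, s.action)

    clean_df = report.apply([1, 2])      # apply selected suggestions by ID
    # clean_df = report.apply_all()      # or apply everything at once
    print(report.generate_code())        # standalone code for what was applied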
datawash/core/sampling.py
@@ -0,0 +1,84 @@
+ """Smart sampling for large datasets."""
+
+ from __future__ import annotations
+
+ import pandas as pd
+
+ SAMPLE_THRESHOLD = 50_000
+ SAMPLE_SIZE = 10_000
+
+
+ class SmartSampler:
+     """Intelligent sampling for large datasets.
+
+     For datasets above ``SAMPLE_THRESHOLD`` rows, creates a representative
+     sample that preserves data distribution and edge cases (nulls, outliers).
+     """
+
+     def __init__(self, df: pd.DataFrame) -> None:
+         self.original_df = df
+         self.original_size = len(df)
+         self.is_sampled = len(df) > SAMPLE_THRESHOLD
+         self.sample_df = self._create_sample() if self.is_sampled else df
+         self.scale_factor = (
+             self.original_size / len(self.sample_df) if self.is_sampled else 1.0
+         )
+
+     def _create_sample(self) -> pd.DataFrame:
+         """Create representative sample preserving data distribution."""
+         df = self.original_df
+
+         # Always include rows with nulls (up to 10% of sample)
+         null_rows = df[df.isna().any(axis=1)]
+         max_null_rows = SAMPLE_SIZE // 10
+         if len(null_rows) > max_null_rows:
+             null_sample = null_rows.sample(max_null_rows, random_state=42)
+         elif len(null_rows) > 0:
+             null_sample = null_rows
+         else:
+             null_sample = df.iloc[:0]
+
+         # Sample remaining rows
+         remaining_size = SAMPLE_SIZE - len(null_sample)
+         non_null_rows = df.drop(null_sample.index, errors="ignore")
+
+         # Try stratified sampling on first low-cardinality column
+         strat_col = self._find_stratification_column(non_null_rows)
+         if strat_col is not None:
+             main_sample = self._stratified_sample(
+                 non_null_rows, strat_col, remaining_size
+             )
+         else:
+             actual_size = min(remaining_size, len(non_null_rows))
+             main_sample = non_null_rows.sample(actual_size, random_state=42)
+
+         return pd.concat([null_sample, main_sample]).reset_index(drop=True)
+
+     def _find_stratification_column(self, df: pd.DataFrame) -> str | None:
+         """Find a good column for stratified sampling."""
+         for col in df.columns:
+             if df[col].isna().all():
+                 continue
+             nunique = df[col].nunique()
+             if 2 <= nunique <= 20:
+                 return col
+         return None
+
+     def _stratified_sample(
+         self, df: pd.DataFrame, strat_col: str, n: int
+     ) -> pd.DataFrame:
+         """Stratified sampling proportional to group sizes."""
+         groups = df.groupby(strat_col, group_keys=False, observed=True)
+         total = len(df)
+         parts = []
+         for _name, group in groups:
+             group_n = max(1, int(n * len(group) / total))
+             group_n = min(group_n, len(group))
+             parts.append(group.sample(group_n, random_state=42))
+         return pd.concat(parts)
+
+     def extrapolate_count(self, sample_count: int) -> int:
+         """Scale sample count to full dataset estimate."""
+         if not self.is_sampled:
+             return sample_count
+         return int(sample_count * self.scale_factor)
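A sketch of SmartSampler on a synthetic frame large enough to trigger sampling; the data is made up, only the thresholds come from the module above:

    import numpy as np
    import pandas as pd

    from datawash.core.sampling import SmartSampler

    n = 100_000  # above SAMPLE_THRESHOLD, so sampling kicks in
    df = pd.DataFrame({
        "value": np.random.randn(n),
        "segment": np.random.choice(["a", "b", "c"], size=n),  # low cardinality:
    })                                                         # stratification candidate
    # Inject ~1% nulls so the null-preserving branch is exercised.
    df.loc[df.sample(frac=0.01, random_state=0).index, "value"] = np.nan

    sampler = SmartSampler(df)
    print(sampler.is_sampled)             # True
    print(len(sampler.sample_df))         # roughly SAMPLE_SIZE (10_000)
    print(sampler.extrapolate_count(37))  # ~370, scaled by scale_factor (~10x)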
datawash/detectors/__init__.py
@@ -0,0 +1,13 @@
+ """Detectors for data quality issues."""
+
+ # Import detectors to trigger registration
+ from . import (  # noqa: F401
+     duplicate_detector,
+     format_detector,
+     missing_detector,
+     outlier_detector,
+     similarity_detector,
+     type_detector,
+ )
+ from .registry import get_all_detectors as get_all_detectors
+ from .registry import run_all_detectors as run_all_detectors
datawash/detectors/base.py
@@ -0,0 +1,27 @@
+ """Base detector interface."""
+
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+
+ import pandas as pd
+
+ from datawash.core.models import DatasetProfile, Finding
+
+
+ class BaseDetector(ABC):
+     """Abstract base class for all detectors."""
+
+     @property
+     @abstractmethod
+     def name(self) -> str:
+         """Unique detector name."""
+
+     @property
+     @abstractmethod
+     def description(self) -> str:
+         """Human-readable description."""
+
+     @abstractmethod
+     def detect(self, df: pd.DataFrame, profile: DatasetProfile) -> list[Finding]:
+         """Run detection and return findings."""