datawash-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. datawash/__init__.py +9 -0
  2. datawash/adapters/__init__.py +12 -0
  3. datawash/adapters/base.py +66 -0
  4. datawash/adapters/csv_adapter.py +23 -0
  5. datawash/adapters/excel_adapter.py +36 -0
  6. datawash/adapters/json_adapter.py +21 -0
  7. datawash/adapters/parquet_adapter.py +34 -0
  8. datawash/cli/__init__.py +0 -0
  9. datawash/cli/formatters.py +110 -0
  10. datawash/cli/main.py +168 -0
  11. datawash/codegen/__init__.py +1 -0
  12. datawash/codegen/generator.py +72 -0
  13. datawash/core/__init__.py +1 -0
  14. datawash/core/cache.py +64 -0
  15. datawash/core/config.py +56 -0
  16. datawash/core/dtypes.py +24 -0
  17. datawash/core/exceptions.py +21 -0
  18. datawash/core/models.py +78 -0
  19. datawash/core/report.py +430 -0
  20. datawash/core/sampling.py +84 -0
  21. datawash/detectors/__init__.py +13 -0
  22. datawash/detectors/base.py +27 -0
  23. datawash/detectors/duplicate_detector.py +56 -0
  24. datawash/detectors/format_detector.py +130 -0
  25. datawash/detectors/missing_detector.py +78 -0
  26. datawash/detectors/outlier_detector.py +93 -0
  27. datawash/detectors/registry.py +64 -0
  28. datawash/detectors/similarity_detector.py +294 -0
  29. datawash/detectors/type_detector.py +100 -0
  30. datawash/profiler/__init__.py +1 -0
  31. datawash/profiler/engine.py +88 -0
  32. datawash/profiler/parallel.py +122 -0
  33. datawash/profiler/patterns.py +80 -0
  34. datawash/profiler/statistics.py +41 -0
  35. datawash/suggestors/__init__.py +1 -0
  36. datawash/suggestors/base.py +15 -0
  37. datawash/suggestors/engine.py +327 -0
  38. datawash/suggestors/prioritizer.py +23 -0
  39. datawash/transformers/__init__.py +13 -0
  40. datawash/transformers/base.py +27 -0
  41. datawash/transformers/categories.py +64 -0
  42. datawash/transformers/columns.py +72 -0
  43. datawash/transformers/duplicates.py +43 -0
  44. datawash/transformers/formats.py +95 -0
  45. datawash/transformers/missing.py +201 -0
  46. datawash/transformers/registry.py +30 -0
  47. datawash/transformers/types.py +95 -0
  48. datawash-0.2.0.dist-info/METADATA +353 -0
  49. datawash-0.2.0.dist-info/RECORD +53 -0
  50. datawash-0.2.0.dist-info/WHEEL +5 -0
  51. datawash-0.2.0.dist-info/entry_points.txt +2 -0
  52. datawash-0.2.0.dist-info/licenses/LICENSE +21 -0
  53. datawash-0.2.0.dist-info/top_level.txt +1 -0
datawash/__init__.py ADDED
@@ -0,0 +1,9 @@
+ """Datawash - Intelligent data quality analysis and cleaning."""
+
+ from __future__ import annotations
+
+ from datawash.core.report import analyze
+
+ __version__ = "0.2.0"
+
+ __all__ = ["analyze", "__version__"]
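The top-level API re-exports analyze from datawash.core.report, whose signature is not visible in this diff. The sketch below guesses that it accepts a file path the way the Report constructor in datawash/cli/main.py does; treat the call shape as an assumption, not documentation.

    import datawash

    report = datawash.analyze("data.csv")  # assumed signature; "data.csv" is a placeholder
    print(datawash.__version__)            # "0.2.0"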
datawash/adapters/__init__.py ADDED
@@ -0,0 +1,12 @@
+ """Data adapters for loading and saving DataFrames."""
+
+ # Import adapters to trigger registration
+ from datawash.adapters import (
+     csv_adapter,  # noqa: F401
+     excel_adapter,  # noqa: F401
+     json_adapter,  # noqa: F401
+     parquet_adapter,  # noqa: F401
+ )
+ from datawash.adapters.base import load_dataframe
+
+ __all__ = ["load_dataframe"]
datawash/adapters/base.py ADDED
@@ -0,0 +1,66 @@
+ """Base adapter interface and loader dispatch."""
+
+ from __future__ import annotations
+
+ import logging
+ from pathlib import Path
+ from typing import Any, Optional, Protocol
+
+ import pandas as pd
+
+ from datawash.core.exceptions import AdapterError
+
+ logger = logging.getLogger(__name__)
+
+
+ class DataAdapter(Protocol):
+     """Protocol for data adapters."""
+
+     def read(self, path: Path, **kwargs: Any) -> pd.DataFrame: ...
+     def write(self, df: pd.DataFrame, path: Path, **kwargs: Any) -> None: ...
+
+
+ _ADAPTERS: dict[str, DataAdapter] = {}
+
+
+ def register_adapter(extension: str, adapter: DataAdapter) -> None:
+     _ADAPTERS[extension] = adapter
+
+
+ def load_dataframe(
+     source: str | Path, format: Optional[str] = None, **kwargs: Any
+ ) -> pd.DataFrame:
+     """Load a DataFrame from a file path.
+
+     Args:
+         source: Path to the data file.
+         format: Force file format. Auto-detected from extension if None.
+         **kwargs: Passed to the adapter's read method.
+
+     Returns:
+         Loaded DataFrame.
+
+     Raises:
+         AdapterError: If the file cannot be loaded.
+     """
+     path = Path(source)
+     if not path.exists():
+         raise AdapterError(f"File not found: {path}")
+
+     ext = format or path.suffix.lstrip(".")
+     adapter = _ADAPTERS.get(ext)
+     if adapter is None:
+         raise AdapterError(
+             f"Unsupported format: '{ext}'. "
+             f"Supported formats: {', '.join(sorted(_ADAPTERS.keys()))}"
+         )
+
+     try:
+         logger.info("Loading %s with %s adapter", path, ext)
+         df = adapter.read(path, **kwargs)
+         logger.info("Loaded %d rows, %d columns", len(df), len(df.columns))
+         return df
+     except AdapterError:
+         raise
+     except Exception as e:
+         raise AdapterError(f"Failed to read {path}: {e}") from e
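A minimal sketch (not part of the package) of how the registry above dispatches: register an adapter for a new extension, then let load_dataframe route by file suffix. The PsvAdapter class and the data.psv path are hypothetical.

    from pathlib import Path
    from typing import Any

    import pandas as pd

    from datawash.adapters.base import load_dataframe, register_adapter


    class PsvAdapter:
        def read(self, path: Path, **kwargs: Any) -> pd.DataFrame:
            # Pipe-separated values: reuse the CSV reader with a fixed separator
            return pd.read_csv(path, sep="|", **kwargs)

        def write(self, df: pd.DataFrame, path: Path, **kwargs: Any) -> None:
            df.to_csv(path, sep="|", index=False, **kwargs)


    register_adapter("psv", PsvAdapter())
    df = load_dataframe("data.psv")  # dispatched on the .psv suffix

Passing format="psv" would select the same adapter regardless of the file's actual extension.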
datawash/adapters/csv_adapter.py ADDED
@@ -0,0 +1,23 @@
+ """CSV file adapter."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any
+
+ import pandas as pd
+
+ from datawash.adapters.base import register_adapter
+
+
+ class CsvAdapter:
+     def read(self, path: Path, **kwargs: Any) -> pd.DataFrame:
+         return pd.read_csv(path, **kwargs)
+
+     def write(self, df: pd.DataFrame, path: Path, **kwargs: Any) -> None:
+         df.to_csv(path, index=False, **kwargs)
+
+
+ _adapter = CsvAdapter()
+ register_adapter("csv", _adapter)
+ register_adapter("tsv", _adapter)  # TSV reuses this adapter; callers must pass sep='\t'
datawash/adapters/excel_adapter.py ADDED
@@ -0,0 +1,36 @@
+ """Excel file adapter (requires openpyxl)."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any
+
+ import pandas as pd
+
+ from datawash.adapters.base import register_adapter
+ from datawash.core.exceptions import AdapterError
+
+
+ class ExcelAdapter:
+     def read(self, path: Path, **kwargs: Any) -> pd.DataFrame:
+         try:
+             return pd.read_excel(path, **kwargs)
+         except ImportError:
+             raise AdapterError(
+                 "Excel support requires openpyxl. "
+                 "Install with: pip install datawash[formats]"
+             )
+
+     def write(self, df: pd.DataFrame, path: Path, **kwargs: Any) -> None:
+         try:
+             df.to_excel(path, index=False, **kwargs)
+         except ImportError:
+             raise AdapterError(
+                 "Excel support requires openpyxl. "
+                 "Install with: pip install datawash[formats]"
+             )
+
+
+ _adapter = ExcelAdapter()
+ register_adapter("xlsx", _adapter)
+ register_adapter("xls", _adapter)
datawash/adapters/json_adapter.py ADDED
@@ -0,0 +1,21 @@
+ """JSON file adapter."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any
+
+ import pandas as pd
+
+ from datawash.adapters.base import register_adapter
+
+
+ class JsonAdapter:
+     def read(self, path: Path, **kwargs: Any) -> pd.DataFrame:
+         return pd.read_json(path, **kwargs)
+
+     def write(self, df: pd.DataFrame, path: Path, **kwargs: Any) -> None:
+         df.to_json(path, orient="records", indent=2, **kwargs)
+
+
+ register_adapter("json", JsonAdapter())
datawash/adapters/parquet_adapter.py ADDED
@@ -0,0 +1,34 @@
+ """Parquet file adapter (requires pyarrow)."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any
+
+ import pandas as pd
+
+ from datawash.adapters.base import register_adapter
+ from datawash.core.exceptions import AdapterError
+
+
+ class ParquetAdapter:
+     def read(self, path: Path, **kwargs: Any) -> pd.DataFrame:
+         try:
+             return pd.read_parquet(path, **kwargs)
+         except ImportError:
+             raise AdapterError(
+                 "Parquet support requires pyarrow. "
+                 "Install with: pip install datawash[formats]"
+             )
+
+     def write(self, df: pd.DataFrame, path: Path, **kwargs: Any) -> None:
+         try:
+             df.to_parquet(path, index=False, **kwargs)
+         except ImportError:
+             raise AdapterError(
+                 "Parquet support requires pyarrow. "
+                 "Install with: pip install datawash[formats]"
+             )
+
+
+ register_adapter("parquet", ParquetAdapter())
datawash/cli/__init__.py ADDED
File without changes
datawash/cli/formatters.py ADDED
@@ -0,0 +1,110 @@
+ """Rich output formatting for CLI."""
+
+ from __future__ import annotations
+
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.table import Table
+ from rich.text import Text
+
+ from datawash.core.models import DatasetProfile, Finding, Suggestion
+
+ console = Console()
+
+ SEVERITY_COLORS = {"high": "red", "medium": "yellow", "low": "green"}
+
+
+ def format_profile(profile: DatasetProfile) -> None:
+     """Print dataset profile."""
+     console.print(
+         Panel(
+             f"[bold]{profile.row_count}[/] rows x "
+             f"[bold]{profile.column_count}[/] columns | "
+             f"Memory: [bold]"
+             f"{profile.memory_bytes / 1024 / 1024:.1f} MB[/] | "
+             f"Dupes: [bold]{profile.duplicate_row_count}[/]",
+             title="Dataset Overview",
+         )
+     )
+
+     table = Table(title="Column Profiles")
+     table.add_column("Column", style="cyan")
+     table.add_column("Type")
+     table.add_column("Nulls", justify="right")
+     table.add_column("Unique", justify="right")
+     table.add_column("Sample Values")
+
+     for name, col in profile.columns.items():
+         null_str = (
+             f"{col.null_count} ({col.null_ratio:.0%})" if col.null_count > 0 else "0"
+         )
+         samples = ", ".join(str(v) for v in col.sample_values[:3])
+         semantic = f" [{col.semantic_type}]" if col.semantic_type else ""
+         table.add_row(
+             name, f"{col.dtype}{semantic}", null_str, str(col.unique_count), samples
+         )
+
+     console.print(table)
+
+
+ def format_issues(findings: list[Finding]) -> None:
+     """Print detected issues."""
+     if not findings:
+         console.print("[green]No issues detected![/]")
+         return
+
+     table = Table(title=f"Issues Found ({len(findings)})")
+     table.add_column("Severity", justify="center")
+     table.add_column("Detector")
+     table.add_column("Message")
+     table.add_column("Columns")
+
+     for f in findings:
+         color = SEVERITY_COLORS.get(f.severity.value, "white")
+         table.add_row(
+             Text(f.severity.value.upper(), style=color),
+             f.detector,
+             f.message,
+             ", ".join(f.columns),
+         )
+
+     console.print(table)
+
+
+ def format_suggestions(suggestions: list[Suggestion]) -> None:
+     """Print suggestions."""
+     if not suggestions:
+         console.print("[green]No suggestions.[/]")
+         return
+
+     table = Table(title=f"Suggestions ({len(suggestions)})")
+     table.add_column("#", justify="right", style="bold")
+     table.add_column("Priority", justify="center")
+     table.add_column("Action")
+     table.add_column("Impact")
+     table.add_column("Rationale")
+
+     for s in suggestions:
+         color = SEVERITY_COLORS.get(s.priority.value, "white")
+         table.add_row(
+             str(s.id),
+             Text(s.priority.value.upper(), style=color),
+             s.action,
+             s.impact,
+             s.rationale,
+         )
+
+     console.print(table)
+
+
+ def format_transformation_summary(
+     before_rows: int, after_rows: int, n_applied: int
+ ) -> None:
+     """Print transformation summary."""
+     console.print(
+         Panel(
+             f"Applied [bold]{n_applied}[/] transformation(s)\n"
+             f"Rows: {before_rows} → {after_rows} ({before_rows - after_rows} removed)",
+             title="Cleaning Summary",
+         )
+     )
datawash/cli/main.py ADDED
@@ -0,0 +1,168 @@
+ """CLI entry point for datawash."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Optional
+
+ import typer
+ from rich.console import Console
+
+ app = typer.Typer(
+     name="datawash",
+     help="Intelligent data cleaning and quality analysis.",
+     no_args_is_help=True,
+ )
+ console = Console()
+
+
+ @app.command()
+ def analyze(
+     file: Path = typer.Argument(..., help="Path to data file"),
+     sample: Optional[int] = typer.Option(
+         None, "--sample", "-s", help="Number of rows to sample"
+     ),
+     verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed output"),
+ ) -> None:
+     """Analyze a dataset and show quality report."""
+     from datawash.cli.formatters import (
+         format_issues,
+         format_profile,
+         format_suggestions,
+     )
+     from datawash.core.report import Report
+
+     with console.status("Analyzing..."):
+         report = Report(str(file))
+
+     format_profile(report.profile)
+     console.print()
+     format_issues(report.issues)
+     console.print()
+     format_suggestions(report.suggestions)
+
+
+ @app.command()
+ def suggest(
+     file: Path = typer.Argument(..., help="Path to data file"),
+     use_case: str = typer.Option(
+         "general", "--use-case", "-u", help="Context: general, ml, analytics, export"
+     ),
+     priority: str = typer.Option(
+         "all", "--priority", "-p", help="Filter: high, medium, low, all"
+     ),
+     limit: int = typer.Option(20, "--limit", "-l", help="Max suggestions"),
+ ) -> None:
+     """Show cleaning suggestions for a dataset."""
+     from datawash.cli.formatters import format_suggestions
+     from datawash.core.report import Report
+
+     with console.status("Analyzing..."):
+         report = Report(str(file), use_case=use_case)
+
+     suggestions = report.suggestions
+     if priority != "all":
+         suggestions = [s for s in suggestions if s.priority.value == priority]
+     suggestions = suggestions[:limit]
+
+     format_suggestions(suggestions)
+
+
+ @app.command()
+ def clean(
+     file: Path = typer.Argument(..., help="Path to data file"),
+     output: Path = typer.Option(..., "--output", "-o", help="Output file path"),
+     apply: Optional[str] = typer.Option(
+         None, "--apply", "-a", help="Suggestion IDs (comma-separated)"
+     ),
+     apply_all: bool = typer.Option(False, "--apply-all", help="Apply all suggestions"),
+     use_case: str = typer.Option("general", "--use-case", "-u", help="Context"),
+     codegen: Optional[Path] = typer.Option(
+         None, "--codegen", help="Also save generated Python code"
+     ),
+ ) -> None:
+     """Clean a dataset by applying suggestions."""
+     from datawash.adapters.base import _ADAPTERS
+     from datawash.cli.formatters import format_transformation_summary
+     from datawash.core.report import Report
+
+     with console.status("Analyzing..."):
+         report = Report(str(file), use_case=use_case)
+
+     before_rows = len(report.df)
+
+     if apply_all:
+         clean_df = report.apply_all()
+     elif apply:
+         ids = [int(x.strip()) for x in apply.split(",")]
+         clean_df = report.apply(ids)
+     else:
+         console.print("[red]Specify --apply or --apply-all[/]")
+         raise typer.Exit(1)
+
+     # Save output
+     ext = output.suffix.lstrip(".")
+     adapter = _ADAPTERS.get(ext)
+     if adapter is None:
+         console.print(f"[red]Unsupported output format: {ext}[/]")
+         raise typer.Exit(1)
+     adapter.write(clean_df, output)
+     console.print(f"Saved cleaned data to [bold]{output}[/]")
+
+     format_transformation_summary(before_rows, len(clean_df), len(report._applied))
+
+     # Show before/after quality score
+     score_before = getattr(report, "_last_score_before", None)
+     score_after = getattr(report, "_last_score_after", None)
+     if score_before is not None and score_after is not None:
+         diff = score_after - score_before
+         sign = "+" if diff >= 0 else ""
+         console.print(f"Quality score: {score_before} → {score_after} ({sign}{diff})")
+
+     if codegen:
+         code = report.generate_code()
+         codegen.write_text(code)
+         console.print(f"Saved code to [bold]{codegen}[/]")
+
+
+ @app.command()
+ def codegen(
+     file: Path = typer.Argument(..., help="Path to data file"),
+     apply: Optional[str] = typer.Option(
+         None, "--apply", "-a", help="Suggestion IDs (comma-separated)"
+     ),
+     apply_all: bool = typer.Option(
+         False, "--apply-all", help="Generate code for all suggestions"
+     ),
+     style: str = typer.Option(
+         "function", "--style", "-s", help="Code style: function or script"
+     ),
+     output: Optional[Path] = typer.Option(
+         None, "--output", "-o", help="Output Python file"
+     ),
+ ) -> None:
+     """Generate Python code for data cleaning transformations."""
+     from datawash.core.report import Report
+
+     with console.status("Analyzing..."):
+         report = Report(str(file))
+
+     if apply_all:
+         report.apply_all()
+     elif apply:
+         ids = [int(x.strip()) for x in apply.split(",")]
+         report.apply(ids)
+     else:
+         report.apply_all()
+
+     code = report.generate_code(style=style)
+
+     if output:
+         output.write_text(code)
+         console.print(f"Saved code to [bold]{output}[/]")
+     else:
+         console.print(code)
+
+
+ if __name__ == "__main__":
+     app()
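Assuming the console script is installed under the typer app name datawash (the wheel's entry_points.txt is listed above but its contents are not shown), typical invocations of the four commands would look like this; sales.csv and the output paths are placeholders:

    $ datawash analyze sales.csv
    $ datawash suggest sales.csv --use-case ml --priority high
    $ datawash clean sales.csv --output sales_clean.parquet --apply 1,3 --codegen clean.py
    $ datawash codegen sales.csv --apply-all --style script --output pipeline.py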
datawash/codegen/__init__.py ADDED
@@ -0,0 +1 @@
+ from .generator import generate_code as generate_code
datawash/codegen/generator.py ADDED
@@ -0,0 +1,72 @@
+ """Python code generation from transformation log."""
+
+ from __future__ import annotations
+
+ from datawash.core.models import TransformationResult
+
+
+ def generate_code(
+     results: list[TransformationResult],
+     style: str = "function",
+     include_comments: bool = True,
+ ) -> str:
+     """Generate Python code from a list of transformation results.
+
+     Args:
+         results: List of TransformationResult from applied transformations.
+         style: "function" wraps in a function, "script" generates standalone.
+         include_comments: Whether to add explanatory comments.
+
+     Returns:
+         Python source code as a string.
+     """
+     if not results:
+         return "# No transformations to apply"
+
+     lines: list[str] = []
+
+     # Header
+     lines.append("import pandas as pd")
+     lines.append("import numpy as np")
+     lines.append("")
+
+     if style == "function":
+         lines.append("")
+         lines.append("def clean_data(df: pd.DataFrame) -> pd.DataFrame:")
+         lines.append('    """Apply data cleaning transformations."""')
+         lines.append("    df = df.copy()")
+         lines.append("")
+         for result in results:
+             if include_comments:
+                 lines.append(
+                     f"    # {result.transformer}: {result.rows_affected} rows affected"
+                 )
+             for code_line in result.code.split("\n"):
+                 if code_line.strip():
+                     # Skip redundant imports inside function
+                     if code_line.startswith("import "):
+                         continue
+                     lines.append(f"    {code_line}")
+             lines.append("")
+         lines.append("    return df")
+     else:
+         if include_comments:
+             lines.append("# Load data")
+         lines.append('df = pd.read_csv("input.csv")  # Update path as needed')
+         lines.append("")
+         for result in results:
+             if include_comments:
+                 lines.append(
+                     f"# {result.transformer}: {result.rows_affected} rows affected"
+                 )
+             for code_line in result.code.split("\n"):
+                 if code_line.strip():
+                     if code_line.startswith("import "):
+                         continue
+                     lines.append(code_line)
+             lines.append("")
+         if include_comments:
+             lines.append("# Save cleaned data")
+         lines.append('df.to_csv("output.csv", index=False)')
+
+     return "\n".join(lines) + "\n"
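A quick sketch of the generator's duck typing: generate_code only reads the transformer, rows_affected, and code attributes of each result, so a SimpleNamespace can stand in for TransformationResult (whose full definition lives in datawash/core/models.py and is not reproduced in this section):

    from types import SimpleNamespace

    from datawash.codegen import generate_code

    step = SimpleNamespace(
        transformer="duplicates",
        rows_affected=3,
        code="df = df.drop_duplicates()",
    )
    print(generate_code([step], style="function"))

This prints a clean_data function whose body contains the drop_duplicates line indented under the comment "# duplicates: 3 rows affected".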
datawash/core/__init__.py ADDED
@@ -0,0 +1 @@
+ """Core functionality for datawash."""
datawash/core/cache.py ADDED
@@ -0,0 +1,64 @@
+ """Lazy-evaluated computation cache shared across detectors."""
+
+ from __future__ import annotations
+
+ from typing import Any
+
+ import pandas as pd
+
+
+ class ComputationCache:
+     """Cache expensive column computations.
+
+     Computes on first access, returns cached value on subsequent access.
+     All detectors share the same cache instance to avoid redundant work.
+     """
+
+     def __init__(self, df: pd.DataFrame) -> None:
+         self._df = df
+         self._null_masks: dict[str, pd.Series] = {}
+         self._value_sets: dict[str, set[str]] = {}
+         self._unique_counts: dict[str, int] = {}
+         self._statistics: dict[str, dict[str, Any]] = {}
+
+     def get_null_mask(self, column: str) -> pd.Series:
+         """Return boolean mask of null values. Cached."""
+         if column not in self._null_masks:
+             self._null_masks[column] = self._df[column].isna()
+         return self._null_masks[column]
+
+     def get_value_set(self, column: str, max_values: int = 10000) -> set[str]:
+         """Return set of unique non-null string values. Cached."""
+         if column not in self._value_sets:
+             values = self._df[column].dropna()
+             if len(values) > max_values:
+                 values = values.sample(max_values, random_state=42)
+             self._value_sets[column] = set(values.astype(str))
+         return self._value_sets[column]
+
+     def get_unique_count(self, column: str) -> int:
+         """Return count of unique values. Cached."""
+         if column not in self._unique_counts:
+             self._unique_counts[column] = int(self._df[column].nunique())
+         return self._unique_counts[column]
+
+     def get_statistics(self, column: str) -> dict[str, Any]:
+         """Return numeric statistics. Cached."""
+         if column not in self._statistics:
+             col = self._df[column]
+             if pd.api.types.is_numeric_dtype(col):
+                 clean = col.dropna()
+                 if clean.empty:
+                     self._statistics[column] = {}
+                 else:
+                     self._statistics[column] = {
+                         "mean": float(clean.mean()),
+                         "std": float(clean.std()) if len(clean) > 1 else 0.0,
+                         "min": float(clean.min()),
+                         "max": float(clean.max()),
+                         "q1": float(clean.quantile(0.25)),
+                         "q3": float(clean.quantile(0.75)),
+                     }
+             else:
+                 self._statistics[column] = {}
+         return self._statistics[column]
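A minimal sketch of the shared-cache contract above; the DataFrame and its price column are made up for illustration:

    import pandas as pd

    from datawash.core.cache import ComputationCache

    df = pd.DataFrame({"price": [9.99, 12.50, None, 9.99]})
    cache = ComputationCache(df)

    stats = cache.get_statistics("price")          # computed on first access
    assert stats is cache.get_statistics("price")  # same dict returned from cache
    print(cache.get_unique_count("price"), stats["mean"])

Passing one ComputationCache instance to every detector is what prevents each of them from recomputing null masks and statistics column by column.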
datawash/core/config.py ADDED
@@ -0,0 +1,56 @@
+ """Configuration management for datawash."""
+
+ from __future__ import annotations
+
+ from typing import Any, Literal
+
+ from pydantic import BaseModel, Field
+
+
+ class DetectorConfig(BaseModel):
+     enabled: list[str] = Field(
+         default_factory=lambda: [
+             "missing",
+             "duplicates",
+             "types",
+             "formats",
+             "outliers",
+             "similarity",
+         ]
+     )
+     similarity_name_threshold: float = 0.8
+     similarity_value_threshold: float = 0.7
+     outlier_method: Literal["iqr", "zscore"] = "iqr"
+     outlier_threshold: float = 1.5
+     fuzzy_duplicates_enabled: bool = False
+     fuzzy_duplicates_threshold: float = 0.85
+
+
+ class MLConfig(BaseModel):
+     embedding_model: str = "all-MiniLM-L6-v2"
+     device: str = "cpu"
+
+
+ class SuggestionConfig(BaseModel):
+     max_suggestions: int = 50
+     min_confidence: float = 0.7
+
+
+ class CodegenConfig(BaseModel):
+     style: Literal["function", "script"] = "function"
+     include_comments: bool = True
+
+
+ class Config(BaseModel):
+     sample_size: int = 10000
+     max_unique_ratio: float = 0.95
+     null_threshold: float = 0.5
+     detectors: DetectorConfig = Field(default_factory=DetectorConfig)
+     ml: MLConfig = Field(default_factory=MLConfig)
+     suggestions: SuggestionConfig = Field(default_factory=SuggestionConfig)
+     codegen: CodegenConfig = Field(default_factory=CodegenConfig)
+     use_case: Literal["general", "ml", "analytics", "export"] = "general"
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> Config:
+         return cls.model_validate(data)
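A hedged sketch of overriding defaults through Config.from_dict; the values are illustrative, not recommended settings. Fields left out of the dict keep the defaults declared above:

    from datawash.core.config import Config

    cfg = Config.from_dict(
        {
            "sample_size": 5000,
            "use_case": "ml",
            "detectors": {"outlier_method": "zscore", "outlier_threshold": 3.0},
        }
    )
    print(cfg.detectors.outlier_method)     # zscore
    print(cfg.suggestions.max_suggestions)  # 50 (default)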