datawash-0.2.0-py3-none-any.whl
- datawash/__init__.py +9 -0
- datawash/adapters/__init__.py +12 -0
- datawash/adapters/base.py +66 -0
- datawash/adapters/csv_adapter.py +23 -0
- datawash/adapters/excel_adapter.py +36 -0
- datawash/adapters/json_adapter.py +21 -0
- datawash/adapters/parquet_adapter.py +34 -0
- datawash/cli/__init__.py +0 -0
- datawash/cli/formatters.py +110 -0
- datawash/cli/main.py +168 -0
- datawash/codegen/__init__.py +1 -0
- datawash/codegen/generator.py +72 -0
- datawash/core/__init__.py +1 -0
- datawash/core/cache.py +64 -0
- datawash/core/config.py +56 -0
- datawash/core/dtypes.py +24 -0
- datawash/core/exceptions.py +21 -0
- datawash/core/models.py +78 -0
- datawash/core/report.py +430 -0
- datawash/core/sampling.py +84 -0
- datawash/detectors/__init__.py +13 -0
- datawash/detectors/base.py +27 -0
- datawash/detectors/duplicate_detector.py +56 -0
- datawash/detectors/format_detector.py +130 -0
- datawash/detectors/missing_detector.py +78 -0
- datawash/detectors/outlier_detector.py +93 -0
- datawash/detectors/registry.py +64 -0
- datawash/detectors/similarity_detector.py +294 -0
- datawash/detectors/type_detector.py +100 -0
- datawash/profiler/__init__.py +1 -0
- datawash/profiler/engine.py +88 -0
- datawash/profiler/parallel.py +122 -0
- datawash/profiler/patterns.py +80 -0
- datawash/profiler/statistics.py +41 -0
- datawash/suggestors/__init__.py +1 -0
- datawash/suggestors/base.py +15 -0
- datawash/suggestors/engine.py +327 -0
- datawash/suggestors/prioritizer.py +23 -0
- datawash/transformers/__init__.py +13 -0
- datawash/transformers/base.py +27 -0
- datawash/transformers/categories.py +64 -0
- datawash/transformers/columns.py +72 -0
- datawash/transformers/duplicates.py +43 -0
- datawash/transformers/formats.py +95 -0
- datawash/transformers/missing.py +201 -0
- datawash/transformers/registry.py +30 -0
- datawash/transformers/types.py +95 -0
- datawash-0.2.0.dist-info/METADATA +353 -0
- datawash-0.2.0.dist-info/RECORD +53 -0
- datawash-0.2.0.dist-info/WHEEL +5 -0
- datawash-0.2.0.dist-info/entry_points.txt +2 -0
- datawash-0.2.0.dist-info/licenses/LICENSE +21 -0
- datawash-0.2.0.dist-info/top_level.txt +1 -0
datawash/adapters/__init__.py
ADDED

@@ -0,0 +1,12 @@

"""Data adapters for loading and saving DataFrames."""

# Import adapters to trigger registration
from datawash.adapters import (
    csv_adapter,  # noqa: F401
    excel_adapter,  # noqa: F401
    json_adapter,  # noqa: F401
    parquet_adapter,  # noqa: F401
)
from datawash.adapters.base import load_dataframe

__all__ = ["load_dataframe"]

datawash/adapters/base.py
ADDED

@@ -0,0 +1,66 @@

"""Base adapter interface and loader dispatch."""

from __future__ import annotations

import logging
from pathlib import Path
from typing import Any, Optional, Protocol

import pandas as pd

from datawash.core.exceptions import AdapterError

logger = logging.getLogger(__name__)


class DataAdapter(Protocol):
    """Protocol for data adapters."""

    def read(self, path: Path, **kwargs: Any) -> pd.DataFrame: ...
    def write(self, df: pd.DataFrame, path: Path, **kwargs: Any) -> None: ...


_ADAPTERS: dict[str, DataAdapter] = {}


def register_adapter(extension: str, adapter: DataAdapter) -> None:
    _ADAPTERS[extension] = adapter


def load_dataframe(
    source: str | Path, format: Optional[str] = None, **kwargs: Any
) -> pd.DataFrame:
    """Load a DataFrame from a file path.

    Args:
        source: Path to the data file.
        format: Force file format. Auto-detected from extension if None.
        **kwargs: Passed to the adapter's read method.

    Returns:
        Loaded DataFrame.

    Raises:
        AdapterError: If the file cannot be loaded.
    """
    path = Path(source)
    if not path.exists():
        raise AdapterError(f"File not found: {path}")

    ext = format or path.suffix.lstrip(".")
    adapter = _ADAPTERS.get(ext)
    if adapter is None:
        raise AdapterError(
            f"Unsupported format: '{ext}'. "
            f"Supported formats: {', '.join(sorted(_ADAPTERS.keys()))}"
        )

    try:
        logger.info("Loading %s with %s adapter", path, ext)
        df = adapter.read(path, **kwargs)
        logger.info("Loaded %d rows, %d columns", len(df), len(df.columns))
        return df
    except AdapterError:
        raise
    except Exception as e:
        raise AdapterError(f"Failed to read {path}: {e}") from e

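The registry dispatch above keeps the loader decoupled from concrete formats: adapters self-register by extension at import time, and load_dataframe looks the handler up by file suffix. A minimal sketch of plugging in a custom format against this API (the PipeDelimitedAdapter name and the .psv extension are illustrative, not part of the package):

    from pathlib import Path
    from typing import Any

    import pandas as pd

    from datawash.adapters.base import load_dataframe, register_adapter


    class PipeDelimitedAdapter:
        """Hypothetical adapter for pipe-delimited files."""

        def read(self, path: Path, **kwargs: Any) -> pd.DataFrame:
            return pd.read_csv(path, sep="|", **kwargs)

        def write(self, df: pd.DataFrame, path: Path, **kwargs: Any) -> None:
            df.to_csv(path, sep="|", index=False, **kwargs)


    register_adapter("psv", PipeDelimitedAdapter())
    df = load_dataframe("records.psv")  # dispatches on the .psv suffix
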
datawash/adapters/csv_adapter.py
ADDED

@@ -0,0 +1,23 @@

"""CSV file adapter."""

from __future__ import annotations

from pathlib import Path
from typing import Any

import pandas as pd

from datawash.adapters.base import register_adapter


class CsvAdapter:
    def read(self, path: Path, **kwargs: Any) -> pd.DataFrame:
        return pd.read_csv(path, **kwargs)

    def write(self, df: pd.DataFrame, path: Path, **kwargs: Any) -> None:
        df.to_csv(path, index=False, **kwargs)


_adapter = CsvAdapter()
register_adapter("csv", _adapter)
register_adapter("tsv", _adapter)  # TSV uses same adapter with sep='\t'

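Per the comment on the tsv registration, the shared adapter does not set the separator itself; the caller supplies it, and load_dataframe forwards **kwargs through to the adapter's read method. A sketch (the file name is illustrative):

    from datawash.adapters.base import load_dataframe

    # kwargs flow through load_dataframe -> CsvAdapter.read -> pd.read_csv
    df = load_dataframe("events.tsv", sep="\t")
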
datawash/adapters/excel_adapter.py
ADDED

@@ -0,0 +1,36 @@

"""Excel file adapter (requires openpyxl)."""

from __future__ import annotations

from pathlib import Path
from typing import Any

import pandas as pd

from datawash.adapters.base import register_adapter
from datawash.core.exceptions import AdapterError


class ExcelAdapter:
    def read(self, path: Path, **kwargs: Any) -> pd.DataFrame:
        try:
            return pd.read_excel(path, **kwargs)
        except ImportError:
            raise AdapterError(
                "Excel support requires openpyxl. "
                "Install with: pip install datawash[formats]"
            )

    def write(self, df: pd.DataFrame, path: Path, **kwargs: Any) -> None:
        try:
            df.to_excel(path, index=False, **kwargs)
        except ImportError:
            raise AdapterError(
                "Excel support requires openpyxl. "
                "Install with: pip install datawash[formats]"
            )


_adapter = ExcelAdapter()
register_adapter("xlsx", _adapter)
register_adapter("xls", _adapter)

datawash/adapters/json_adapter.py
ADDED

@@ -0,0 +1,21 @@

"""JSON file adapter."""

from __future__ import annotations

from pathlib import Path
from typing import Any

import pandas as pd

from datawash.adapters.base import register_adapter


class JsonAdapter:
    def read(self, path: Path, **kwargs: Any) -> pd.DataFrame:
        return pd.read_json(path, **kwargs)

    def write(self, df: pd.DataFrame, path: Path, **kwargs: Any) -> None:
        df.to_json(path, orient="records", indent=2, **kwargs)


register_adapter("json", JsonAdapter())

datawash/adapters/parquet_adapter.py
ADDED

@@ -0,0 +1,34 @@

"""Parquet file adapter (requires pyarrow)."""

from __future__ import annotations

from pathlib import Path
from typing import Any

import pandas as pd

from datawash.adapters.base import register_adapter
from datawash.core.exceptions import AdapterError


class ParquetAdapter:
    def read(self, path: Path, **kwargs: Any) -> pd.DataFrame:
        try:
            return pd.read_parquet(path, **kwargs)
        except ImportError:
            raise AdapterError(
                "Parquet support requires pyarrow. "
                "Install with: pip install datawash[formats]"
            )

    def write(self, df: pd.DataFrame, path: Path, **kwargs: Any) -> None:
        try:
            df.to_parquet(path, index=False, **kwargs)
        except ImportError:
            raise AdapterError(
                "Parquet support requires pyarrow. "
                "Install with: pip install datawash[formats]"
            )


register_adapter("parquet", ParquetAdapter())

datawash/cli/__init__.py
ADDED

File without changes

datawash/cli/formatters.py
ADDED

@@ -0,0 +1,110 @@

"""Rich output formatting for CLI."""

from __future__ import annotations

from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from rich.text import Text

from datawash.core.models import DatasetProfile, Finding, Suggestion

console = Console()

SEVERITY_COLORS = {"high": "red", "medium": "yellow", "low": "green"}


def format_profile(profile: DatasetProfile) -> None:
    """Print dataset profile."""
    console.print(
        Panel(
            f"[bold]{profile.row_count}[/] rows x "
            f"[bold]{profile.column_count}[/] columns | "
            f"Memory: [bold]"
            f"{profile.memory_bytes / 1024 / 1024:.1f} MB[/] | "
            f"Dupes: [bold]{profile.duplicate_row_count}[/]",
            title="Dataset Overview",
        )
    )

    table = Table(title="Column Profiles")
    table.add_column("Column", style="cyan")
    table.add_column("Type")
    table.add_column("Nulls", justify="right")
    table.add_column("Unique", justify="right")
    table.add_column("Sample Values")

    for name, col in profile.columns.items():
        null_str = (
            f"{col.null_count} ({col.null_ratio:.0%})" if col.null_count > 0 else "0"
        )
        samples = ", ".join(str(v) for v in col.sample_values[:3])
        semantic = f" [{col.semantic_type}]" if col.semantic_type else ""
        table.add_row(
            name, f"{col.dtype}{semantic}", null_str, str(col.unique_count), samples
        )

    console.print(table)


def format_issues(findings: list[Finding]) -> None:
    """Print detected issues."""
    if not findings:
        console.print("[green]No issues detected![/]")
        return

    table = Table(title=f"Issues Found ({len(findings)})")
    table.add_column("Severity", justify="center")
    table.add_column("Detector")
    table.add_column("Message")
    table.add_column("Columns")

    for f in findings:
        color = SEVERITY_COLORS.get(f.severity.value, "white")
        table.add_row(
            Text(f.severity.value.upper(), style=color),
            f.detector,
            f.message,
            ", ".join(f.columns),
        )

    console.print(table)


def format_suggestions(suggestions: list[Suggestion]) -> None:
    """Print suggestions."""
    if not suggestions:
        console.print("[green]No suggestions.[/]")
        return

    table = Table(title=f"Suggestions ({len(suggestions)})")
    table.add_column("#", justify="right", style="bold")
    table.add_column("Priority", justify="center")
    table.add_column("Action")
    table.add_column("Impact")
    table.add_column("Rationale")

    for s in suggestions:
        color = SEVERITY_COLORS.get(s.priority.value, "white")
        table.add_row(
            str(s.id),
            Text(s.priority.value.upper(), style=color),
            s.action,
            s.impact,
            s.rationale,
        )

    console.print(table)


def format_transformation_summary(
    before_rows: int, after_rows: int, n_applied: int
) -> None:
    """Print transformation summary."""
    console.print(
        Panel(
            f"Applied [bold]{n_applied}[/] transformation(s)\n"
            f"Rows: {before_rows} → {after_rows} ({before_rows - after_rows} removed)",
            title="Cleaning Summary",
        )
    )

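The formatters only read attributes off the models defined in datawash/core/models.py (not shown in this hunk). A minimal sketch of driving format_issues with a duck-typed stand-in, assuming Finding exposes severity (an enum whose .value is "high"/"medium"/"low"), detector, message, and columns, exactly as the code above accesses them:

    from enum import Enum

    from datawash.cli.formatters import format_issues


    # Stand-ins for the real models in datawash.core.models (not shown here);
    # they mimic only the attributes format_issues actually reads.
    class Severity(Enum):
        HIGH = "high"


    class FakeFinding:
        severity = Severity.HIGH
        detector = "missing"
        message = "Column 'email' is 42% null"
        columns = ["email"]


    format_issues([FakeFinding()])  # renders a Rich table with a red HIGH row
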
datawash/cli/main.py
ADDED

@@ -0,0 +1,168 @@

"""CLI entry point for datawash."""

from __future__ import annotations

from pathlib import Path
from typing import Optional

import typer
from rich.console import Console

app = typer.Typer(
    name="datawash",
    help="Intelligent data cleaning and quality analysis.",
    no_args_is_help=True,
)
console = Console()


@app.command()
def analyze(
    file: Path = typer.Argument(..., help="Path to data file"),
    sample: Optional[int] = typer.Option(
        None, "--sample", "-s", help="Number of rows to sample"
    ),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed output"),
) -> None:
    """Analyze a dataset and show quality report."""
    from datawash.cli.formatters import (
        format_issues,
        format_profile,
        format_suggestions,
    )
    from datawash.core.report import Report

    with console.status("Analyzing..."):
        report = Report(str(file))

    format_profile(report.profile)
    console.print()
    format_issues(report.issues)
    console.print()
    format_suggestions(report.suggestions)


@app.command()
def suggest(
    file: Path = typer.Argument(..., help="Path to data file"),
    use_case: str = typer.Option(
        "general", "--use-case", "-u", help="Context: general, ml, analytics, export"
    ),
    priority: str = typer.Option(
        "all", "--priority", "-p", help="Filter: high, medium, low, all"
    ),
    limit: int = typer.Option(20, "--limit", "-l", help="Max suggestions"),
) -> None:
    """Show cleaning suggestions for a dataset."""
    from datawash.cli.formatters import format_suggestions
    from datawash.core.report import Report

    with console.status("Analyzing..."):
        report = Report(str(file), use_case=use_case)

    suggestions = report.suggestions
    if priority != "all":
        suggestions = [s for s in suggestions if s.priority.value == priority]
    suggestions = suggestions[:limit]

    format_suggestions(suggestions)


@app.command()
def clean(
    file: Path = typer.Argument(..., help="Path to data file"),
    output: Path = typer.Option(..., "--output", "-o", help="Output file path"),
    apply: Optional[str] = typer.Option(
        None, "--apply", "-a", help="Suggestion IDs (comma-separated)"
    ),
    apply_all: bool = typer.Option(False, "--apply-all", help="Apply all suggestions"),
    use_case: str = typer.Option("general", "--use-case", "-u", help="Context"),
    codegen: Optional[Path] = typer.Option(
        None, "--codegen", help="Also save generated Python code"
    ),
) -> None:
    """Clean a dataset by applying suggestions."""
    from datawash.adapters.base import _ADAPTERS
    from datawash.cli.formatters import format_transformation_summary
    from datawash.core.report import Report

    with console.status("Analyzing..."):
        report = Report(str(file), use_case=use_case)

    before_rows = len(report.df)

    if apply_all:
        clean_df = report.apply_all()
    elif apply:
        ids = [int(x.strip()) for x in apply.split(",")]
        clean_df = report.apply(ids)
    else:
        console.print("[red]Specify --apply or --apply-all[/]")
        raise typer.Exit(1)

    # Save output
    ext = output.suffix.lstrip(".")
    adapter = _ADAPTERS.get(ext)
    if adapter is None:
        console.print(f"[red]Unsupported output format: {ext}[/]")
        raise typer.Exit(1)
    adapter.write(clean_df, output)
    console.print(f"Saved cleaned data to [bold]{output}[/]")

    format_transformation_summary(before_rows, len(clean_df), len(report._applied))

    # Show before/after quality score
    score_before = getattr(report, "_last_score_before", None)
    score_after = getattr(report, "_last_score_after", None)
    if score_before is not None and score_after is not None:
        diff = score_after - score_before
        sign = "+" if diff >= 0 else ""
        console.print(f"Quality score: {score_before} → {score_after} ({sign}{diff})")

    if codegen:
        code = report.generate_code()
        codegen.write_text(code)
        console.print(f"Saved code to [bold]{codegen}[/]")


@app.command()
def codegen(
    file: Path = typer.Argument(..., help="Path to data file"),
    apply: Optional[str] = typer.Option(
        None, "--apply", "-a", help="Suggestion IDs (comma-separated)"
    ),
    apply_all: bool = typer.Option(
        False, "--apply-all", help="Generate code for all suggestions"
    ),
    style: str = typer.Option(
        "function", "--style", "-s", help="Code style: function or script"
    ),
    output: Optional[Path] = typer.Option(
        None, "--output", "-o", help="Output Python file"
    ),
) -> None:
    """Generate Python code for data cleaning transformations."""
    from datawash.core.report import Report

    with console.status("Analyzing..."):
        report = Report(str(file))

    if apply_all:
        report.apply_all()
    elif apply:
        ids = [int(x.strip()) for x in apply.split(",")]
        report.apply(ids)
    else:
        report.apply_all()

    code = report.generate_code(style=style)

    if output:
        output.write_text(code)
        console.print(f"Saved code to [bold]{output}[/]")
    else:
        console.print(code)


if __name__ == "__main__":
    app()

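Assuming entry_points.txt (listed above with 2 lines) registers this Typer app as a datawash console script — the app name above suggests as much — typical invocations would look like:

    datawash analyze data.csv
    datawash suggest data.csv --use-case ml --priority high
    datawash clean data.csv -o clean.parquet --apply 1,3,5 --codegen clean.py
    datawash codegen data.csv --apply-all --style script -o pipeline.py

Note that clean dispatches the output format through the same adapter registry as loading, so the output extension (.csv, .parquet, etc.) picks the writer.
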
datawash/codegen/__init__.py
ADDED

@@ -0,0 +1 @@

from .generator import generate_code as generate_code

datawash/codegen/generator.py
ADDED

@@ -0,0 +1,72 @@

"""Python code generation from transformation log."""

from __future__ import annotations

from datawash.core.models import TransformationResult


def generate_code(
    results: list[TransformationResult],
    style: str = "function",
    include_comments: bool = True,
) -> str:
    """Generate Python code from a list of transformation results.

    Args:
        results: List of TransformationResult from applied transformations.
        style: "function" wraps in a function, "script" generates standalone.
        include_comments: Whether to add explanatory comments.

    Returns:
        Python source code as a string.
    """
    if not results:
        return "# No transformations to apply"

    lines: list[str] = []

    # Header
    lines.append("import pandas as pd")
    lines.append("import numpy as np")
    lines.append("")

    if style == "function":
        lines.append("")
        lines.append("def clean_data(df: pd.DataFrame) -> pd.DataFrame:")
        lines.append('    """Apply data cleaning transformations."""')
        lines.append("    df = df.copy()")
        lines.append("")
        for result in results:
            if include_comments:
                lines.append(
                    f"    # {result.transformer}: {result.rows_affected} rows affected"
                )
            for code_line in result.code.split("\n"):
                if code_line.strip():
                    # Skip redundant imports inside function
                    if code_line.startswith("import "):
                        continue
                    lines.append(f"    {code_line}")
            lines.append("")
        lines.append("    return df")
    else:
        if include_comments:
            lines.append("# Load data")
        lines.append('df = pd.read_csv("input.csv")  # Update path as needed')
        lines.append("")
        for result in results:
            if include_comments:
                lines.append(
                    f"# {result.transformer}: {result.rows_affected} rows affected"
                )
            for code_line in result.code.split("\n"):
                if code_line.strip():
                    if code_line.startswith("import "):
                        continue
                    lines.append(code_line)
            lines.append("")
        if include_comments:
            lines.append("# Save cleaned data")
        lines.append('df.to_csv("output.csv", index=False)')

    return "\n".join(lines) + "\n"

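generate_code only reads .transformer, .rows_affected, and .code from each result, so a duck-typed stand-in is enough to see the output shape (the real TransformationResult lives in datawash/core/models.py, not shown in this hunk):

    from types import SimpleNamespace

    from datawash.codegen.generator import generate_code

    # Duck-typed stand-in with the three attributes generate_code reads.
    result = SimpleNamespace(
        transformer="drop_duplicates",
        rows_affected=12,
        code="df = df.drop_duplicates()",
    )

    print(generate_code([result], style="function"))
    # Output (abbreviated): imports, then
    #   def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    #       """Apply data cleaning transformations."""
    #       df = df.copy()
    #       # drop_duplicates: 12 rows affected
    #       df = df.drop_duplicates()
    #       return df
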
datawash/core/__init__.py
ADDED

@@ -0,0 +1 @@

"""Core functionality for datawash."""

datawash/core/cache.py
ADDED

@@ -0,0 +1,64 @@

"""Lazy-evaluated computation cache shared across detectors."""

from __future__ import annotations

from typing import Any

import pandas as pd


class ComputationCache:
    """Cache expensive column computations.

    Computes on first access, returns cached value on subsequent access.
    All detectors share the same cache instance to avoid redundant work.
    """

    def __init__(self, df: pd.DataFrame) -> None:
        self._df = df
        self._null_masks: dict[str, pd.Series] = {}
        self._value_sets: dict[str, set[str]] = {}
        self._unique_counts: dict[str, int] = {}
        self._statistics: dict[str, dict[str, Any]] = {}

    def get_null_mask(self, column: str) -> pd.Series:
        """Return boolean mask of null values. Cached."""
        if column not in self._null_masks:
            self._null_masks[column] = self._df[column].isna()
        return self._null_masks[column]

    def get_value_set(self, column: str, max_values: int = 10000) -> set[str]:
        """Return set of unique non-null string values. Cached."""
        if column not in self._value_sets:
            values = self._df[column].dropna()
            if len(values) > max_values:
                values = values.sample(max_values, random_state=42)
            self._value_sets[column] = set(values.astype(str))
        return self._value_sets[column]

    def get_unique_count(self, column: str) -> int:
        """Return count of unique values. Cached."""
        if column not in self._unique_counts:
            self._unique_counts[column] = int(self._df[column].nunique())
        return self._unique_counts[column]

    def get_statistics(self, column: str) -> dict[str, Any]:
        """Return numeric statistics. Cached."""
        if column not in self._statistics:
            col = self._df[column]
            if pd.api.types.is_numeric_dtype(col):
                clean = col.dropna()
                if clean.empty:
                    self._statistics[column] = {}
                else:
                    self._statistics[column] = {
                        "mean": float(clean.mean()),
                        "std": float(clean.std()) if len(clean) > 1 else 0.0,
                        "min": float(clean.min()),
                        "max": float(clean.max()),
                        "q1": float(clean.quantile(0.25)),
                        "q3": float(clean.quantile(0.75)),
                    }
            else:
                self._statistics[column] = {}
        return self._statistics[column]

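A quick sketch of the memoization behavior above: the first access computes, the second returns the very same cached object, and non-numeric columns yield an empty statistics dict.

    import pandas as pd

    from datawash.core.cache import ComputationCache

    df = pd.DataFrame({"price": [10.0, 12.5, None, 11.0], "sku": ["a", "b", "b", "c"]})
    cache = ComputationCache(df)

    stats = cache.get_statistics("price")        # computed once: mean/std/min/max/q1/q3
    stats_again = cache.get_statistics("price")  # cache hit
    assert stats is stats_again                  # same dict object, no recompute

    cache.get_null_mask("price")    # boolean Series, True where price is NaN
    cache.get_unique_count("sku")   # 3
    cache.get_statistics("sku")     # {} for non-numeric columns
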
datawash/core/config.py
ADDED

@@ -0,0 +1,56 @@

"""Configuration management for datawash."""

from __future__ import annotations

from typing import Any, Literal

from pydantic import BaseModel, Field


class DetectorConfig(BaseModel):
    enabled: list[str] = Field(
        default_factory=lambda: [
            "missing",
            "duplicates",
            "types",
            "formats",
            "outliers",
            "similarity",
        ]
    )
    similarity_name_threshold: float = 0.8
    similarity_value_threshold: float = 0.7
    outlier_method: Literal["iqr", "zscore"] = "iqr"
    outlier_threshold: float = 1.5
    fuzzy_duplicates_enabled: bool = False
    fuzzy_duplicates_threshold: float = 0.85


class MLConfig(BaseModel):
    embedding_model: str = "all-MiniLM-L6-v2"
    device: str = "cpu"


class SuggestionConfig(BaseModel):
    max_suggestions: int = 50
    min_confidence: float = 0.7


class CodegenConfig(BaseModel):
    style: Literal["function", "script"] = "function"
    include_comments: bool = True


class Config(BaseModel):
    sample_size: int = 10000
    max_unique_ratio: float = 0.95
    null_threshold: float = 0.5
    detectors: DetectorConfig = Field(default_factory=DetectorConfig)
    ml: MLConfig = Field(default_factory=MLConfig)
    suggestions: SuggestionConfig = Field(default_factory=SuggestionConfig)
    codegen: CodegenConfig = Field(default_factory=CodegenConfig)
    use_case: Literal["general", "ml", "analytics", "export"] = "general"

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Config:
        return cls.model_validate(data)

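Since Config.from_dict delegates to pydantic's model_validate, nested sections can be overridden from a plain dict (e.g. one parsed from a config file), with everything else falling back to the defaults above:

    from datawash.core.config import Config

    config = Config.from_dict(
        {
            "use_case": "ml",
            "detectors": {"outlier_method": "zscore", "outlier_threshold": 3.0},
        }
    )

    assert config.use_case == "ml"
    assert config.detectors.outlier_method == "zscore"
    assert config.sample_size == 10000  # untouched fields keep their defaults
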