datawash-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. datawash/__init__.py +9 -0
  2. datawash/adapters/__init__.py +12 -0
  3. datawash/adapters/base.py +66 -0
  4. datawash/adapters/csv_adapter.py +23 -0
  5. datawash/adapters/excel_adapter.py +36 -0
  6. datawash/adapters/json_adapter.py +21 -0
  7. datawash/adapters/parquet_adapter.py +34 -0
  8. datawash/cli/__init__.py +0 -0
  9. datawash/cli/formatters.py +110 -0
  10. datawash/cli/main.py +168 -0
  11. datawash/codegen/__init__.py +1 -0
  12. datawash/codegen/generator.py +72 -0
  13. datawash/core/__init__.py +1 -0
  14. datawash/core/cache.py +64 -0
  15. datawash/core/config.py +56 -0
  16. datawash/core/dtypes.py +24 -0
  17. datawash/core/exceptions.py +21 -0
  18. datawash/core/models.py +78 -0
  19. datawash/core/report.py +430 -0
  20. datawash/core/sampling.py +84 -0
  21. datawash/detectors/__init__.py +13 -0
  22. datawash/detectors/base.py +27 -0
  23. datawash/detectors/duplicate_detector.py +56 -0
  24. datawash/detectors/format_detector.py +130 -0
  25. datawash/detectors/missing_detector.py +78 -0
  26. datawash/detectors/outlier_detector.py +93 -0
  27. datawash/detectors/registry.py +64 -0
  28. datawash/detectors/similarity_detector.py +294 -0
  29. datawash/detectors/type_detector.py +100 -0
  30. datawash/profiler/__init__.py +1 -0
  31. datawash/profiler/engine.py +88 -0
  32. datawash/profiler/parallel.py +122 -0
  33. datawash/profiler/patterns.py +80 -0
  34. datawash/profiler/statistics.py +41 -0
  35. datawash/suggestors/__init__.py +1 -0
  36. datawash/suggestors/base.py +15 -0
  37. datawash/suggestors/engine.py +327 -0
  38. datawash/suggestors/prioritizer.py +23 -0
  39. datawash/transformers/__init__.py +13 -0
  40. datawash/transformers/base.py +27 -0
  41. datawash/transformers/categories.py +64 -0
  42. datawash/transformers/columns.py +72 -0
  43. datawash/transformers/duplicates.py +43 -0
  44. datawash/transformers/formats.py +95 -0
  45. datawash/transformers/missing.py +201 -0
  46. datawash/transformers/registry.py +30 -0
  47. datawash/transformers/types.py +95 -0
  48. datawash-0.2.0.dist-info/METADATA +353 -0
  49. datawash-0.2.0.dist-info/RECORD +53 -0
  50. datawash-0.2.0.dist-info/WHEEL +5 -0
  51. datawash-0.2.0.dist-info/entry_points.txt +2 -0
  52. datawash-0.2.0.dist-info/licenses/LICENSE +21 -0
  53. datawash-0.2.0.dist-info/top_level.txt +1 -0
datawash/profiler/statistics.py
@@ -0,0 +1,41 @@
+ """Statistical computations for column profiling."""
+
+ from __future__ import annotations
+
+ from typing import Any
+
+ import pandas as pd
+
+
+ def compute_numeric_stats(series: pd.Series) -> dict[str, Any]:
+     """Compute statistics for a numeric column."""
+     clean = series.dropna()
+     if clean.empty:
+         return {}
+     return {
+         "mean": float(clean.mean()),
+         "median": float(clean.median()),
+         "std": float(clean.std()) if len(clean) > 1 else 0.0,
+         "min": float(clean.min()),
+         "max": float(clean.max()),
+         "q25": float(clean.quantile(0.25)),
+         "q75": float(clean.quantile(0.75)),
+         "skewness": float(clean.skew()) if len(clean) > 2 else 0.0,
+         "kurtosis": float(clean.kurtosis()) if len(clean) > 3 else 0.0,
+     }
+
+
+ def compute_categorical_stats(series: pd.Series) -> dict[str, Any]:
+     """Compute statistics for a categorical/string column."""
+     clean = series.dropna()
+     if clean.empty:
+         return {}
+     value_counts = clean.value_counts()
+     top_n = value_counts.head(10)
+     return {
+         "top_values": {str(k): int(v) for k, v in top_n.items()},
+         "mode": str(value_counts.index[0]) if len(value_counts) > 0 else None,
+         "avg_length": float(clean.astype(str).str.len().mean()),
+         "min_length": int(clean.astype(str).str.len().min()),
+         "max_length": int(clean.astype(str).str.len().max()),
+     }
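As a quick sanity check on the two helpers above, here is a minimal usage sketch; the toy series and inline result comments are illustrative, not taken from the package's tests:

import pandas as pd

from datawash.profiler.statistics import (
    compute_categorical_stats,
    compute_numeric_stats,
)

nums = pd.Series([1.0, 2.0, None, 100.0])
stats = compute_numeric_stats(nums)
# After dropna() there are 3 values, so skewness is computed
# but kurtosis (which needs more than 3) falls back to 0.0
print(stats["mean"], stats["median"], stats["kurtosis"])  # 34.333... 2.0 0.0

cats = pd.Series(["a", "a", "b", None])
print(compute_categorical_stats(cats)["mode"])  # a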
datawash/suggestors/__init__.py
@@ -0,0 +1 @@
+ from .engine import generate_suggestions as generate_suggestions
datawash/suggestors/base.py
@@ -0,0 +1,15 @@
+ """Base suggestor interface."""
+
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+
+ from datawash.core.models import Finding, Suggestion
+
+
+ class BaseSuggestor(ABC):
+     """Maps findings to actionable suggestions."""
+
+     @abstractmethod
+     def suggest(self, finding: Finding) -> Suggestion | None:
+         """Generate a suggestion for a finding, or None if not applicable."""
datawash/suggestors/engine.py
@@ -0,0 +1,327 @@
+ """Suggestion generation engine."""
+
+ from __future__ import annotations
+
+ import logging
+ from collections import defaultdict
+
+ from datawash.core.models import Finding, Severity, Suggestion
+ from datawash.suggestors.prioritizer import sort_suggestions
+
+ logger = logging.getLogger(__name__)
+
+
+ # Transformation execution order - later phases should not undo earlier phases
+ # The tuple is (transformer, operation/strategy) for precise matching
+ TRANSFORMATION_ORDER: list[tuple[str, str]] = [
+     # Phase 1: Structural cleaning (affects row count)
+     ("duplicates", "drop_duplicates"),
+     ("missing", "drop_rows"),
+     # Phase 2: Value normalization (changes string values)
+     ("formats", "strip_whitespace"),
+     ("formats", "lowercase"),
+     ("formats", "uppercase"),
+     ("formats", "titlecase"),
+     ("missing", "clean_empty_strings"),  # combined: empty→NaN→fill
+     # Phase 3: Missing value handling (fills NaN)
+     ("missing", "fill_mode"),
+     ("missing", "fill_median"),
+     ("missing", "fill_value"),
+     ("missing", "empty_to_nan"),  # legacy, prefer clean_empty_strings
+     # Phase 4: Type conversion (after all string cleaning done)
+     ("types", "boolean"),
+     ("types", "numeric"),
+     ("formats", "standardize_dates"),
+     # Phase 5: Outlier handling (after types are correct)
+     ("missing", "clip_outliers"),
+     # Phase 6: Column operations (last)
+     ("columns", "drop"),
+     ("columns", "rename"),
+     ("columns", "review_merge"),
+ ]
+
+
+ def _get_transform_order(transformer: str, params: dict) -> int:
+     """Get execution order for a transformation."""
+     # Determine the operation/strategy key
+     if transformer == "missing":
+         key = params.get("strategy", "")
+     elif transformer == "formats":
+         key = params.get("operation", "")
+     elif transformer == "types":
+         key = params.get("target_type", "")
+     elif transformer == "duplicates":
+         key = "drop_duplicates"
+     elif transformer == "columns":
+         key = params.get("operation", "")
+     else:
+         key = ""
+
+     for i, (t, op) in enumerate(TRANSFORMATION_ORDER):
+         if t == transformer and op == key:
+             return i
+     return 999  # Unknown transformations go last
+
+
+ # Exclusion rules: if a column has suggestion A, exclude suggestion B for same column
+ # Key: (transformer, operation/strategy), Value: list of (transformer, operation) to exclude
+ EXCLUSION_RULES: dict[tuple[str, str], list[tuple[str, str]]] = {
+     # If column will be converted to boolean, don't suggest case changes
+     ("types", "boolean"): [
+         ("formats", "lowercase"),
+         ("formats", "uppercase"),
+         ("formats", "titlecase"),
+     ],
+     # If column will be converted to datetime, don't suggest case changes
+     ("formats", "standardize_dates"): [
+         ("formats", "lowercase"),
+         ("formats", "uppercase"),
+         ("formats", "titlecase"),
+     ],
+     # If column will be converted to numeric, don't suggest case changes
+     ("types", "numeric"): [
+         ("formats", "lowercase"),
+         ("formats", "uppercase"),
+         ("formats", "titlecase"),
+     ],
+ }
+
+
+ def _get_transform_key(transformer: str, params: dict) -> tuple[str, str]:
+     """Get the (transformer, operation) key for exclusion matching."""
+     if transformer == "missing":
+         return (transformer, params.get("strategy", ""))
+     elif transformer == "formats":
+         return (transformer, params.get("operation", ""))
+     elif transformer == "types":
+         return (transformer, params.get("target_type", ""))
+     elif transformer == "duplicates":
+         return (transformer, "drop_duplicates")
+     elif transformer == "columns":
+         return (transformer, params.get("operation", ""))
+     return (transformer, "")
+
+
+ def _missing_strategy(finding: Finding) -> str:
+     """Choose fill strategy based on column dtype and null ratio."""
+     if finding.details.get("null_ratio", 0) > 0.5:
+         return "drop_rows"
+     dtype = finding.details.get("dtype", "")
+     # Check for numeric types
+     if any(kw in dtype for kw in ("int", "float", "Int", "Float", "number")):
+         return "fill_median"
+     # Check for boolean types
+     if "bool" in dtype.lower():
+         return "fill_mode"
+     # String/object/categorical → fill_mode
+     return "fill_mode"
+
+
+ # Maps (issue_type) -> (action, transformer, param_builder, impact, rationale)
+ _SUGGESTION_MAP: dict[str, dict] = {
+     "missing_values": {
+         "action": "Handle missing values",
+         "transformer": "missing",
+         "params_fn": lambda f: {
+             "columns": f.columns,
+             "strategy": _missing_strategy(f),
+         },
+         "impact": "Removes or fills null values to prevent errors",
+         "rationale": "Missing values cause errors in ML and analysis",
+     },
+     "empty_strings": {
+         "action": "Clean empty strings",
+         "transformer": "missing",
+         # Use combined strategy that converts empty→NaN and fills in one step
+         "params_fn": lambda f: {
+             "columns": f.columns,
+             "strategy": "clean_empty_strings",
+         },
+         "impact": "Converts empty strings to proper values",
+         "rationale": "Empty strings are often unintentional missing values",
+     },
+     "duplicate_rows": {
+         "action": "Remove duplicate rows",
+         "transformer": "duplicates",
+         "params_fn": lambda f: {"keep": "first"},
+         "impact": "Removes redundant data that skews analysis",
+         "rationale": "Exact duplicates inflate counts and bias statistics",
+     },
+     "inconsistent_case": {
+         "action": "Standardize text casing",
+         "transformer": "formats",
+         "params_fn": lambda f: {"columns": f.columns, "operation": "lowercase"},
+         "impact": "Ensures consistent text representation",
+         "rationale": "Mixed casing causes mismatches in grouping and joins",
+     },
+     "inconsistent_date_format": {
+         "action": "Standardize date format",
+         "transformer": "formats",
+         "params_fn": lambda f: {
+             "columns": f.columns,
+             "operation": "standardize_dates",
+             "target_format": "%Y-%m-%d",
+         },
+         "impact": "Ensures consistent date parsing",
+         "rationale": "Mixed date formats cause parsing errors",
+     },
+     "whitespace_padding": {
+         "action": "Strip whitespace from values",
+         "transformer": "formats",
+         "params_fn": lambda f: {"columns": f.columns, "operation": "strip_whitespace"},
+         "impact": "Removes accidental padding that causes mismatches",
+         "rationale": "Leading/trailing whitespace causes silent matching failures",
+     },
+     "outliers": {
+         "action": "Review and handle outliers",
+         "transformer": "missing",
+         "params_fn": lambda f: {
+             "columns": f.columns,
+             "strategy": "clip_outliers",
+             "method": f.details.get("method", "iqr"),
+             "threshold": f.details.get("threshold", 1.5),
+         },
+         "impact": "Reduces influence of extreme values on analysis",
+         "rationale": "Outliers can heavily skew means and model training",
+     },
+     "numeric_as_string": {
+         "action": "Convert to numeric type",
+         "transformer": "types",
+         "params_fn": lambda f: {"columns": f.columns, "target_type": "numeric"},
+         "impact": "Enables numeric operations and reduces memory",
+         "rationale": "Numeric data stored as strings prevents mathematical operations",
+     },
+     "boolean_as_string": {
+         "action": "Convert to boolean type",
+         "transformer": "types",
+         "params_fn": lambda f: {"columns": f.columns, "target_type": "boolean"},
+         "impact": "Correct type enables boolean operations",
+         "rationale": "Boolean data as strings wastes memory and prevents logic ops",
+     },
+     "similar_columns": {
+         "action": "Review potentially duplicate columns",
+         "transformer": "columns",
+         "params_fn": lambda f: {"columns": f.columns, "operation": "review_merge"},
+         "impact": "May reduce redundant data",
+         "rationale": "Similar columns may be duplicated data or candidates for merging",
+     },
+ }
+
+
+ _USE_CASE_BOOSTS: dict[str, dict[str, float]] = {
+     "ml": {
+         "duplicate_rows": 1.5,
+         "missing_values": 1.3,
+         "numeric_as_string": 1.3,
+         "boolean_as_string": 1.2,
+         "outliers": 1.2,
+         "similar_columns": 1.4,
+     },
+     "analytics": {
+         "missing_values": 1.5,
+         "outliers": 1.3,
+         "inconsistent_date_format": 1.4,
+         "inconsistent_case": 1.2,
+     },
+     "export": {
+         "inconsistent_date_format": 1.5,
+         "whitespace_padding": 1.4,
+         "inconsistent_case": 1.3,
+         "numeric_as_string": 1.3,
+     },
+     "general": {},
+ }
+
+
+ def _apply_exclusion_rules(suggestions: list[Suggestion]) -> list[Suggestion]:
+     """Remove suggestions that conflict with higher-priority transformations."""
+     # Build a map of column → list of (transform_key, suggestion)
+     col_transforms: dict[str, list[tuple[tuple[str, str], Suggestion]]] = defaultdict(
+         list
+     )
+
+     for s in suggestions:
+         key = _get_transform_key(s.transformer, s.params)
+         for col in s.params.get("columns", []):
+             col_transforms[col].append((key, s))
+
+     # Find suggestions to exclude
+     excluded_ids: set[int] = set()
+
+     for col, transforms in col_transforms.items():
+         # Check each transform against exclusion rules
+         for key, _s in transforms:
+             if key in EXCLUSION_RULES:
+                 # This transform excludes certain others for the same column
+                 to_exclude = EXCLUSION_RULES[key]
+                 for other_key, other_s in transforms:
+                     if other_key in to_exclude:
+                         excluded_ids.add(id(other_s))
+                         logger.debug(
+                             "Excluding %s for column '%s' due to %s",
+                             other_key,
+                             col,
+                             key,
+                         )
+
+     return [s for s in suggestions if id(s) not in excluded_ids]
+
+
+ def _sort_by_execution_order(suggestions: list[Suggestion]) -> list[Suggestion]:
+     """Sort suggestions by transformation execution order."""
+     return sorted(
+         suggestions, key=lambda s: _get_transform_order(s.transformer, s.params)
+     )
+
+
+ def generate_suggestions(
+     findings: list[Finding],
+     max_suggestions: int = 50,
+     use_case: str = "general",
+ ) -> list[Suggestion]:
+     """Generate prioritized suggestions from findings."""
+     boosts = _USE_CASE_BOOSTS.get(use_case, {})
+     suggestions: list[Suggestion] = []
+     for finding in findings:
+         mapping = _SUGGESTION_MAP.get(finding.issue_type)
+         if mapping is None:
+             logger.debug(
+                 "No suggestion mapping for: %s",
+                 finding.issue_type,
+             )
+             continue
+
+         # Apply use-case priority boost
+         priority = finding.severity
+         boost = boosts.get(finding.issue_type, 1.0)
+         if boost >= 1.4 and priority == Severity.LOW:
+             priority = Severity.MEDIUM
+         elif boost >= 1.3 and priority == Severity.MEDIUM:
+             priority = Severity.HIGH
+
+         action = mapping["action"]
+         # Include column names in action text for column-specific suggestions
+         if finding.columns and len(finding.columns) <= 3:
+             col_str = ", ".join(f"'{c}'" for c in finding.columns)
+             action = f"{action} in {col_str}"
+
+         suggestion = Suggestion(
+             id=0,
+             finding=finding,
+             action=action,
+             transformer=mapping["transformer"],
+             params=mapping["params_fn"](finding),
+             priority=priority,
+             impact=mapping["impact"],
+             rationale=mapping["rationale"],
+         )
+         suggestions.append(suggestion)
+
+     # Step 1: Apply exclusion rules (remove conflicting suggestions)
+     suggestions = _apply_exclusion_rules(suggestions)
+
+     # Step 2: Sort by priority; sort_suggestions also reassigns IDs
+     suggestions = sort_suggestions(suggestions)
+
+     # Step 3: Limit to max_suggestions
+     return suggestions[:max_suggestions]
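End to end, the engine can be exercised with a single finding. The Finding constructor below is an assumption: models.py is not shown in this section, so the keyword arguments are inferred from the attribute accesses in engine.py:

from datawash.core.models import Finding, Severity
from datawash.suggestors.engine import generate_suggestions

# Hypothetical finding; the Finding constructor signature is assumed
# from how engine.py reads issue_type, severity, columns, details.
finding = Finding(
    issue_type="missing_values",
    severity=Severity.MEDIUM,
    columns=["age"],
    details={"null_ratio": 0.1, "dtype": "float64"},
    confidence=0.9,
)

for s in generate_suggestions([finding], use_case="ml"):
    # "ml" boosts missing_values by 1.3, promoting MEDIUM to HIGH
    print(s.id, s.priority, s.action, s.params)
# Expected params: {'columns': ['age'], 'strategy': 'fill_median'}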
datawash/suggestors/prioritizer.py
@@ -0,0 +1,23 @@
+ """Suggestion prioritization."""
+
+ from __future__ import annotations
+
+ from datawash.core.models import Severity, Suggestion
+
+ SEVERITY_WEIGHTS = {Severity.HIGH: 3, Severity.MEDIUM: 2, Severity.LOW: 1}
+
+
+ def priority_score(suggestion: Suggestion) -> float:
+     """Compute a numeric priority score for sorting."""
+     severity_val = SEVERITY_WEIGHTS.get(suggestion.priority, 1)
+     confidence = suggestion.finding.confidence
+     # Impact approximated from severity
+     return severity_val * 0.5 + confidence * 0.5
+
+
+ def sort_suggestions(suggestions: list[Suggestion]) -> list[Suggestion]:
+     """Sort suggestions by priority score descending, reassign IDs."""
+     ranked = sorted(suggestions, key=priority_score, reverse=True)
+     for i, s in enumerate(ranked, 1):
+         s.id = i
+     return ranked
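For intuition: with the weights above, a HIGH-severity suggestion whose finding has confidence 0.9 scores 3 * 0.5 + 0.9 * 0.5 = 1.95, while a LOW-severity suggestion at full confidence scores 1 * 0.5 + 1.0 * 0.5 = 1.0, so severity dominates the ranking unless confidences differ sharply.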
datawash/transformers/__init__.py
@@ -0,0 +1,13 @@
+ """Data transformers."""
+
+ # Import to trigger registration
+ from . import (  # noqa: F401
+     categories,
+     columns,
+     duplicates,
+     formats,
+     missing,
+     types,
+ )
+ from .registry import get_transformer as get_transformer
+ from .registry import run_transformer as run_transformer
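registry.py (+30 lines) is part of this release but its body is not reproduced in this diff; a minimal sketch consistent with the three names used in this package (register_transformer, get_transformer, run_transformer) might look like the following — the actual module may well differ:

from typing import Any

import pandas as pd

from datawash.core.models import TransformationResult
from datawash.transformers.base import BaseTransformer

# Module-level registry keyed by transformer name (assumed design)
_REGISTRY: dict[str, BaseTransformer] = {}


def register_transformer(transformer: BaseTransformer) -> None:
    _REGISTRY[transformer.name] = transformer


def get_transformer(name: str) -> BaseTransformer:
    return _REGISTRY[name]


def run_transformer(
    name: str, df: pd.DataFrame, **params: Any
) -> tuple[pd.DataFrame, TransformationResult]:
    return get_transformer(name).transform(df, **params)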
datawash/transformers/base.py
@@ -0,0 +1,27 @@
+ """Base transformer interface."""
+
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ import pandas as pd
+
+ from datawash.core.models import TransformationResult
+
+
+ class BaseTransformer(ABC):
+     @property
+     @abstractmethod
+     def name(self) -> str:
+         """Unique transformer name."""
+
+     @abstractmethod
+     def transform(
+         self, df: pd.DataFrame, **params: Any
+     ) -> tuple[pd.DataFrame, TransformationResult]:
+         """Apply transformation. Returns (new_df, result). Must NOT mutate input."""
+
+     @abstractmethod
+     def generate_code(self, **params: Any) -> str:
+         """Return equivalent pandas code string."""
datawash/transformers/categories.py
@@ -0,0 +1,64 @@
+ """Category normalization."""
+
+ from __future__ import annotations
+
+ from typing import Any
+
+ import pandas as pd
+
+ from datawash.core.models import TransformationResult
+ from datawash.transformers.base import BaseTransformer
+ from datawash.transformers.registry import register_transformer
+
+
+ class CategoryTransformer(BaseTransformer):
+     @property
+     def name(self) -> str:
+         return "categories"
+
+     def transform(
+         self, df: pd.DataFrame, **params: Any
+     ) -> tuple[pd.DataFrame, TransformationResult]:
+         columns = params.get("columns", [])
+         mapping = params.get("mapping", {})
+         result_df = df.copy()
+         affected = 0
+
+         for col in columns:
+             if col not in result_df.columns:
+                 continue
+             if mapping:
+                 mask = result_df[col].isin(mapping.keys())
+                 affected += int(mask.sum())
+                 result_df[col] = result_df[col].replace(mapping)
+             else:
+                 # Auto-normalize: strip + lowercase
+                 before = result_df[col].copy()
+                 result_df[col] = result_df[col].astype(str).str.strip().str.lower()
+                 affected += int((before != result_df[col]).sum())
+
+         return result_df, TransformationResult(
+             transformer=self.name,
+             params=params,
+             rows_affected=affected,
+             columns_affected=columns,
+             code=self.generate_code(**params),
+         )
+
+     def generate_code(self, **params: Any) -> str:
+         columns = params.get("columns", [])
+         mapping = params.get("mapping", {})
+         lines = []
+         for col in columns:
+             if mapping:
+                 lines.append(
+                     f"df[{repr(col)}] = df[{repr(col)}].replace({repr(mapping)})"
+                 )
+             else:
+                 lines.append(
+                     f"df[{repr(col)}] = df[{repr(col)}].astype(str).str.strip().str.lower()"
+                 )
+         return "\n".join(lines)
+
+
+ register_transformer(CategoryTransformer())
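A short usage sketch of the auto-normalize path (no mapping supplied); the sample frame is illustrative:

import pandas as pd

from datawash.transformers.categories import CategoryTransformer

df = pd.DataFrame({"status": ["Active", " active ", "INACTIVE"]})
cleaned, result = CategoryTransformer().transform(df, columns=["status"])
print(cleaned["status"].tolist())  # ['active', 'active', 'inactive']
print(result.code)
# df['status'] = df['status'].astype(str).str.strip().str.lower()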
datawash/transformers/columns.py
@@ -0,0 +1,72 @@
+ """Column operations (merge, rename, drop)."""
+
+ from __future__ import annotations
+
+ from typing import Any
+
+ import pandas as pd
+
+ from datawash.core.models import TransformationResult
+ from datawash.transformers.base import BaseTransformer
+ from datawash.transformers.registry import register_transformer
+
+
+ class ColumnTransformer(BaseTransformer):
+     @property
+     def name(self) -> str:
+         return "columns"
+
+     def transform(
+         self, df: pd.DataFrame, **params: Any
+     ) -> tuple[pd.DataFrame, TransformationResult]:
+         operation = params.get("operation", "drop")
+         columns = params.get("columns", [])
+         result_df = df.copy()
+         affected = 0
+
+         if operation == "drop":
+             existing = [c for c in columns if c in result_df.columns]
+             result_df = result_df.drop(columns=existing)
+             affected = len(result_df) * len(existing)
+         elif operation == "rename":
+             mapping = params.get("mapping", {})
+             result_df = result_df.rename(columns=mapping)
+             affected = len(result_df) * len(mapping)
+         elif operation == "merge":
+             if len(columns) >= 2:
+                 new_name = params.get("new_name", "_".join(columns))
+                 separator = params.get("separator", " ")
+                 result_df[new_name] = (
+                     result_df[columns].astype(str).agg(separator.join, axis=1)
+                 )
+                 result_df = result_df.drop(columns=columns)
+                 affected = len(result_df)
+
+         return result_df, TransformationResult(
+             transformer=self.name,
+             params=params,
+             rows_affected=affected,
+             columns_affected=columns,
+             code=self.generate_code(**params),
+         )
+
+     def generate_code(self, **params: Any) -> str:
+         operation = params.get("operation", "drop")
+         columns = params.get("columns", [])
+         if operation == "drop":
+             return f"df = df.drop(columns={repr(columns)})"
+         elif operation == "rename":
+             mapping = params.get("mapping", {})
+             return f"df = df.rename(columns={repr(mapping)})"
+         elif operation == "merge":
+             new_name = params.get("new_name", "_".join(columns))
+             sep = params.get("separator", " ")
+             return (
+                 f"df[{repr(new_name)}] = df[{repr(columns)}]"
+                 f".astype(str).agg({repr(sep)}.join, axis=1)\n"
+                 f"df = df.drop(columns={repr(columns)})"
+             )
+         return ""
+
+
+ register_transformer(ColumnTransformer())
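For example, the merge operation concatenates the source columns and then drops them; the frame below is illustrative:

import pandas as pd

from datawash.transformers.columns import ColumnTransformer

df = pd.DataFrame({"first": ["Ada"], "last": ["Lovelace"]})
merged, result = ColumnTransformer().transform(
    df, operation="merge", columns=["first", "last"], new_name="full_name"
)
print(merged["full_name"].tolist())  # ['Ada Lovelace']
print(merged.columns.tolist())  # ['full_name'] — source columns dropped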
datawash/transformers/duplicates.py
@@ -0,0 +1,43 @@
+ """Remove duplicate rows."""
+
+ from __future__ import annotations
+
+ from typing import Any
+
+ import pandas as pd
+
+ from datawash.core.models import TransformationResult
+ from datawash.transformers.base import BaseTransformer
+ from datawash.transformers.registry import register_transformer
+
+
+ class DuplicateTransformer(BaseTransformer):
+     @property
+     def name(self) -> str:
+         return "duplicates"
+
+     def transform(
+         self, df: pd.DataFrame, **params: Any
+     ) -> tuple[pd.DataFrame, TransformationResult]:
+         keep = params.get("keep", "first")
+         subset = params.get("subset", None)
+         before = len(df)
+         result_df = df.drop_duplicates(keep=keep, subset=subset)
+         after = len(result_df)
+         return result_df, TransformationResult(
+             transformer=self.name,
+             params=params,
+             rows_affected=before - after,
+             columns_affected=list(df.columns),
+             code=self.generate_code(**params),
+         )
+
+     def generate_code(self, **params: Any) -> str:
+         keep = params.get("keep", "first")
+         subset = params.get("subset", None)
+         if subset:
+             return f'df = df.drop_duplicates(keep="{keep}", subset={subset})'
+         return f'df = df.drop_duplicates(keep="{keep}")'
+
+
+ register_transformer(DuplicateTransformer())
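A final usage sketch, exercising both transform and the generated-code round trip on a toy frame:

import pandas as pd

from datawash.transformers.duplicates import DuplicateTransformer

df = pd.DataFrame({"id": [1, 1, 2], "name": ["a", "a", "b"]})
deduped, result = DuplicateTransformer().transform(df, keep="first")
print(len(deduped), result.rows_affected)  # 2 1
print(result.code)  # df = df.drop_duplicates(keep="first")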