datawash 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. datawash/__init__.py +9 -0
  2. datawash/adapters/__init__.py +12 -0
  3. datawash/adapters/base.py +66 -0
  4. datawash/adapters/csv_adapter.py +23 -0
  5. datawash/adapters/excel_adapter.py +36 -0
  6. datawash/adapters/json_adapter.py +21 -0
  7. datawash/adapters/parquet_adapter.py +34 -0
  8. datawash/cli/__init__.py +0 -0
  9. datawash/cli/formatters.py +110 -0
  10. datawash/cli/main.py +168 -0
  11. datawash/codegen/__init__.py +1 -0
  12. datawash/codegen/generator.py +72 -0
  13. datawash/core/__init__.py +1 -0
  14. datawash/core/cache.py +64 -0
  15. datawash/core/config.py +56 -0
  16. datawash/core/dtypes.py +24 -0
  17. datawash/core/exceptions.py +21 -0
  18. datawash/core/models.py +78 -0
  19. datawash/core/report.py +430 -0
  20. datawash/core/sampling.py +84 -0
  21. datawash/detectors/__init__.py +13 -0
  22. datawash/detectors/base.py +27 -0
  23. datawash/detectors/duplicate_detector.py +56 -0
  24. datawash/detectors/format_detector.py +130 -0
  25. datawash/detectors/missing_detector.py +78 -0
  26. datawash/detectors/outlier_detector.py +93 -0
  27. datawash/detectors/registry.py +64 -0
  28. datawash/detectors/similarity_detector.py +294 -0
  29. datawash/detectors/type_detector.py +100 -0
  30. datawash/profiler/__init__.py +1 -0
  31. datawash/profiler/engine.py +88 -0
  32. datawash/profiler/parallel.py +122 -0
  33. datawash/profiler/patterns.py +80 -0
  34. datawash/profiler/statistics.py +41 -0
  35. datawash/suggestors/__init__.py +1 -0
  36. datawash/suggestors/base.py +15 -0
  37. datawash/suggestors/engine.py +327 -0
  38. datawash/suggestors/prioritizer.py +23 -0
  39. datawash/transformers/__init__.py +13 -0
  40. datawash/transformers/base.py +27 -0
  41. datawash/transformers/categories.py +64 -0
  42. datawash/transformers/columns.py +72 -0
  43. datawash/transformers/duplicates.py +43 -0
  44. datawash/transformers/formats.py +95 -0
  45. datawash/transformers/missing.py +201 -0
  46. datawash/transformers/registry.py +30 -0
  47. datawash/transformers/types.py +95 -0
  48. datawash-0.2.0.dist-info/METADATA +353 -0
  49. datawash-0.2.0.dist-info/RECORD +53 -0
  50. datawash-0.2.0.dist-info/WHEEL +5 -0
  51. datawash-0.2.0.dist-info/entry_points.txt +2 -0
  52. datawash-0.2.0.dist-info/licenses/LICENSE +21 -0
  53. datawash-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,95 @@
1
+ """Format standardization transformers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import pandas as pd
8
+
9
+ from datawash.core.models import TransformationResult
10
+ from datawash.transformers.base import BaseTransformer
11
+ from datawash.transformers.registry import register_transformer
12
+
13
+
14
+ class FormatTransformer(BaseTransformer):
15
+ @property
16
+ def name(self) -> str:
17
+ return "formats"
18
+
19
+ def transform(
20
+ self, df: pd.DataFrame, **params: Any
21
+ ) -> tuple[pd.DataFrame, TransformationResult]:
22
+ columns = params.get("columns", [])
23
+ operation = params.get("operation", "strip_whitespace")
24
+ result_df = df.copy()
25
+ affected = 0
26
+
27
+ for col in columns:
28
+ if col not in result_df.columns:
29
+ continue
30
+ if operation == "strip_whitespace":
31
+ before = result_df[col].copy()
32
+ result_df[col] = result_df[col].astype(str).str.strip()
33
+ affected += int((before != result_df[col]).sum())
34
+ elif operation == "lowercase":
35
+ before = result_df[col].copy()
36
+ result_df[col] = result_df[col].astype(str).str.lower()
37
+ affected += int((before != result_df[col]).sum())
38
+ elif operation == "uppercase":
39
+ before = result_df[col].copy()
40
+ result_df[col] = result_df[col].astype(str).str.upper()
41
+ affected += int((before != result_df[col]).sum())
42
+ elif operation == "titlecase":
43
+ before = result_df[col].copy()
44
+ result_df[col] = result_df[col].astype(str).str.title()
45
+ affected += int((before != result_df[col]).sum())
46
+ elif operation == "standardize_dates":
47
+ target_format = params.get("target_format", "%Y-%m-%d")
48
+ parsed = pd.to_datetime(result_df[col], errors="coerce")
49
+ affected += int(parsed.notna().sum())
50
+ result_df[col] = parsed.dt.strftime(target_format).where(
51
+ parsed.notna(), result_df[col]
52
+ )
53
+
54
+ return result_df, TransformationResult(
55
+ transformer=self.name,
56
+ params=params,
57
+ rows_affected=affected,
58
+ columns_affected=columns,
59
+ code=self.generate_code(**params),
60
+ )
61
+
62
+ def generate_code(self, **params: Any) -> str:
63
+ columns = params.get("columns", [])
64
+ operation = params.get("operation", "strip_whitespace")
65
+ lines = []
66
+ for col in columns:
67
+ if operation == "strip_whitespace":
68
+ lines.append(
69
+ f"df[{repr(col)}] = df[{repr(col)}].astype(str).str.strip()"
70
+ )
71
+ elif operation == "lowercase":
72
+ lines.append(
73
+ f"df[{repr(col)}] = df[{repr(col)}].astype(str).str.lower()"
74
+ )
75
+ elif operation == "uppercase":
76
+ lines.append(
77
+ f"df[{repr(col)}] = df[{repr(col)}].astype(str).str.upper()"
78
+ )
79
+ elif operation == "titlecase":
80
+ lines.append(
81
+ f"df[{repr(col)}] = df[{repr(col)}].astype(str).str.title()"
82
+ )
83
+ elif operation == "standardize_dates":
84
+ fmt = params.get("target_format", "%Y-%m-%d")
85
+ lines.append(
86
+ f"_parsed = pd.to_datetime(df[{repr(col)}], errors='coerce')"
87
+ )
88
+ lines.append(
89
+ f"df[{repr(col)}] = _parsed.dt.strftime({repr(fmt)})"
90
+ f".where(_parsed.notna(), df[{repr(col)}])"
91
+ )
92
+ return "\n".join(lines)
93
+
94
+
95
+ register_transformer(FormatTransformer())
@@ -0,0 +1,201 @@
1
+ """Handle missing values."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import Any
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+
11
+ from datawash.core.models import TransformationResult
12
+ from datawash.transformers.base import BaseTransformer
13
+ from datawash.transformers.registry import register_transformer
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class MissingTransformer(BaseTransformer):
    """Handle missing values using a strategy chosen via params.

    Supported strategies: ``drop_rows``, ``fill_median``, ``fill_mode``,
    ``fill_value``, ``empty_to_nan``, ``clean_empty_strings`` and
    ``clip_outliers``. ``columns`` defaults to every column of the frame.
    """

    @property
    def name(self) -> str:
        # Registry key for this transformer.
        return "missing"

    def transform(
        self, df: pd.DataFrame, **params: Any
    ) -> tuple[pd.DataFrame, TransformationResult]:
        """Apply the chosen strategy to a copy of *df*.

        Params:
            strategy: one of the strategies listed on the class
                (default ``"drop_rows"``).
            columns: columns to act on (default: all columns of *df*).
            fill_value: used by ``fill_value`` and by
                ``clean_empty_strings`` with ``fill_strategy="value"``.
            fill_strategy: sub-strategy for ``clean_empty_strings``
                (``"mode"`` | ``"median"`` | ``"value"``, default ``"mode"``).
            method: outlier bound method for ``clip_outliers``
                (``"iqr"``; any other value selects mean/std bounds).
            threshold: multiplier for the IQR or std bounds (default 1.5).

        Returns:
            Tuple of the transformed DataFrame and a TransformationResult
            whose ``rows_affected`` counts dropped rows or touched cells,
            depending on the strategy.
        """
        strategy = params.get("strategy", "drop_rows")
        columns = params.get("columns", list(df.columns))
        result_df = df.copy()
        rows_before = len(result_df)
        affected = 0

        if strategy == "drop_rows":
            # Drop any row with a null in the selected columns;
            # affected = number of rows removed.
            result_df = result_df.dropna(subset=columns)
            affected = rows_before - len(result_df)
        elif strategy == "fill_median":
            # Non-numeric columns are silently skipped.
            for col in columns:
                if pd.api.types.is_numeric_dtype(result_df[col]):
                    median = result_df[col].median()
                    affected += int(result_df[col].isna().sum())
                    result_df[col] = result_df[col].fillna(median)
        elif strategy == "fill_mode":
            for col in columns:
                mode = result_df[col].mode()
                if not mode.empty:
                    affected += int(result_df[col].isna().sum())
                    # mode() can return several values; the first is used.
                    result_df[col] = result_df[col].fillna(mode.iloc[0])
                else:
                    # All-null column: no mode exists, leave it untouched.
                    logger.warning(
                        "Column '%s': fill_mode requested but no mode found "
                        "(all values null). Column left unchanged.",
                        col,
                    )
        elif strategy == "fill_value":
            fill_value = params.get("fill_value", "")
            for col in columns:
                affected += int(result_df[col].isna().sum())
                result_df[col] = result_df[col].fillna(fill_value)
        elif strategy == "empty_to_nan":
            # Only exact empty strings; whitespace-only values are NOT matched
            # (clean_empty_strings handles those).
            for col in columns:
                mask = result_df[col] == ""
                affected += int(mask.sum())
                result_df.loc[mask, col] = np.nan
        elif strategy == "clean_empty_strings":
            # Combined strategy: convert empty/whitespace strings to NaN and fill in one step
            fill_strategy = params.get("fill_strategy", "mode")
            for col in columns:
                # Convert empty and whitespace-only strings to NaN
                # Handle both 'object' and string dtypes
                col_dtype = result_df[col].dtype
                is_string_like = col_dtype == object or pd.api.types.is_string_dtype(
                    col_dtype
                )
                if is_string_like:
                    mask = result_df[col].apply(
                        lambda x: isinstance(x, str) and x.strip() == ""
                    )
                    empty_count = int(mask.sum())
                    result_df.loc[mask, col] = np.nan
                else:
                    empty_count = 0

                # Now fill NaN values
                # (null_count already includes the empties converted above).
                null_count = int(result_df[col].isna().sum())
                if null_count > 0:
                    if fill_strategy == "mode":
                        mode = result_df[col].mode()
                        if not mode.empty:
                            result_df[col] = result_df[col].fillna(mode.iloc[0])
                    elif fill_strategy == "median":
                        # Median only makes sense for numeric columns;
                        # others are left unfilled.
                        if pd.api.types.is_numeric_dtype(result_df[col]):
                            result_df[col] = result_df[col].fillna(
                                result_df[col].median()
                            )
                    elif fill_strategy == "value":
                        fill_value = params.get("fill_value", "")
                        result_df[col] = result_df[col].fillna(fill_value)

                # NOTE(review): max() avoids double counting since every
                # converted empty string is also in null_count.
                affected += max(empty_count, null_count)
        elif strategy == "clip_outliers":
            method = params.get("method", "iqr")
            threshold = params.get("threshold", 1.5)
            for col in columns:
                if not pd.api.types.is_numeric_dtype(result_df[col]):
                    continue
                # Bounds are computed on non-null values only.
                series = result_df[col].dropna()
                if method == "iqr":
                    q1, q3 = series.quantile(0.25), series.quantile(0.75)
                    iqr = q3 - q1
                    lower, upper = q1 - threshold * iqr, q3 + threshold * iqr
                else:
                    # Any non-"iqr" method falls back to mean +/- threshold*std.
                    mean, std = series.mean(), series.std()
                    lower, upper = mean - threshold * std, mean + threshold * std
                mask = (result_df[col] < lower) | (result_df[col] > upper)
                affected += int(mask.sum())
                result_df[col] = result_df[col].clip(lower=lower, upper=upper)

        return result_df, TransformationResult(
            transformer=self.name,
            params=params,
            rows_affected=affected,
            columns_affected=columns,
            code=self.generate_code(**params),
        )

    def generate_code(self, **params: Any) -> str:
        """Emit standalone pandas code equivalent to transform().

        Note that ``columns`` defaults to ``[]`` here (unlike transform(),
        where it defaults to all columns), so callers must pass it explicitly
        to get non-empty code.
        """
        strategy = params.get("strategy", "drop_rows")
        columns = params.get("columns", [])
        col_repr = repr(columns)
        if strategy == "drop_rows":
            return f"df = df.dropna(subset={col_repr})"
        elif strategy == "fill_median":
            lines = [
                f"df[{repr(c)}] = df[{repr(c)}].fillna(df[{repr(c)}].median())"
                for c in columns
            ]
            return "\n".join(lines)
        elif strategy == "fill_mode":
            # NOTE(review): the generated .mode().iloc[0] raises IndexError on
            # an all-null column, whereas transform() logs and skips — confirm.
            lines = [
                f"df[{repr(c)}] = df[{repr(c)}].fillna(df[{repr(c)}].mode().iloc[0])"
                for c in columns
            ]
            return "\n".join(lines)
        elif strategy == "fill_value":
            val = repr(params.get("fill_value", ""))
            lines = [f"df[{repr(c)}] = df[{repr(c)}].fillna({val})" for c in columns]
            return "\n".join(lines)
        elif strategy == "empty_to_nan":
            lines = [
                f"df[{repr(c)}] = df[{repr(c)}].replace('', np.nan)" for c in columns
            ]
            return "import numpy as np\n" + "\n".join(lines)
        elif strategy == "clean_empty_strings":
            fill_strategy = params.get("fill_strategy", "mode")
            lines = ["import numpy as np"]
            for c in columns:
                # Convert empty/whitespace to NaN
                lines.append(
                    f"df[{repr(c)}] = df[{repr(c)}].replace(r'^\\s*$', np.nan, regex=True)"
                )
                # Fill based on strategy
                if fill_strategy == "mode":
                    lines.append(
                        f"df[{repr(c)}] = df[{repr(c)}].fillna(df[{repr(c)}].mode().iloc[0])"
                    )
                elif fill_strategy == "median":
                    lines.append(
                        f"df[{repr(c)}] = df[{repr(c)}].fillna(df[{repr(c)}].median())"
                    )
                elif fill_strategy == "value":
                    val = repr(params.get("fill_value", ""))
                    lines.append(f"df[{repr(c)}] = df[{repr(c)}].fillna({val})")
            return "\n".join(lines)
        elif strategy == "clip_outliers":
            method = params.get("method", "iqr")
            threshold = params.get("threshold", 1.5)
            lines = []
            for c in columns:
                if method == "iqr":
                    lines.append(
                        f"q1, q3 = df[{repr(c)}].quantile(0.25), df[{repr(c)}].quantile(0.75)"
                    )
                    lines.append("iqr = q3 - q1")
                    lines.append(
                        f"df[{repr(c)}] = df[{repr(c)}].clip("
                        f"lower=q1 - {threshold} * iqr, "
                        f"upper=q3 + {threshold} * iqr)"
                    )
                else:
                    lines.append(
                        f"mean, std = df[{repr(c)}].mean(), " f"df[{repr(c)}].std()"
                    )
                    lines.append(
                        f"df[{repr(c)}] = df[{repr(c)}].clip("
                        f"lower=mean - {threshold} * std, "
                        f"upper=mean + {threshold} * std)"
                    )
            return "\n".join(lines)
        return "# No code generated"


register_transformer(MissingTransformer())
@@ -0,0 +1,30 @@
1
+ """Transformer registration."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import pandas as pd
8
+
9
+ from datawash.core.models import TransformationResult
10
+ from datawash.transformers.base import BaseTransformer
11
+
12
# Global name -> transformer instance registry, populated at import time by
# each transformer module calling register_transformer().
_TRANSFORMERS: dict[str, BaseTransformer] = {}


def register_transformer(t: BaseTransformer) -> None:
    """Register *t* under its ``name`` property, replacing any previous entry."""
    _TRANSFORMERS[t.name] = t


def get_transformer(name: str) -> BaseTransformer:
    """Look up a registered transformer.

    Raises:
        KeyError: if *name* is unknown; the message lists the known names.
    """
    try:
        return _TRANSFORMERS[name]
    except KeyError:
        raise KeyError(
            f"Unknown transformer: {name}. Available: {list(_TRANSFORMERS.keys())}"
        ) from None


def run_transformer(
    name: str, df: pd.DataFrame, **params: Any
) -> tuple[pd.DataFrame, TransformationResult]:
    """Convenience wrapper: resolve *name* and apply it to *df*."""
    transformer = get_transformer(name)
    return transformer.transform(df, **params)
@@ -0,0 +1,95 @@
1
+ """Type conversion transformers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import pandas as pd
8
+
9
+ from datawash.core.models import TransformationResult
10
+ from datawash.transformers.base import BaseTransformer
11
+ from datawash.transformers.registry import register_transformer
12
+
13
+
14
class TypeTransformer(BaseTransformer):
    """Convert columns to a target dtype: numeric, boolean, datetime or string."""

    # Case-insensitive textual spellings accepted as booleans. Shared by
    # transform() and generate_code() so the emitted script reproduces the
    # in-process conversion exactly (the old generated map was missing the
    # 't'/'f'/'on'/'off' keys that transform() accepted).
    _BOOL_MAP = {
        "true": True,
        "false": False,
        "yes": True,
        "no": False,
        "y": True,
        "n": False,
        "1": True,
        "0": False,
        "t": True,
        "f": False,
        "on": True,
        "off": False,
    }

    @property
    def name(self) -> str:
        return "types"

    def transform(
        self, df: pd.DataFrame, **params: Any
    ) -> tuple[pd.DataFrame, TransformationResult]:
        """Convert ``columns`` (param) to ``target_type`` (param, default "numeric").

        ``rows_affected`` counts cells whose value actually changed for
        "numeric", and converted / non-null cells for the other target types.
        """
        columns = params.get("columns", [])
        target_type = params.get("target_type", "numeric")
        result_df = df.copy()
        affected = 0

        for col in columns:
            if col not in result_df.columns:
                continue
            if target_type == "numeric":
                before = result_df[col]
                converted = pd.to_numeric(before, errors="coerce")
                # BUG FIX: the old count compared the numeric result against
                # astype(str) of the source (e.g. 1.0 vs "1"), which flagged
                # nearly every cell. Count cells that genuinely changed,
                # treating NaN -> NaN as unchanged. Object-cast both sides so
                # mixed-dtype comparison cannot raise.
                changed = converted.astype(object).ne(before.astype(object)) & ~(
                    converted.isna() & before.isna()
                )
                affected += int(changed.sum())
                result_df[col] = converted
            elif target_type == "boolean":
                result_df[col] = (
                    result_df[col].astype(str).str.lower().map(self._BOOL_MAP)
                )
                # Unrecognized spellings map to NaN and are not counted.
                affected += len(result_df[col].dropna())
            elif target_type == "datetime":
                result_df[col] = pd.to_datetime(result_df[col], errors="coerce")
                affected += int(result_df[col].notna().sum())
            elif target_type == "string":
                result_df[col] = result_df[col].astype(str)
                affected += len(result_df[col])

        return result_df, TransformationResult(
            transformer=self.name,
            params=params,
            rows_affected=affected,
            columns_affected=columns,
            code=self.generate_code(**params),
        )

    def generate_code(self, **params: Any) -> str:
        """Emit standalone pandas code equivalent to transform()."""
        columns = params.get("columns", [])
        target_type = params.get("target_type", "numeric")
        lines = []
        for col in columns:
            if target_type == "numeric":
                lines.append(
                    f"df[{repr(col)}] = pd.to_numeric(df[{repr(col)}], errors='coerce')"
                )
            elif target_type == "boolean":
                # Use the full shared map so generated scripts match transform().
                lines.append(
                    f"df[{repr(col)}] = df[{repr(col)}]"
                    f".astype(str).str.lower().map({self._BOOL_MAP!r})"
                )
            elif target_type == "datetime":
                lines.append(
                    f"df[{repr(col)}] = pd.to_datetime(df[{repr(col)}], errors='coerce')"
                )
            elif target_type == "string":
                lines.append(f"df[{repr(col)}] = df[{repr(col)}].astype(str)")
        return "\n".join(lines)


register_transformer(TypeTransformer())