clean-data-ml 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clean_data_ml-1.2.0/LICENSE +21 -0
- clean_data_ml-1.2.0/MANIFEST.in +10 -0
- clean_data_ml-1.2.0/PKG-INFO +494 -0
- clean_data_ml-1.2.0/README.md +436 -0
- clean_data_ml-1.2.0/clean_data_ml/__init__.py +7 -0
- clean_data_ml-1.2.0/clean_data_ml/__main__.py +18 -0
- clean_data_ml-1.2.0/clean_data_ml/auto_scaler.py +79 -0
- clean_data_ml-1.2.0/clean_data_ml/cleaner.py +1568 -0
- clean_data_ml-1.2.0/clean_data_ml/plotting.py +155 -0
- clean_data_ml-1.2.0/clean_data_ml/py.typed +0 -0
- clean_data_ml-1.2.0/clean_data_ml/stats.py +1085 -0
- clean_data_ml-1.2.0/clean_data_ml.egg-info/PKG-INFO +494 -0
- clean_data_ml-1.2.0/clean_data_ml.egg-info/SOURCES.txt +26 -0
- clean_data_ml-1.2.0/clean_data_ml.egg-info/dependency_links.txt +1 -0
- clean_data_ml-1.2.0/clean_data_ml.egg-info/entry_points.txt +2 -0
- clean_data_ml-1.2.0/clean_data_ml.egg-info/requires.txt +26 -0
- clean_data_ml-1.2.0/clean_data_ml.egg-info/top_level.txt +1 -0
- clean_data_ml-1.2.0/example_inference.py +43 -0
- clean_data_ml-1.2.0/example_train.py +72 -0
- clean_data_ml-1.2.0/pyproject.toml +74 -0
- clean_data_ml-1.2.0/setup.cfg +4 -0
- clean_data_ml-1.2.0/setup.py +52 -0
- clean_data_ml-1.2.0/tests/__init__.py +0 -0
- clean_data_ml-1.2.0/tests/conftest.py +36 -0
- clean_data_ml-1.2.0/tests/test_auto_scaler.py +53 -0
- clean_data_ml-1.2.0/tests/test_cleaner.py +376 -0
- clean_data_ml-1.2.0/tests/test_plotting.py +35 -0
- clean_data_ml-1.2.0/tests/test_stats.py +263 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Mohammad Hossein Habibpour
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
include README.md
|
|
2
|
+
include LICENSE
|
|
3
|
+
include pyproject.toml
|
|
4
|
+
recursive-include clean_data_ml *.py
|
|
5
|
+
recursive-include clean_data_ml py.typed
|
|
6
|
+
include example_train.py example_inference.py
|
|
7
|
+
graft tests
|
|
8
|
+
global-exclude __pycache__
|
|
9
|
+
global-exclude *.pyc
|
|
10
|
+
global-exclude *.pyo
|
|
@@ -0,0 +1,494 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: clean-data-ml
|
|
3
|
+
Version: 1.2.0
|
|
4
|
+
Summary: Automatic data cleaning and standardization for ML pipelines
|
|
5
|
+
Home-page: https://github.com/MohammadvHossein/clean-data-ml
|
|
6
|
+
Author: Mohammad Hossein Habibpour
|
|
7
|
+
Author-email: Mohammad Hossein Habibpour <habibpour.programming@gmail.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
Project-URL: Homepage, https://github.com/MohammadvHossein/clean-data-ml
|
|
10
|
+
Project-URL: Repository, https://github.com/MohammadvHossein/clean-data-ml
|
|
11
|
+
Project-URL: Bug Tracker, https://github.com/MohammadvHossein/clean-data-ml/issues
|
|
12
|
+
Project-URL: Changelog, https://github.com/MohammadvHossein/clean-data-ml/releases
|
|
13
|
+
Keywords: data-cleaning,data-preprocessing,ml-pipeline,feature-engineering,scikit-learn
|
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
|
+
Classifier: Programming Language :: Python :: 3
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
25
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
26
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
27
|
+
Classifier: Operating System :: OS Independent
|
|
28
|
+
Classifier: Typing :: Typed
|
|
29
|
+
Requires-Python: >=3.8
|
|
30
|
+
Description-Content-Type: text/markdown
|
|
31
|
+
License-File: LICENSE
|
|
32
|
+
Requires-Dist: pandas>=1.3.0
|
|
33
|
+
Requires-Dist: numpy>=1.21.0
|
|
34
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
35
|
+
Requires-Dist: scipy>=1.7.0
|
|
36
|
+
Requires-Dist: openpyxl>=3.0.0
|
|
37
|
+
Requires-Dist: joblib>=1.0.0
|
|
38
|
+
Provides-Extra: plot
|
|
39
|
+
Requires-Dist: matplotlib>=3.5.0; extra == "plot"
|
|
40
|
+
Requires-Dist: seaborn>=0.11.0; extra == "plot"
|
|
41
|
+
Provides-Extra: imbalance
|
|
42
|
+
Requires-Dist: imbalanced-learn>=0.10.0; extra == "imbalance"
|
|
43
|
+
Provides-Extra: all
|
|
44
|
+
Requires-Dist: matplotlib>=3.5.0; extra == "all"
|
|
45
|
+
Requires-Dist: seaborn>=0.11.0; extra == "all"
|
|
46
|
+
Requires-Dist: imbalanced-learn>=0.10.0; extra == "all"
|
|
47
|
+
Provides-Extra: dev
|
|
48
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
49
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
50
|
+
Requires-Dist: flake8>=6.0; extra == "dev"
|
|
51
|
+
Requires-Dist: black>=23.0; extra == "dev"
|
|
52
|
+
Requires-Dist: isort>=5.0; extra == "dev"
|
|
53
|
+
Requires-Dist: pre-commit>=3.0; extra == "dev"
|
|
54
|
+
Dynamic: author
|
|
55
|
+
Dynamic: home-page
|
|
56
|
+
Dynamic: license-file
|
|
57
|
+
Dynamic: requires-python
|
|
58
|
+
|
|
59
|
+
# DataCleaner
|
|
60
|
+
|
|
61
|
+
[PyPI version](https://pypi.org/project/clean-data-ml/)
|
|
62
|
+
[Supported Python versions](https://pypi.org/project/clean-data-ml/)
|
|
63
|
+
[License: MIT](https://github.com/MohammadvHossein/clean-data-ml/blob/main/LICENSE)
|
|
64
|
+
[CI status](https://github.com/MohammadvHossein/clean-data-ml/actions/workflows/ci.yml)
|
|
65
|
+
|
|
66
|
+
**Automated data cleaning & standardization pipeline for ML projects.**
|
|
67
|
+
|
|
68
|
+
DataCleaner takes raw CSV/Excel data and transforms it into production-ready ML features - handling nulls, encoding categories, selecting the best scaler per column, and packaging everything into a reusable inference pipeline.
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
## Features
|
|
73
|
+
|
|
74
|
+
| Feature | Description |
|
|
75
|
+
|---------|-------------|
|
|
76
|
+
| **Input** | CSV & Excel files or in-memory DataFrames |
|
|
77
|
+
| **Column dropping** | Pass a list of unwanted columns (IDs, timestamps, etc.) |
|
|
78
|
+
| **Target column** | Designate any column as the prediction target |
|
|
79
|
+
| **Auto-detect problem type** | Automatically detects classification vs regression from target column |
|
|
80
|
+
| **Auto-drop useless columns** | Removes zero-variance, high-cardinality, and duplicated columns |
|
|
81
|
+
| **Null handling** | Dynamic threshold: drop rows if nulls are few, KNN-impute if nulls are abundant |
|
|
82
|
+
| **Outlier handling** | IQR-based detection with clip or remove options |
|
|
83
|
+
| **Encoding** | Auto-detects binary vs multi-category columns; LabelEncoder for binary, OneHotEncoder for categorical |
|
|
84
|
+
| **Auto-scaler** | Tests each numeric column for normality & outliers, then picks the optimal scaler (Standard, Robust, MinMax, MaxAbs) |
|
|
85
|
+
| **Feature engineering** | Generates polynomial features (interactions, squares) for numeric columns |
|
|
86
|
+
| **Imbalance handling** | SMOTE oversampling for imbalanced classification datasets |
|
|
87
|
+
| **Train/Val/Test split** | Configurable split ratios |
|
|
88
|
+
| **Pipeline export** | Save & reload the full transformation pipeline for inference on new data |
|
|
89
|
+
| **Summary** | Quick overview of shape, dtypes, null counts & percentages |
|
|
90
|
+
| **Date feature extraction** | Expands datetime columns into year/month/day/dayofweek/weekend |
|
|
91
|
+
| **Missing indicators** | Adds `{col}_missing` binary columns for imputed nulls |
|
|
92
|
+
| **Feature selection** | Removes weak features via mutual information |
|
|
93
|
+
| **Custom encoders/scalers** | Pass your own sklearn encoders and scalers to `prepare()` |
|
|
94
|
+
| **Statistical test suite** | Integrated A/B testing, t-tests, z-tests, chi-square, ANOVA |
|
|
95
|
+
| **Data profiling** | Self-contained HTML report with distributions, correlations, quality warnings |
|
|
96
|
+
| **Schema validation** | Validate column existence and expected dtypes |
|
|
97
|
+
| **Duplicate removal** | Drop duplicate rows |
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## Quick Start
|
|
102
|
+
|
|
103
|
+
### Install
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
pip install clean-data-ml
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
With optional extras:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
pip install clean-data-ml[plot] # visualization (matplotlib, seaborn)
|
|
113
|
+
pip install clean-data-ml[imbalance] # SMOTE oversampling support
|
|
114
|
+
pip install clean-data-ml[all] # all optional features
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
For a development (editable) install from source:
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
git clone https://github.com/MohammadvHossein/clean-data-ml.git
|
|
121
|
+
cd clean-data-ml
|
|
122
|
+
pip install -e .
|
|
123
|
+
pip install -e .[all] # including all extras
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### Minimal example
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
from clean_data_ml import DataCleaner
|
|
130
|
+
from sklearn.svm import SVC
|
|
131
|
+
|
|
132
|
+
dc = DataCleaner()
|
|
133
|
+
dc.load("data.csv")
|
|
134
|
+
dc.set_target("purchased")
|
|
135
|
+
dc.drop_columns(["ID", "timestamp"])
|
|
136
|
+
|
|
137
|
+
X_train, X_test, y_train, y_test = dc.prepare(test_size=0.2)
|
|
138
|
+
|
|
139
|
+
model = SVC()
|
|
140
|
+
model.fit(X_train, y_train)
|
|
141
|
+
print(f"Accuracy: {model.score(X_test, y_test):.2f}")
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
---
|
|
145
|
+
|
|
146
|
+
## Full Example
|
|
147
|
+
|
|
148
|
+
### 1. Train & save pipeline
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
from clean_data_ml import DataCleaner
|
|
152
|
+
import pandas as pd
|
|
153
|
+
from sklearn.svm import SVC
|
|
154
|
+
import joblib
|
|
155
|
+
|
|
156
|
+
# Sample data
|
|
157
|
+
data = pd.DataFrame({
|
|
158
|
+
"ID": range(100),
|
|
159
|
+
"age": [25, 30, 35, None, 40, 45, 50, 55, 60, 65] * 10,
|
|
160
|
+
"salary": [50000, 60000, None, 80000, 90000, 100000, 110000, 120000, None, 140000] * 10,
|
|
161
|
+
"city": ["Tehran", "Shiraz", "Tehran", "Isfahan", None, "Tehran", "Shiraz", "Isfahan", "Tehran", "Shiraz"] * 10,
|
|
162
|
+
"gender": ["M", "F", "M", "F", "M", "F", "M", "F", "M", "F"] * 10,
|
|
163
|
+
"purchased": [1, 0, 1, 0, 1, 1, 0, 1, 0, 1] * 10,
|
|
164
|
+
})
|
|
165
|
+
|
|
166
|
+
dc = DataCleaner()
|
|
167
|
+
dc.load_df(data)
|
|
168
|
+
dc.set_target("purchased")
|
|
169
|
+
dc.drop_columns(["ID"])
|
|
170
|
+
|
|
171
|
+
X_train, X_test, y_train, y_test = dc.prepare(test_size=0.2)
|
|
172
|
+
|
|
173
|
+
model = SVC(probability=True)
|
|
174
|
+
model.fit(X_train, y_train)
|
|
175
|
+
print(f"Accuracy: {model.score(X_test, y_test):.2f}")
|
|
176
|
+
|
|
177
|
+
# Save model & pipeline for later inference
|
|
178
|
+
joblib.dump(model, "model.pkl")
|
|
179
|
+
dc.save_pipeline("my_pipeline.pkl")
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
### 2. Inference on new data
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
from clean_data_ml import DataCleaner
|
|
186
|
+
import pandas as pd
|
|
187
|
+
from sklearn.svm import SVC
|
|
188
|
+
import joblib
|
|
189
|
+
|
|
190
|
+
dc = DataCleaner.load_pipeline("my_pipeline.pkl")
|
|
191
|
+
model = joblib.load("model.pkl")
|
|
192
|
+
|
|
193
|
+
new_data = pd.DataFrame({
|
|
194
|
+
"age": [28, 42, 35],
|
|
195
|
+
"salary": [65000, 95000, 78000],
|
|
196
|
+
"city": ["Tehran", "Isfahan", "Shiraz"],
|
|
197
|
+
"gender": ["F", "M", "F"],
|
|
198
|
+
})
|
|
199
|
+
|
|
200
|
+
processed = dc.transform(new_data)
|
|
201
|
+
predictions = model.predict(processed)
|
|
202
|
+
probabilities = model.predict_proba(processed)
|
|
203
|
+
|
|
204
|
+
for i, (pred, prob) in enumerate(zip(predictions, probabilities)):
|
|
205
|
+
    status = "Purchased" if pred == 1 else "Not Purchased"
|
|
206
|
+
    print(f"Customer {i+1}: {status} (confidence: {max(prob):.2%})")
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
---
|
|
210
|
+
|
|
211
|
+
## API Reference
|
|
212
|
+
|
|
213
|
+
### `DataCleaner(random_state=42)`
|
|
214
|
+
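Main class. Setup methods such as `.load_df()` and `.set_target()` return `self` for chaining; methods that produce a result (e.g. `.prepare()`, `.summary()`, `.get_pipeline()`) return that result instead.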
|
|
215
|
+
|
|
216
|
+
| Method | Description |
|
|
217
|
+
|--------|-------------|
|
|
218
|
+
| `.load(filepath)` | Load CSV or Excel file |
|
|
219
|
+
| `.load_df(df)` | Load from an existing pandas DataFrame |
|
|
220
|
+
| `.set_target(col)` | Set the target column |
|
|
221
|
+
| `.drop_columns(cols)` | Drop unwanted columns (IDs, etc.) — append-only, safe to call multiple times |
|
|
222
|
+
| `.prepare(...)` | Execute the full pipeline (see parameters below) |
|
|
223
|
+
| `.get_pipeline()` | Returns the fitted `CleanPipeline` for transforming new data |
|
|
224
|
+
| `.save_pipeline(path)` | Save pipeline to disk |
|
|
225
|
+
| `.load_pipeline(path)` | Load a saved pipeline — returns a `DataCleaner` instance wrapping the pipeline |
|
|
226
|
+
| `.transform(df)` | Apply all cleaning steps to new raw data (same as `get_pipeline().transform(df)`) |
|
|
227
|
+
| `.export_cleaned(filepath, include_target=False)` | Export the fully cleaned dataset (features only, or with target if `True`) to CSV or Excel (.xlsx) |
|
|
228
|
+
| `.summary()` | Dict with shape, columns, dtypes, null counts |
|
|
229
|
+
| `.profile_report(filepath)` | Generate a self-contained HTML data profiling report with stats, distributions, and quality warnings |
|
|
230
|
+
| `.drop_duplicates(subset, keep)` | Remove duplicate rows |
|
|
231
|
+
| `.validate_schema(expected_schema, required_cols)` | Validate column existence and expected dtypes |
|
|
232
|
+
| `.auto_fix_dtypes()` | Auto-convert object columns to datetime or numeric where possible |
|
|
233
|
+
|
|
234
|
+
### `prepare()` Parameters
|
|
235
|
+
|
|
236
|
+
| Parameter | Default | Description |
|
|
237
|
+
|-----------|---------|-------------|
|
|
238
|
+
| `test_size` | `0.2` | Fraction of data for test set |
|
|
239
|
+
| `val_size` | `None` | If set, also creates a validation set |
|
|
240
|
+
| `handle_nulls` | `True` | Auto-detect and handle missing values |
|
|
241
|
+
| `auto_scale` | `True` | Auto-select and apply optimal scaler per column |
|
|
242
|
+
| `auto_encode` | `True` | Auto-encode binary (Label) and categorical (OneHot) columns |
|
|
243
|
+
| `null_drop_ratio` | `None` | Override dynamic null threshold |
|
|
244
|
+
| `auto_drop_useless` | `True` | Drop zero-variance and high-cardinality columns |
|
|
245
|
+
| `handle_outliers` | `None` | `"clip"` to cap outliers, `"remove"` to drop, `None` to skip |
|
|
246
|
+
| `feature_engineering` | `False` | Add polynomial features (interactions, squares) |
|
|
247
|
+
| `handle_imbalance` | `False` | Apply SMOTE oversampling on imbalanced classification data |
|
|
248
|
+
| `n_jobs` | `1` | Number of parallel jobs for scaler selection and outlier handling. `-1` uses all cores |
|
|
249
|
+
| `extract_date_features` | `False` | Expand datetime columns into year, month, day, dayofweek, weekend |
|
|
250
|
+
| `add_missing_indicators` | `False` | Add `{col}_missing` binary columns for imputed nulls |
|
|
251
|
+
| `feature_selection` | `None` | `"auto"` (median MI threshold) or a float threshold; removes features below threshold |
|
|
252
|
+
| `custom_encoders` | `None` | Dict of `{col: encoder_instance}` to override auto-encoding |
|
|
253
|
+
| `custom_scalers` | `None` | Dict of `{col: scaler_instance}` to override auto-scaling |
|
|
254
|
+
|
|
255
|
+
### `CleanPipeline` (internal)
|
|
256
|
+
Holds all fitted transformers. Can be used directly, but prefer `DataCleaner` for full functionality.
|
|
257
|
+
|
|
258
|
+
| Method | Description |
|
|
259
|
+
|--------|-------------|
|
|
260
|
+
| `.transform(df)` | Apply all cleaning steps to a raw DataFrame |
|
|
261
|
+
| `.save(path)` | Pickle to disk |
|
|
262
|
+
| `.load(path)` | Static method -- load from disk |
|
|
263
|
+
|
|
264
|
+
### Pipeline Attributes (accessible via `dc.pipeline.*`)
|
|
265
|
+
|
|
266
|
+
| Attribute | Description |
|
|
267
|
+
|-----------|-------------|
|
|
268
|
+
| `.problem_type` | `"classification"` or `"regression"` (auto-detected) |
|
|
269
|
+
| `.dropped_useless_cols` | Columns auto-dropped by `auto_drop_useless` |
|
|
270
|
+
| `.outlier_bounds` | IQR bounds used for outlier handling (applied in transform) |
|
|
271
|
+
| `.scalers` | Dict of column -> fitted scaler |
|
|
272
|
+
| `.onehot_cols` | One-hot encoded column names |
|
|
273
|
+
| `.label_encoders` | Binary column -> mapping dict |
|
|
274
|
+
| `.feature_cols` | Ordered list of all feature columns after transformation |
|
|
275
|
+
| `.poly_features` | Fitted `PolynomialFeatures` transformer (if feature_engineering was enabled) |
|
|
276
|
+
| `.custom_encoders` | Dict of user-provided encoders |
|
|
277
|
+
| `.custom_scalers` | Dict of user-provided scalers |
|
|
278
|
+
| `.cat_impute_values` | Dict of categorical column -> mode used for imputation |
|
|
279
|
+
| `.feature_importances_` | Dict of column -> mutual information score (if feature_selection was used) |
|
|
280
|
+
|
|
281
|
+
---
|
|
282
|
+
|
|
283
|
+
## How Nulls Are Handled
|
|
284
|
+
|
|
285
|
+
The threshold for "drop vs impute" is **dynamic** -- it adapts to dataset size:
|
|
286
|
+
|
|
287
|
+
| Dataset size | Drop threshold | Behavior |
|
|
288
|
+
|-------------|---------------|----------|
|
|
289
|
+
| 100 rows | 25% | Very conservative -- prefers KNN imputation |
|
|
290
|
+
| 1,000 rows | 5% | Balanced approach |
|
|
291
|
+
| 10,000+ rows | 1% | More aggressive dropping (plentiful data) |
|
|
292
|
+
|
|
293
|
+
- **Numeric columns with many nulls** -- `KNNImputer(n_neighbors=5)`
|
|
294
|
+
- **Categorical columns with many nulls** -- filled with mode
|
|
295
|
+
- **Any column with few nulls** -- those rows are dropped
|
|
296
|
+
|
|
297
|
+
You can override this with `prepare(null_drop_ratio=0.1)`.
|
|
298
|
+
|
|
299
|
+
---
|
|
300
|
+
|
|
301
|
+
## Statistical Test Suite
|
|
302
|
+
|
|
303
|
+
The `clean_data_ml.stats` module provides a comprehensive set of statistical tests for data analysis:
|
|
304
|
+
|
|
305
|
+
### Standalone Functions
|
|
306
|
+
|
|
307
|
+
| Function | Description |
|
|
308
|
+
|----------|-------------|
|
|
309
|
+
| `normality_test(series, method)` | Shapiro-Wilk, D'Agostino, or Anderson-Darling normality test |
|
|
310
|
+
| `correlation_test(x, y, method)` | Pearson, Spearman, or Kendall correlation |
|
|
311
|
+
| `ks_test(a, b)` | Kolmogorov-Smirnov (two-sample distribution test) |
|
|
312
|
+
| `chi_square_test(a, b)` | Chi-square test of independence |
|
|
313
|
+
| `variance_test(a, b, method)` | Levene, Bartlett, or Fligner test for equal variance |
|
|
314
|
+
| `anova_one_way(*groups)` | One-way ANOVA |
|
|
315
|
+
| `z_test_one_sample(series, pop_mean)` | One-sample z-test for mean |
|
|
316
|
+
| `z_test_two_sample(a, b)` | Two-sample z-test for means |
|
|
317
|
+
| `z_test_proportion(successes, n, p)` | One-sample proportion z-test |
|
|
318
|
+
| `z_test_two_proportion(s1, n1, s2, n2)` | Two-sample proportion z-test |
|
|
319
|
+
| `t_test_one_sample(series, pop_mean)` | One-sample t-test |
|
|
320
|
+
| `t_test_independent(a, b)` | Independent two-sample t-test |
|
|
321
|
+
| `t_test_paired(a, b)` | Paired t-test |
|
|
322
|
+
| `ab_test_mean(control, treatment)` | A/B test on means (lift, CI, significance) |
|
|
323
|
+
| `ab_test_proportion(control, treatment)` | A/B test on proportions |
|
|
324
|
+
| `mutual_information(X, y)` | Mutual Information between features and target |
|
|
325
|
+
|
|
326
|
+
### StatisticalTestSuite (integration with DataCleaner)
|
|
327
|
+
|
|
328
|
+
```python
|
|
329
|
+
from clean_data_ml import DataCleaner, stats
|
|
330
|
+
|
|
331
|
+
dc = DataCleaner()
|
|
332
|
+
dc.load_df(data).set_target("purchased")
|
|
333
|
+
|
|
334
|
+
suite = stats.StatisticalTestSuite(dc)
|
|
335
|
+
suite.test_normality()
|
|
336
|
+
suite.test_correlations(target_col="purchased")
|
|
337
|
+
suite.test_chi_square("gender", "city")
|
|
338
|
+
suite.test_anova("age", "city")
|
|
339
|
+
suite.test_z_one_sample("age", pop_mean=35)
|
|
340
|
+
suite.test_t_independent("age", "score")
|
|
341
|
+
suite.test_ab_by_group("converted", "group", "A", "B", metric_type="proportion")
|
|
342
|
+
print(suite.summary())
|
|
343
|
+
```
|
|
344
|
+
|
|
345
|
+
## Visualization Module
|
|
346
|
+
|
|
347
|
+
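The `clean_data_ml.plotting` module (requires the `plot` extra: `pip install clean-data-ml[plot]`, or `pip install -e .[plot]` from a source checkout) provides the following functions: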
|
|
348
|
+
|
|
349
|
+
| Function | Description |
|
|
350
|
+
|----------|-------------|
|
|
351
|
+
| `plot_null_report(dc)` | Bar charts of null counts and percentages |
|
|
352
|
+
| `plot_distributions(dc, cols)` | Histograms + boxplots for numeric columns |
|
|
353
|
+
| `plot_correlation(dc)` | Correlation heatmap |
|
|
354
|
+
| `plot_before_after(dc)` | Compare raw vs cleaned distributions |
|
|
355
|
+
|
|
356
|
+
## Project Structure
|
|
357
|
+
|
|
358
|
+
```
|
|
359
|
+
clean_data_ml/
|
|
360
|
+
__init__.py Package exports
|
|
361
|
+
cleaner.py DataCleaner + CleanPipeline classes
|
|
362
|
+
auto_scaler.py Automatic scaler selection logic
|
|
363
|
+
stats.py Statistical test suite (t-test, z-test, AB test, etc.)
|
|
364
|
+
plotting.py Optional visualization module
|
|
365
|
+
setup.py Package metadata
|
|
366
|
+
pyproject.toml Build configuration
|
|
367
|
+
MANIFEST.in sdist inclusion rules
|
|
368
|
+
LICENSE MIT license
|
|
369
|
+
example_train.py Training example
|
|
370
|
+
example_inference.py Inference example
|
|
371
|
+
.pre-commit-config.yaml Linting hooks (black, isort, flake8)
|
|
372
|
+
.gitignore Ignored files
|
|
373
|
+
README.md This file
|
|
374
|
+
tests/
|
|
375
|
+
conftest.py Shared test fixtures
|
|
376
|
+
test_cleaner.py DataCleaner / CleanPipeline tests
|
|
377
|
+
test_auto_scaler.py Scaler selection tests
|
|
378
|
+
test_stats.py Statistical test suite tests
|
|
379
|
+
test_plotting.py Visualization module tests
|
|
380
|
+
```
|
|
381
|
+
|
|
382
|
+
## Additional Features
|
|
383
|
+
|
|
384
|
+
### Auto-Drop Useless Columns
|
|
385
|
+
|
|
386
|
+
The library automatically detects and removes:
|
|
387
|
+
- **Zero-variance columns** -- columns with a single unique value
|
|
388
|
+
- **High-cardinality columns** -- non-numeric columns where unique values exceed 90% of rows (e.g., free-text fields)
|
|
389
|
+
|
|
390
|
+
Disabled with `prepare(auto_drop_useless=False)`.
|
|
391
|
+
|
|
392
|
+
### Outlier Handling (IQR)
|
|
393
|
+
|
|
394
|
+
After null handling, each numeric column is checked using the Interquartile Range method:
|
|
395
|
+
- **Lower bound**: Q1 - 1.5 x IQR
|
|
396
|
+
- **Upper bound**: Q3 + 1.5 x IQR
|
|
397
|
+
|
|
398
|
+
Two modes:
|
|
399
|
+
- `"clip"` -- caps values at the bounds (preserves row count)
|
|
400
|
+
- `"remove"` -- drops rows with outliers
|
|
401
|
+
|
|
402
|
+
Activated with `prepare(handle_outliers="clip")`.
|
|
403
|
+
|
|
404
|
+
### Feature Engineering
|
|
405
|
+
|
|
406
|
+
Generates polynomial features (degree 2) for numeric columns with more than 2 unique values. Creates interaction terms and squared features automatically.
|
|
407
|
+
|
|
408
|
+
Activated with `prepare(feature_engineering=True)`.
|
|
409
|
+
|
|
410
|
+
### Date Feature Extraction
|
|
411
|
+
|
|
412
|
+
When `prepare(extract_date_features=True)`, datetime columns are automatically expanded into numerical components:
|
|
413
|
+
- `{col}_year`, `{col}_month`, `{col}_day`, `{col}_dayofweek`, `{col}_weekend`
|
|
414
|
+
- The original datetime column is dropped afterward.
|
|
415
|
+
|
|
416
|
+
This happens early in the pipeline so the derived numeric columns benefit from all subsequent steps (encoding, scaling, feature engineering, etc.).
|
|
417
|
+
|
|
418
|
+
### Missing Indicators
|
|
419
|
+
|
|
420
|
+
When `prepare(add_missing_indicators=True)`, for every column that receives KNN imputation (null ratio above threshold), an additional binary column `{col}_missing` is added, flagging which rows originally contained nulls. This lets the model learn patterns from the missingness itself.
|
|
421
|
+
|
|
422
|
+
### Feature Selection
|
|
423
|
+
|
|
424
|
+
Controlled by `prepare(feature_selection="auto")` or `prepare(feature_selection=0.01)`.
|
|
425
|
+
|
|
426
|
+
After all transformations, Mutual Information is computed between each feature and the target. Features with MI below the threshold are dropped:
|
|
427
|
+
- `"auto"` -- drops features below the **median** MI score
|
|
428
|
+
- `float` (e.g., `0.01`) -- drops features below that absolute threshold
|
|
429
|
+
|
|
430
|
+
Set to `None` (default) to skip feature selection entirely.
|
|
431
|
+
|
|
432
|
+
### Custom Encoders & Custom Scalers
|
|
433
|
+
|
|
434
|
+
Pass fitted or unfitted sklearn-compatible transformers to override auto-detection:
|
|
435
|
+
|
|
436
|
+
```python
|
|
437
|
+
from sklearn.preprocessing import OrdinalEncoder, KBinsDiscretizer
|
|
438
|
+
|
|
439
|
+
dc.prepare(
|
|
440
|
+
custom_encoders={"city": OrdinalEncoder()},
|
|
441
|
+
custom_scalers={"salary": KBinsDiscretizer(n_bins=5, encode="ordinal")},
|
|
442
|
+
)
|
|
443
|
+
```
|
|
444
|
+
|
|
445
|
+
These are stored in `dc.pipeline.custom_encoders` / `dc.pipeline.custom_scalers` and applied during `transform()` as well.
|
|
446
|
+
|
|
447
|
+
### Imbalanced Data (SMOTE)
|
|
448
|
+
|
|
449
|
+
When `handle_imbalance=True` and the problem is classification, SMOTE oversampling is applied to the training set after the train/test split. Requires `imbalanced-learn`, which is also installed by the `imbalance` extra (`pip install clean-data-ml[imbalance]`):
|
|
450
|
+
|
|
451
|
+
```bash
|
|
452
|
+
pip install imbalanced-learn
|
|
453
|
+
```
|
|
454
|
+
|
|
455
|
+
---
|
|
456
|
+
|
|
457
|
+
## How Scaler Selection Works
|
|
458
|
+
|
|
459
|
+
For each numeric column, the library tests:
|
|
460
|
+
|
|
461
|
+
1. **Normality** -- Shapiro-Wilk test (p > 0.05 => normal)
|
|
462
|
+
2. **Outliers** -- IQR method (1.5x IQR rule)
|
|
463
|
+
3. **Bounds** -- min >= 0 & max <= 1
|
|
464
|
+
4. **Sparsity** -- >40% zeros
|
|
465
|
+
|
|
466
|
+
**Note:** Tree-based models (Random Forest, XGBoost, LightGBM, etc.) do not require scaling or normalization -- they split on thresholds and are invariant to monotonic transformations. Scaling is only needed for distance-based or gradient-based models (SVM, KNN, Neural Networks, Logistic Regression, etc.). You can skip auto-scaling with `prepare(auto_scale=False)` if using a tree-based model.
|
|
467
|
+
|
|
468
|
+
Then assigns the optimal scaler:
|
|
469
|
+
|
|
470
|
+
| Condition | Scaler |
|
|
471
|
+
|-----------|--------|
|
|
472
|
+
| Normal + no outliers | `StandardScaler` |
|
|
473
|
+
| Has outliers | `RobustScaler` |
|
|
474
|
+
| Bounded [0, 1] | `MinMaxScaler` |
|
|
475
|
+
| Sparse | `MaxAbsScaler` |
|
|
476
|
+
| Default | `StandardScaler` |
|
|
477
|
+
|
|
478
|
+
---
|
|
479
|
+
|
|
480
|
+
## Requirements
|
|
481
|
+
|
|
482
|
+
- Python >= 3.8
|
|
483
|
+
- pandas >= 1.3
|
|
484
|
+
- numpy >= 1.21
|
|
485
|
+
- scikit-learn >= 1.0
|
|
486
|
+
- scipy >= 1.7
|
|
487
|
+
- joblib >= 1.0
|
|
488
|
+
- openpyxl >= 3.0 (for Excel support)
|
|
489
|
+
|
|
490
|
+
---
|
|
491
|
+
|
|
492
|
+
## License
|
|
493
|
+
|
|
494
|
+
MIT
|