autopreprocess-lite 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autopreprocess_lite-0.1.1/LICENSE +21 -0
- autopreprocess_lite-0.1.1/MANIFEST.in +7 -0
- autopreprocess_lite-0.1.1/PKG-INFO +58 -0
- autopreprocess_lite-0.1.1/README.md +36 -0
- autopreprocess_lite-0.1.1/autopreprocess/__init__.py +5 -0
- autopreprocess_lite-0.1.1/autopreprocess/core/__init__.py +0 -0
- autopreprocess_lite-0.1.1/autopreprocess/core/cleaner.py +371 -0
- autopreprocess_lite-0.1.1/autopreprocess/core/encoder.py +383 -0
- autopreprocess_lite-0.1.1/autopreprocess/core/pipeline.py +415 -0
- autopreprocess_lite-0.1.1/autopreprocess/core/scaler.py +197 -0
- autopreprocess_lite-0.1.1/autopreprocess/core/selector.py +325 -0
- autopreprocess_lite-0.1.1/autopreprocess/core/splitter.py +211 -0
- autopreprocess_lite-0.1.1/autopreprocess/detectors/__init__.py +0 -0
- autopreprocess_lite-0.1.1/autopreprocess/detectors/quality_detector.py +327 -0
- autopreprocess_lite-0.1.1/autopreprocess/detectors/type_detector.py +157 -0
- autopreprocess_lite-0.1.1/autopreprocess/utils/__init__.py +0 -0
- autopreprocess_lite-0.1.1/autopreprocess/utils/cache.py +107 -0
- autopreprocess_lite-0.1.1/autopreprocess/utils/memory.py +79 -0
- autopreprocess_lite-0.1.1/autopreprocess_lite.egg-info/PKG-INFO +58 -0
- autopreprocess_lite-0.1.1/autopreprocess_lite.egg-info/SOURCES.txt +38 -0
- autopreprocess_lite-0.1.1/autopreprocess_lite.egg-info/dependency_links.txt +1 -0
- autopreprocess_lite-0.1.1/autopreprocess_lite.egg-info/requires.txt +4 -0
- autopreprocess_lite-0.1.1/autopreprocess_lite.egg-info/top_level.txt +5 -0
- autopreprocess_lite-0.1.1/core/__init__.py +0 -0
- autopreprocess_lite-0.1.1/detectors/__init__.py +0 -0
- autopreprocess_lite-0.1.1/report/__init__.py +1 -0
- autopreprocess_lite-0.1.1/setup.cfg +4 -0
- autopreprocess_lite-0.1.1/setup.py +26 -0
- autopreprocess_lite-0.1.1/tests/test_ames_housing.py +163 -0
- autopreprocess_lite-0.1.1/tests/test_cache.py +40 -0
- autopreprocess_lite-0.1.1/tests/test_current_pipeline.py +110 -0
- autopreprocess_lite-0.1.1/tests/test_encoder.py +49 -0
- autopreprocess_lite-0.1.1/tests/test_memory.py +19 -0
- autopreprocess_lite-0.1.1/tests/test_pipeline.py +56 -0
- autopreprocess_lite-0.1.1/tests/test_quality_detector.py +55 -0
- autopreprocess_lite-0.1.1/tests/test_scaler.py +74 -0
- autopreprocess_lite-0.1.1/tests/test_selector.py +64 -0
- autopreprocess_lite-0.1.1/tests/test_splitter.py +64 -0
- autopreprocess_lite-0.1.1/tests/test_type_detector.py +44 -0
- autopreprocess_lite-0.1.1/utils/__init__.py +0 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Ayush Gupta
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: autopreprocess-lite
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Automatic Data Preprocessing Library
|
|
5
|
+
Author: Ayush Gupta
|
|
6
|
+
Author-email: guptaaayush0908@gmail.com
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: pandas>=1.3.0
|
|
11
|
+
Requires-Dist: numpy>=1.21.0
|
|
12
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
13
|
+
Requires-Dist: joblib>=1.1.0
|
|
14
|
+
Dynamic: author
|
|
15
|
+
Dynamic: author-email
|
|
16
|
+
Dynamic: description
|
|
17
|
+
Dynamic: description-content-type
|
|
18
|
+
Dynamic: license-file
|
|
19
|
+
Dynamic: requires-dist
|
|
20
|
+
Dynamic: requires-python
|
|
21
|
+
Dynamic: summary
|
|
22
|
+
|
|
23
|
+
# AutoPreprocess
|
|
24
|
+
|
|
25
|
+
**Automatic Data Preprocessing Library for Machine Learning**
|
|
26
|
+
|
|
27
|
+
[](https://python.org)
|
|
28
|
+
[](LICENSE)
|
|
29
|
+
|
|
30
|
+
## ✨ Features
|
|
31
|
+
|
|
32
|
+
- **Automatic column type detection** (numeric, categorical, datetime, useless)
|
|
33
|
+
- **Smart missing value handling** (based on missing percentage)
|
|
34
|
+
- **Outlier detection & capping** (IQR method)
|
|
35
|
+
- **Intelligent encoding** (One-hot, Frequency, Target encoding)
|
|
36
|
+
- **Feature scaling** (Standard, MinMax, Robust)
|
|
37
|
+
- **Feature selection** (Variance, Correlation, Importance, Mutual Info)
|
|
38
|
+
- **Train/Test split** (Random, Stratified, Time series)
|
|
39
|
+
- **Save & load pipeline** for production deployment
|
|
40
|
+
- **Zero data leakage** (fit only on training data)
|
|
41
|
+
|
|
42
|
+
## 🚀 Quick Start
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from autopreprocess import AutoClean
|
|
46
|
+
|
|
47
|
+
# One line to preprocess everything
|
|
48
|
+
pipeline = AutoClean('data.csv', target='price')
|
|
49
|
+
X_train, X_test, y_train, y_test = pipeline.preprocess()
|
|
50
|
+
|
|
51
|
+
# For new predictions
|
|
52
|
+
X_new_clean = pipeline.predict_ready_data(X_new)
|
|
53
|
+
|
|
54
|
+
# Save for later
|
|
55
|
+
pipeline.save('my_pipeline.pkl')
|
|
56
|
+
|
|
57
|
+
# Load and use
|
|
58
|
+
loaded = AutoClean.load('my_pipeline.pkl')
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# AutoPreprocess
|
|
2
|
+
|
|
3
|
+
**Automatic Data Preprocessing Library for Machine Learning**
|
|
4
|
+
|
|
5
|
+
[](https://python.org)
|
|
6
|
+
[](LICENSE)
|
|
7
|
+
|
|
8
|
+
## ✨ Features
|
|
9
|
+
|
|
10
|
+
- **Automatic column type detection** (numeric, categorical, datetime, useless)
|
|
11
|
+
- **Smart missing value handling** (based on missing percentage)
|
|
12
|
+
- **Outlier detection & capping** (IQR method)
|
|
13
|
+
- **Intelligent encoding** (One-hot, Frequency, Target encoding)
|
|
14
|
+
- **Feature scaling** (Standard, MinMax, Robust)
|
|
15
|
+
- **Feature selection** (Variance, Correlation, Importance, Mutual Info)
|
|
16
|
+
- **Train/Test split** (Random, Stratified, Time series)
|
|
17
|
+
- **Save & load pipeline** for production deployment
|
|
18
|
+
- **Zero data leakage** (fit only on training data)
|
|
19
|
+
|
|
20
|
+
## 🚀 Quick Start
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
from autopreprocess import AutoClean
|
|
24
|
+
|
|
25
|
+
# One line to preprocess everything
|
|
26
|
+
pipeline = AutoClean('data.csv', target='price')
|
|
27
|
+
X_train, X_test, y_train, y_test = pipeline.preprocess()
|
|
28
|
+
|
|
29
|
+
# For new predictions
|
|
30
|
+
X_new_clean = pipeline.predict_ready_data(X_new)
|
|
31
|
+
|
|
32
|
+
# Save for later
|
|
33
|
+
pipeline.save('my_pipeline.pkl')
|
|
34
|
+
|
|
35
|
+
# Load and use
|
|
36
|
+
loaded = AutoClean.load('my_pipeline.pkl')
|
|
File without changes
|
|
@@ -0,0 +1,371 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Cleaner - Handles Missing Values, Outliers, Duplicates, and Useless Columns
|
|
3
|
+
|
|
4
|
+
What it does:
|
|
5
|
+
1. Fills or removes missing values intelligently
|
|
6
|
+
2. Caps outliers at percentile boundaries
|
|
7
|
+
3. Removes duplicate rows
|
|
8
|
+
4. Drops columns that have no predictive value
|
|
9
|
+
|
|
10
|
+
Why it's important:
|
|
11
|
+
- Missing values break most ML algorithms
|
|
12
|
+
- Outliers can skew models like Linear Regression
|
|
13
|
+
- Duplicates waste memory and cause overfitting
|
|
14
|
+
- Useless columns add noise and slow down training
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import pandas as pd
|
|
18
|
+
import numpy as np
|
|
19
|
+
from typing import Dict, Any, Optional, List, Tuple
|
|
20
|
+
|
|
21
|
+
class DataCleaner:
    """
    Cleans a DataFrame by handling common data quality issues.

    Four independent, composable steps (all static methods):
      1. ``handle_missing_values`` - fill or drop columns by missing %
      2. ``handle_outliers``       - Winsorize numeric columns at percentiles
      3. ``remove_duplicates``     - drop duplicate rows
      4. ``drop_useless_columns``  - drop constant / sequential-ID columns

    ``clean_dataframe`` runs all four in order and returns a report dict.

    Usage:
        cleaner = DataCleaner()
        report = cleaner.clean_dataframe(df)
        df_clean = report['cleaned_data']
    """

    @staticmethod
    def _simple_fill_value(series: pd.Series) -> Any:
        """Return the median for a numeric series, else the mode (or 'Unknown'
        when the series has no mode, e.g. it is entirely null)."""
        if pd.api.types.is_numeric_dtype(series):
            return series.median()
        modes = series.mode()
        return modes[0] if len(modes) > 0 else "Unknown"

    @staticmethod
    def handle_missing_values(
        df: pd.DataFrame,
        missing_threshold_drop: float = 40.0,
        verbose: bool = True
    ) -> Tuple[pd.DataFrame, Dict[str, str]]:
        """
        Handle missing values in a DataFrame.

        Strategy based on missing percentage:
        - < 5%: fill with median (numeric) or mode (categorical)
        - 5-20%: impute (simplified version uses median/mode; a production
          version could use KNN or MICE here)
        - 20-40%: create a ``<col>_was_missing`` indicator column, then fill
        - >= ``missing_threshold_drop``: drop the column entirely

        Args:
            df: Input DataFrame.
            missing_threshold_drop: Drop a column when its missing percentage
                is at or above this value (default 40).
            verbose: Print actions taken.

        Returns:
            Tuple of (cleaned_df, actions_taken) where ``actions_taken`` maps
            column name -> human-readable description of what was done.
        """
        df_clean = df.copy()
        actions: Dict[str, str] = {}
        total_rows = len(df_clean)

        if verbose:
            print("\n🔧 HANDLING MISSING VALUES")
            print("=" * 40)

        # Snapshot the column list: we drop/add columns while iterating.
        for col in list(df_clean.columns):
            missing_count = df_clean[col].isnull().sum()
            missing_pct = (missing_count / total_rows) * 100

            if missing_count == 0:
                continue  # nothing to do for this column

            if missing_pct >= missing_threshold_drop:
                # Too sparse to be trustworthy: drop the whole column.
                df_clean = df_clean.drop(columns=[col])
                actions[col] = f"dropped ({missing_pct:.1f}% missing)"
                if verbose:
                    print(f"   ❌ Dropped '{col}': {missing_pct:.1f}% missing")

            elif missing_pct >= 20:
                # Missingness itself may be predictive: keep a flag column.
                indicator_col = f"{col}_was_missing"
                df_clean[indicator_col] = df_clean[col].isnull().astype(int)

                fill_value = DataCleaner._simple_fill_value(df_clean[col])
                df_clean[col] = df_clean[col].fillna(fill_value)
                actions[col] = (
                    f"filled with {fill_value}, added '{indicator_col}' flag "
                    f"({missing_pct:.1f}% missing)"
                )
                if verbose:
                    print(f"   📊 '{col}': {missing_pct:.1f}% missing → filled + added indicator")

            elif missing_pct >= 5:
                # Moderate missingness: impute (median/mode stand-in for
                # a fancier imputer).
                fill_value = DataCleaner._simple_fill_value(df_clean[col])
                df_clean[col] = df_clean[col].fillna(fill_value)
                actions[col] = f"imputed with {fill_value} ({missing_pct:.1f}% missing)"
                if verbose:
                    print(f"   🔧 '{col}': {missing_pct:.1f}% missing → imputed with {fill_value}")

            else:
                # Light missingness (<5%): simple fill is safe.
                fill_value = DataCleaner._simple_fill_value(df_clean[col])
                df_clean[col] = df_clean[col].fillna(fill_value)
                actions[col] = f"filled with {fill_value} ({missing_pct:.1f}% missing)"
                if verbose:
                    print(f"   ✅ '{col}': {missing_pct:.1f}% missing → filled with {fill_value}")

        if verbose and len(actions) == 0:
            print("   ✅ No missing values found!")

        return df_clean, actions

    @staticmethod
    def handle_outliers(
        df: pd.DataFrame,
        lower_percentile: float = 1.0,
        upper_percentile: float = 99.0,
        verbose: bool = True
    ) -> Tuple[pd.DataFrame, Dict[str, Dict[str, float]]]:
        """
        Cap outliers at the given percentiles ("Winsorization").

        How it works:
        1. Compute the lower/upper percentile bounds per numeric column.
        2. Any value below the lower bound becomes the lower bound.
        3. Any value above the upper bound becomes the upper bound.

        This keeps outlier rows but limits their influence on models that are
        sensitive to extreme values (e.g. linear regression).

        Args:
            df: Input DataFrame.
            lower_percentile: Lower bound percentile (default 1).
            upper_percentile: Upper bound percentile (default 99).
            verbose: Print actions taken.

        Returns:
            Tuple of (cleaned_df, outlier_info) where ``outlier_info`` maps
            column name -> {lower_bound, upper_bound, outliers_capped,
            outlier_pct}.
        """
        df_clean = df.copy()
        outlier_info: Dict[str, Dict[str, float]] = {}

        # Only numeric columns can meaningfully have percentile bounds.
        numeric_cols = df_clean.select_dtypes(include=[np.number]).columns

        if verbose:
            print("\n🔧 HANDLING OUTLIERS")
            print("=" * 40)

        for col in numeric_cols:
            # A constant column has no outliers by definition.
            if df_clean[col].nunique() == 1:
                continue

            lower_bound = df_clean[col].quantile(lower_percentile / 100)
            upper_bound = df_clean[col].quantile(upper_percentile / 100)

            # Count how many values fall outside the bounds before capping.
            outliers_before = ((df_clean[col] < lower_bound) | (df_clean[col] > upper_bound)).sum()
            outlier_pct = (outliers_before / len(df_clean)) * 100

            if outliers_before == 0:
                continue

            df_clean[col] = df_clean[col].clip(lower=lower_bound, upper=upper_bound)

            outlier_info[col] = {
                'lower_bound': round(lower_bound, 2),
                'upper_bound': round(upper_bound, 2),
                'outliers_capped': int(outliers_before),
                'outlier_pct': round(outlier_pct, 2)
            }

            if verbose:
                print(f"   ✂️ '{col}': capped {outliers_before} outliers ({outlier_pct:.1f}%)")
                print(f"      Range limited to [{lower_bound:.2f}, {upper_bound:.2f}]")

        if verbose and len(outlier_info) == 0:
            print("   ✅ No outliers found!")

        return df_clean, outlier_info

    @staticmethod
    def remove_duplicates(
        df: pd.DataFrame,
        keep: str = 'first',
        verbose: bool = True
    ) -> Tuple[pd.DataFrame, int]:
        """
        Remove duplicate rows from a DataFrame.

        Args:
            df: Input DataFrame.
            keep: Which duplicate to keep ('first', 'last', or False for none).
            verbose: Print actions taken.

        Returns:
            Tuple of (cleaned_df, number_of_rows_removed).
        """
        df_clean = df.drop_duplicates(keep=keep)
        # Count actual rows removed. NOTE: df.duplicated().sum() would
        # undercount when keep=False, because then *every* member of a
        # duplicate group is dropped, not n-1 of them.
        duplicates_removed = len(df) - len(df_clean)

        if duplicates_removed == 0:
            if verbose:
                print("\n🔧 REMOVING DUPLICATES")
                print("=" * 40)
                print("   ✅ No duplicate rows found!")
            return df.copy(), 0

        if verbose:
            print("\n🔧 REMOVING DUPLICATES")
            print("=" * 40)
            print(f"   🗑️ Removed {duplicates_removed} duplicate rows")
            print(f"   📊 Shape: {df.shape} → {df_clean.shape}")

        return df_clean, duplicates_removed

    @staticmethod
    def drop_useless_columns(
        df: pd.DataFrame,
        verbose: bool = True
    ) -> Tuple[pd.DataFrame, List[str]]:
        """
        Drop columns with no predictive value: constant columns, and numeric
        columns that are sequential identifiers (all unique, step of exactly 1).

        Args:
            df: Input DataFrame.
            verbose: Print actions taken.

        Returns:
            Tuple of (cleaned_df, list_of_dropped_columns).
        """
        df_clean = df.copy()
        dropped: List[str] = []

        if verbose:
            print("\n🔧 DROPPING USELESS COLUMNS")
            print("=" * 40)

        # Snapshot the column list: we drop columns while iterating.
        for col in list(df_clean.columns):
            # Case 1: constant column (a single distinct value).
            if df_clean[col].nunique() == 1:
                df_clean = df_clean.drop(columns=[col])
                dropped.append(col)
                if verbose:
                    constant_value = str(df[col].iloc[0]) if len(df) > 0 else "N/A"
                    if len(constant_value) > 30:
                        constant_value = constant_value[:27] + "..."
                    print(f"   🗑️ Dropped '{col}': constant value '{constant_value}'")

            # Case 2: likely identifier - every value unique on a non-trivial
            # number of rows (>10 guards against tiny frames).
            elif df_clean[col].nunique() == len(df_clean) and len(df_clean) > 10:
                if pd.api.types.is_numeric_dtype(df_clean[col]):
                    sorted_vals = df_clean[col].dropna().sort_values().values
                    if len(sorted_vals) > 1:
                        # Sequential means consecutive values differ by exactly
                        # 1 everywhere (vectorized instead of a Python loop).
                        is_sequential = bool(np.all(np.diff(sorted_vals) == 1))
                        if is_sequential:
                            df_clean = df_clean.drop(columns=[col])
                            dropped.append(col)
                            if verbose:
                                print(f"   🗑️ Dropped '{col}': sequential ID")

        if verbose and len(dropped) == 0:
            print("   ✅ No useless columns found")
        elif verbose and len(dropped) > 0:
            print(f"   📊 Total useless columns dropped: {len(dropped)}")

        return df_clean, dropped

    @staticmethod
    def clean_dataframe(
        df: pd.DataFrame,
        handle_missing: bool = True,
        handle_outliers: bool = True,
        remove_duplicates: bool = True,
        drop_useless: bool = True,
        verbose: bool = True
    ) -> Dict[str, Any]:
        """
        Main cleaning function - runs all cleaning steps in order:
        duplicates -> useless columns -> missing values -> outliers.

        Args:
            df: Input DataFrame.
            handle_missing: Whether to handle missing values.
            handle_outliers: Whether to cap outliers.
            remove_duplicates: Whether to remove duplicate rows.
            drop_useless: Whether to drop useless columns.
            verbose: Print progress.

        Returns:
            Dictionary with the cleaned DataFrame (key ``'cleaned_data'``) and
            a cleaning report (shapes, steps performed, per-step details).
        """
        df_clean = df.copy()
        cleaning_report: Dict[str, Any] = {
            'original_shape': df.shape,
            'steps_performed': [],
            'missing_handling': {},
            'outlier_handling': {},
            'duplicates_removed': 0,
            'useless_columns_dropped': [],
            'final_shape': None
        }

        if verbose:
            print("\n" + "=" * 60)
            print("🧹 DATA CLEANING PIPELINE")
            print("=" * 60)
            print(f"📊 Original shape: {df.shape[0]} rows, {df.shape[1]} columns")

        # Step 1: Remove duplicates (before anything else, so counts are
        # computed on the raw data).
        if remove_duplicates:
            df_clean, dup_removed = DataCleaner.remove_duplicates(df_clean, verbose=verbose)
            cleaning_report['duplicates_removed'] = dup_removed
            if dup_removed > 0:
                cleaning_report['steps_performed'].append('removed_duplicates')

        # Step 2: Drop useless columns.
        if drop_useless:
            df_clean, dropped = DataCleaner.drop_useless_columns(df_clean, verbose=verbose)
            cleaning_report['useless_columns_dropped'] = dropped
            if dropped:
                cleaning_report['steps_performed'].append('dropped_useless_columns')

        # Step 3: Handle missing values.
        if handle_missing:
            df_clean, missing_actions = DataCleaner.handle_missing_values(df_clean, verbose=verbose)
            cleaning_report['missing_handling'] = missing_actions
            if missing_actions:
                cleaning_report['steps_performed'].append('handled_missing_values')

        # Step 4: Handle outliers (last, so bounds reflect the cleaned data).
        if handle_outliers:
            df_clean, outlier_actions = DataCleaner.handle_outliers(df_clean, verbose=verbose)
            cleaning_report['outlier_handling'] = outlier_actions
            if outlier_actions:
                cleaning_report['steps_performed'].append('handled_outliers')

        cleaning_report['final_shape'] = df_clean.shape

        if verbose:
            print("\n" + "=" * 40)
            print("✅ CLEANING COMPLETE")
            print("=" * 40)
            print(f"📊 Final shape: {df_clean.shape[0]} rows, {df_clean.shape[1]} columns")
            print(f"📉 Rows removed: {df.shape[0] - df_clean.shape[0]}")
            print(f"📉 Columns removed: {df.shape[1] - df_clean.shape[1]}")

        cleaning_report['cleaned_data'] = df_clean

        return cleaning_report