autopreprocess-lite 0.1.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. autopreprocess_lite-0.1.1/LICENSE +21 -0
  2. autopreprocess_lite-0.1.1/MANIFEST.in +7 -0
  3. autopreprocess_lite-0.1.1/PKG-INFO +58 -0
  4. autopreprocess_lite-0.1.1/README.md +36 -0
  5. autopreprocess_lite-0.1.1/autopreprocess/__init__.py +5 -0
  6. autopreprocess_lite-0.1.1/autopreprocess/core/__init__.py +0 -0
  7. autopreprocess_lite-0.1.1/autopreprocess/core/cleaner.py +371 -0
  8. autopreprocess_lite-0.1.1/autopreprocess/core/encoder.py +383 -0
  9. autopreprocess_lite-0.1.1/autopreprocess/core/pipeline.py +415 -0
  10. autopreprocess_lite-0.1.1/autopreprocess/core/scaler.py +197 -0
  11. autopreprocess_lite-0.1.1/autopreprocess/core/selector.py +325 -0
  12. autopreprocess_lite-0.1.1/autopreprocess/core/splitter.py +211 -0
  13. autopreprocess_lite-0.1.1/autopreprocess/detectors/__init__.py +0 -0
  14. autopreprocess_lite-0.1.1/autopreprocess/detectors/quality_detector.py +327 -0
  15. autopreprocess_lite-0.1.1/autopreprocess/detectors/type_detector.py +157 -0
  16. autopreprocess_lite-0.1.1/autopreprocess/utils/__init__.py +0 -0
  17. autopreprocess_lite-0.1.1/autopreprocess/utils/cache.py +107 -0
  18. autopreprocess_lite-0.1.1/autopreprocess/utils/memory.py +79 -0
  19. autopreprocess_lite-0.1.1/autopreprocess_lite.egg-info/PKG-INFO +58 -0
  20. autopreprocess_lite-0.1.1/autopreprocess_lite.egg-info/SOURCES.txt +38 -0
  21. autopreprocess_lite-0.1.1/autopreprocess_lite.egg-info/dependency_links.txt +1 -0
  22. autopreprocess_lite-0.1.1/autopreprocess_lite.egg-info/requires.txt +4 -0
  23. autopreprocess_lite-0.1.1/autopreprocess_lite.egg-info/top_level.txt +5 -0
  24. autopreprocess_lite-0.1.1/core/__init__.py +0 -0
  25. autopreprocess_lite-0.1.1/detectors/__init__.py +0 -0
  26. autopreprocess_lite-0.1.1/report/__init__.py +1 -0
  27. autopreprocess_lite-0.1.1/setup.cfg +4 -0
  28. autopreprocess_lite-0.1.1/setup.py +26 -0
  29. autopreprocess_lite-0.1.1/tests/test_ames_housing.py +163 -0
  30. autopreprocess_lite-0.1.1/tests/test_cache.py +40 -0
  31. autopreprocess_lite-0.1.1/tests/test_current_pipeline.py +110 -0
  32. autopreprocess_lite-0.1.1/tests/test_encoder.py +49 -0
  33. autopreprocess_lite-0.1.1/tests/test_memory.py +19 -0
  34. autopreprocess_lite-0.1.1/tests/test_pipeline.py +56 -0
  35. autopreprocess_lite-0.1.1/tests/test_quality_detector.py +55 -0
  36. autopreprocess_lite-0.1.1/tests/test_scaler.py +74 -0
  37. autopreprocess_lite-0.1.1/tests/test_selector.py +64 -0
  38. autopreprocess_lite-0.1.1/tests/test_splitter.py +64 -0
  39. autopreprocess_lite-0.1.1/tests/test_type_detector.py +44 -0
  40. autopreprocess_lite-0.1.1/utils/__init__.py +0 -0
autopreprocess_lite-0.1.1/LICENSE
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Ayush Gupta
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
autopreprocess_lite-0.1.1/MANIFEST.in
@@ -0,0 +1,7 @@
+ include autopreprocess/*.py
+ recursive-include autopreprocess *.py
+ recursive-include autopreprocess/core *.py
+ recursive-include autopreprocess/detectors *.py
+ recursive-include autopreprocess/utils *.py
+ include README.md
+ include LICENSE
autopreprocess_lite-0.1.1/PKG-INFO
@@ -0,0 +1,58 @@
+ Metadata-Version: 2.4
+ Name: autopreprocess-lite
+ Version: 0.1.1
+ Summary: Automatic Data Preprocessing Library
+ Author: Ayush Gupta
+ Author-email: guptaaayush0908@gmail.com
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: pandas>=1.3.0
+ Requires-Dist: numpy>=1.21.0
+ Requires-Dist: scikit-learn>=1.0.0
+ Requires-Dist: joblib>=1.1.0
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: license-file
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # AutoPreprocess
+
+ **Automatic Data Preprocessing Library for Machine Learning**
+
+ [![Python Version](https://img.shields.io/badge/python-3.8+-blue.svg)](https://python.org)
+ [![License](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
+
+ ## ✨ Features
+
+ - **Automatic column type detection** (numeric, categorical, datetime, useless)
+ - **Smart missing value handling** (based on missing percentage)
+ - **Outlier detection & capping** (IQR method)
+ - **Intelligent encoding** (One-hot, Frequency, Target encoding)
+ - **Feature scaling** (Standard, MinMax, Robust)
+ - **Feature selection** (Variance, Correlation, Importance, Mutual Info)
+ - **Train/Test split** (Random, Stratified, Time series)
+ - **Save & load pipeline** for production deployment
+ - **Zero data leakage** (fit only on training data)
+
+ ## 🚀 Quick Start
+
+ ```python
+ from autopreprocess import AutoClean
+
+ # One line to preprocess everything
+ pipeline = AutoClean('data.csv', target='price')
+ X_train, X_test, y_train, y_test = pipeline.preprocess()
+
+ # For new predictions
+ X_new_clean = pipeline.predict_ready_data(X_new)
+
+ # Save for later
+ pipeline.save('my_pipeline.pkl')
+
+ # Load and use
+ loaded = AutoClean.load('my_pipeline.pkl')
autopreprocess_lite-0.1.1/README.md
@@ -0,0 +1,36 @@
+ # AutoPreprocess
+
+ **Automatic Data Preprocessing Library for Machine Learning**
+
+ [![Python Version](https://img.shields.io/badge/python-3.8+-blue.svg)](https://python.org)
+ [![License](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
+
+ ## ✨ Features
+
+ - **Automatic column type detection** (numeric, categorical, datetime, useless)
+ - **Smart missing value handling** (based on missing percentage)
+ - **Outlier detection & capping** (IQR method)
+ - **Intelligent encoding** (One-hot, Frequency, Target encoding)
+ - **Feature scaling** (Standard, MinMax, Robust)
+ - **Feature selection** (Variance, Correlation, Importance, Mutual Info)
+ - **Train/Test split** (Random, Stratified, Time series)
+ - **Save & load pipeline** for production deployment
+ - **Zero data leakage** (fit only on training data)
+
+ ## 🚀 Quick Start
+
+ ```python
+ from autopreprocess import AutoClean
+
+ # One line to preprocess everything
+ pipeline = AutoClean('data.csv', target='price')
+ X_train, X_test, y_train, y_test = pipeline.preprocess()
+
+ # For new predictions
+ X_new_clean = pipeline.predict_ready_data(X_new)
+
+ # Save for later
+ pipeline.save('my_pipeline.pkl')
+
+ # Load and use
+ loaded = AutoClean.load('my_pipeline.pkl')
autopreprocess_lite-0.1.1/autopreprocess/__init__.py
@@ -0,0 +1,5 @@
+ from .core.pipeline import AutoClean
+
+ __version__ = "0.1.1"
+
+ __all__ = ['AutoClean']
autopreprocess_lite-0.1.1/autopreprocess/core/cleaner.py
@@ -0,0 +1,371 @@
+ """
+ Cleaner - Handles Missing Values, Outliers, Duplicates, and Useless Columns
+
+ What it does:
+ 1. Fills or removes missing values intelligently
+ 2. Caps outliers at percentile boundaries
+ 3. Removes duplicate rows
+ 4. Drops columns that have no predictive value
+
+ Why it's important:
+ - Missing values break most ML algorithms
+ - Outliers can skew models like Linear Regression
+ - Duplicates waste memory and cause overfitting
+ - Useless columns add noise and slow down training
+ """
+
+ import pandas as pd
+ import numpy as np
+ from typing import Dict, Any, Optional, List, Tuple
+
+ class DataCleaner:
+     """
+     Cleans a DataFrame by handling common data quality issues
+
+     Usage:
+         cleaner = DataCleaner()
+         df_clean = cleaner.clean_dataframe(df)
+     """
+
+     @staticmethod
+     def handle_missing_values(
+         df: pd.DataFrame,
+         missing_threshold_drop: float = 40.0,
+         verbose: bool = True
+     ) -> Tuple[pd.DataFrame, Dict[str, str]]:
+         """
+         Handle missing values in DataFrame
+
+         Strategy based on missing percentage:
+         - < 5%: Fill with median (numeric) or mode (categorical)
+         - 5-20%: Fill with KNN imputation (this version uses median/mode)
+         - 20-40%: Create missing indicator column + fill
+         - > 40%: Drop column
+
+         Args:
+             df: Input DataFrame
+             missing_threshold_drop: Drop column if missing % above this (default 40)
+             verbose: Print actions taken
+
+         Returns:
+             Tuple of (cleaned_df, actions_taken)
+         """
+
+         df_clean = df.copy()
+         actions = {}
+         total_rows = len(df_clean)
+
+         if verbose:
+             print("\n🔧 HANDLING MISSING VALUES")
+             print("="*40)
+
+         for col in df_clean.columns:
+             missing_count = df_clean[col].isnull().sum()
+             missing_pct = (missing_count / total_rows) * 100
+
+             if missing_count == 0:
+                 continue  # No missing values, skip
+
+             # Determine strategy based on missing percentage
+             if missing_pct >= missing_threshold_drop:
+                 # Drop column
+                 df_clean = df_clean.drop(columns=[col])
+                 actions[col] = f"dropped ({missing_pct:.1f}% missing)"
+                 if verbose:
+                     print(f" ❌ Dropped '{col}': {missing_pct:.1f}% missing")
+
+             elif missing_pct >= 20:
+                 # Create missing indicator column + fill
+                 indicator_col = f"{col}_was_missing"
+                 df_clean[indicator_col] = df_clean[col].isnull().astype(int)
+
+                 # Fill with median/mode
+                 if pd.api.types.is_numeric_dtype(df_clean[col]):
+                     fill_value = df_clean[col].median()
+                 else:
+                     fill_value = df_clean[col].mode()[0] if len(df_clean[col].mode()) > 0 else "Unknown"
+
+                 df_clean[col] = df_clean[col].fillna(fill_value)
+                 actions[col] = f"filled with {fill_value}, added '{indicator_col}' flag ({missing_pct:.1f}% missing)"
+                 if verbose:
+                     print(f" 📝 '{col}': {missing_pct:.1f}% missing → filled + added indicator")
+
+             elif missing_pct >= 5:
+                 # Advanced imputation (simplified - uses median/mode for now)
+                 # In production, you'd use KNN or MICE here
+                 if pd.api.types.is_numeric_dtype(df_clean[col]):
+                     fill_value = df_clean[col].median()
+                 else:
+                     fill_value = df_clean[col].mode()[0] if len(df_clean[col].mode()) > 0 else "Unknown"
+
+                 df_clean[col] = df_clean[col].fillna(fill_value)
+                 actions[col] = f"imputed with {fill_value} ({missing_pct:.1f}% missing)"
+                 if verbose:
+                     print(f" 🔧 '{col}': {missing_pct:.1f}% missing → imputed with {fill_value}")
+
+             else:
+                 # Simple imputation (<5% missing)
+                 if pd.api.types.is_numeric_dtype(df_clean[col]):
+                     fill_value = df_clean[col].median()
+                 else:
+                     fill_value = df_clean[col].mode()[0] if len(df_clean[col].mode()) > 0 else "Unknown"
+
+                 df_clean[col] = df_clean[col].fillna(fill_value)
+                 actions[col] = f"filled with {fill_value} ({missing_pct:.1f}% missing)"
+                 if verbose:
+                     print(f" ✅ '{col}': {missing_pct:.1f}% missing → filled with {fill_value}")
+
+         if verbose and len(actions) == 0:
+             print(" ✅ No missing values found!")
+
+         return df_clean, actions
+
+     @staticmethod
+     def handle_outliers(
+         df: pd.DataFrame,
+         lower_percentile: float = 1.0,
+         upper_percentile: float = 99.0,
+         verbose: bool = True
+     ) -> Tuple[pd.DataFrame, Dict[str, Dict[str, float]]]:
+         """
+         Cap outliers at specified percentiles
+
+         How it works:
+         1. Find P1 and P99 (or custom percentiles)
+         2. Any value < P1 becomes P1
+         3. Any value > P99 becomes P99
+
+         This is called "Winsorization" - it keeps outliers but limits their impact.
+
+         Args:
+             df: Input DataFrame
+             lower_percentile: Lower bound percentile (default 1)
+             upper_percentile: Upper bound percentile (default 99)
+             verbose: Print actions taken
+
+         Returns:
+             Tuple of (cleaned_df, outlier_info)
+         """
+
+         df_clean = df.copy()
+         outlier_info = {}
+
+         # Only process numeric columns
+         numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
+
+         if verbose:
+             print("\n🔧 HANDLING OUTLIERS")
+             print("="*40)
+
+         for col in numeric_cols:
+             # Skip if all values are same (constant)
+             if df_clean[col].nunique() == 1:
+                 continue
+
+             # Calculate percentiles
+             lower_bound = df_clean[col].quantile(lower_percentile / 100)
+             upper_bound = df_clean[col].quantile(upper_percentile / 100)
+
+             # Count outliers before capping
+             outliers_before = ((df_clean[col] < lower_bound) | (df_clean[col] > upper_bound)).sum()
+             outlier_pct = (outliers_before / len(df_clean)) * 100
+
+             if outliers_before == 0:
+                 continue
+
+             # Cap outliers
+             df_clean[col] = df_clean[col].clip(lower=lower_bound, upper=upper_bound)
+
+             outlier_info[col] = {
+                 'lower_bound': round(lower_bound, 2),
+                 'upper_bound': round(upper_bound, 2),
+                 'outliers_capped': int(outliers_before),
+                 'outlier_pct': round(outlier_pct, 2)
+             }
+
+             if verbose:
+                 print(f" ✂️ '{col}': capped {outliers_before} outliers ({outlier_pct:.1f}%)")
+                 print(f"    Range limited to [{lower_bound:.2f}, {upper_bound:.2f}]")
+
+         if verbose and len(outlier_info) == 0:
+             print(" ✅ No outliers found!")
+
+         return df_clean, outlier_info
+
+     @staticmethod
+     def remove_duplicates(
+         df: pd.DataFrame,
+         keep: str = 'first',
+         verbose: bool = True
+     ) -> Tuple[pd.DataFrame, int]:
+         """
+         Remove duplicate rows from DataFrame
+
+         Args:
+             df: Input DataFrame
+             keep: Which duplicate to keep ('first', 'last', or False for none)
+             verbose: Print actions taken
+
+         Returns:
+             Tuple of (cleaned_df, number_of_duplicates_removed)
+         """
+
+         duplicates_before = df.duplicated().sum()
+
+         if duplicates_before == 0:
+             if verbose:
+                 print("\n🔧 REMOVING DUPLICATES")
+                 print("="*40)
+                 print(" ✅ No duplicate rows found!")
+             return df.copy(), 0
+
+         df_clean = df.drop_duplicates(keep=keep)
+         duplicates_removed = duplicates_before
+
+         if verbose:
+             print("\n🔧 REMOVING DUPLICATES")
+             print("="*40)
+             print(f" 🗑️ Removed {duplicates_removed} duplicate rows")
+             print(f" 📊 Shape: {df.shape} → {df_clean.shape}")
+
+         return df_clean, duplicates_removed
+
+     @staticmethod
+     def drop_useless_columns(
+         df: pd.DataFrame,
+         verbose: bool = True
+     ) -> Tuple[pd.DataFrame, List[str]]:
+         """
+         Drop columns that are useless (constant or sequential identifiers)
+
+         Args:
+             df: Input DataFrame
+             verbose: Print actions taken
+
+         Returns:
+             Tuple of (cleaned_df, list_of_dropped_columns)
+         """
+
+         df_clean = df.copy()
+         dropped = []
+
+         if verbose:
+             print("\n🔧 DROPPING USELESS COLUMNS")
+             print("="*40)
+
+         for col in df_clean.columns:
+             # Check if constant (all same value)
+             if df_clean[col].nunique() == 1:
+                 df_clean = df_clean.drop(columns=[col])
+                 dropped.append(col)
+                 if verbose:
+                     constant_value = str(df[col].iloc[0]) if len(df) > 0 else "N/A"
+                     if len(constant_value) > 30:
+                         constant_value = constant_value[:27] + "..."
+                     print(f" 🗑️ Dropped '{col}': constant value '{constant_value}'")
+
+             # Check if identifier (sequential numbers, all unique)
+             elif df_clean[col].nunique() == len(df_clean) and len(df_clean) > 10:
+                 if pd.api.types.is_numeric_dtype(df_clean[col]):
+                     sorted_vals = df_clean[col].dropna().sort_values().values
+                     if len(sorted_vals) > 1:
+                         # Check if values are sequential (difference of 1)
+                         is_sequential = all(
+                             sorted_vals[i+1] - sorted_vals[i] == 1
+                             for i in range(len(sorted_vals)-1)
+                         )
+                         if is_sequential:
+                             df_clean = df_clean.drop(columns=[col])
+                             dropped.append(col)
+                             if verbose:
+                                 print(f" 🗑️ Dropped '{col}': sequential ID")
+
+         if verbose and len(dropped) == 0:
+             print(" ✅ No useless columns found")
+         elif verbose and len(dropped) > 0:
+             print(f" 📊 Total useless columns dropped: {len(dropped)}")
+
+         return df_clean, dropped
+
+     @staticmethod
+     def clean_dataframe(
+         df: pd.DataFrame,
+         handle_missing: bool = True,
+         handle_outliers: bool = True,
+         remove_duplicates: bool = True,
+         drop_useless: bool = True,
+         verbose: bool = True
+     ) -> Dict[str, Any]:
+         """
+         Main cleaning function - runs all cleaning steps
+
+         Args:
+             df: Input DataFrame
+             handle_missing: Whether to handle missing values
+             handle_outliers: Whether to handle outliers
+             remove_duplicates: Whether to remove duplicate rows
+             drop_useless: Whether to drop useless columns
+             verbose: Print progress
+
+         Returns:
+             Dictionary with cleaned DataFrame and cleaning report
+         """
+
+         df_clean = df.copy()
+         cleaning_report = {
+             'original_shape': df.shape,
+             'steps_performed': [],
+             'missing_handling': {},
+             'outlier_handling': {},
+             'duplicates_removed': 0,
+             'useless_columns_dropped': [],
+             'final_shape': None
+         }
+
+         if verbose:
+             print("\n" + "="*60)
+             print("🧹 DATA CLEANING PIPELINE")
+             print("="*60)
+             print(f"📊 Original shape: {df.shape[0]} rows, {df.shape[1]} columns")
+
+         # Step 1: Remove duplicates
+         if remove_duplicates:
+             df_clean, dup_removed = DataCleaner.remove_duplicates(df_clean, verbose=verbose)
+             cleaning_report['duplicates_removed'] = dup_removed
+             if dup_removed > 0:
+                 cleaning_report['steps_performed'].append('removed_duplicates')
+
+         # Step 2: Drop useless columns
+         if drop_useless:
+             df_clean, dropped = DataCleaner.drop_useless_columns(df_clean, verbose=verbose)
+             cleaning_report['useless_columns_dropped'] = dropped
+             if dropped:
+                 cleaning_report['steps_performed'].append('dropped_useless_columns')
+
+         # Step 3: Handle missing values
+         if handle_missing:
+             df_clean, missing_actions = DataCleaner.handle_missing_values(df_clean, verbose=verbose)
+             cleaning_report['missing_handling'] = missing_actions
+             if missing_actions:
+                 cleaning_report['steps_performed'].append('handled_missing_values')
+
+         # Step 4: Handle outliers
+         if handle_outliers:
+             df_clean, outlier_actions = DataCleaner.handle_outliers(df_clean, verbose=verbose)
+             cleaning_report['outlier_handling'] = outlier_actions
+             if outlier_actions:
+                 cleaning_report['steps_performed'].append('handled_outliers')
+
+         cleaning_report['final_shape'] = df_clean.shape
+
+         if verbose:
+             print("\n" + "="*40)
+             print("✅ CLEANING COMPLETE")
+             print("="*40)
+             print(f"📊 Final shape: {df_clean.shape[0]} rows, {df_clean.shape[1]} columns")
+             print(f"📈 Rows removed: {df.shape[0] - df_clean.shape[0]}")
+             print(f"📉 Columns removed: {df.shape[1] - df_clean.shape[1]}")
+
+         cleaning_report['cleaned_data'] = df_clean
+
+         return cleaning_report
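
For reference, here is a minimal sketch of how the `DataCleaner` shipped in this release could be exercised end to end, based on the docstrings above. The toy DataFrame and its column names are illustrative assumptions, not part of the package; the import path follows the `autopreprocess/core/cleaner.py` layout in the file list.

```python
import numpy as np
import pandas as pd

from autopreprocess.core.cleaner import DataCleaner

# Toy frame that triggers every documented cleaning path: a sequential ID,
# a constant column, 5% missing values, one extreme outlier, and a duplicate row.
df = pd.DataFrame({
    "id": range(1, 21),                          # sequential -> dropped as an ID
    "const": ["x"] * 20,                         # constant -> dropped as useless
    "price": [100.0] * 18 + [np.nan, 10_000.0],  # 5% missing + one outlier
    "city": ["NY", "LA"] * 10,
})
df = pd.concat([df, df.iloc[[0]]], ignore_index=True)  # add a duplicate row

report = DataCleaner.clean_dataframe(df, verbose=False)
print(report["steps_performed"])
# ['removed_duplicates', 'dropped_useless_columns',
#  'handled_missing_values', 'handled_outliers']
print(report["cleaned_data"].shape)  # (20, 2) -> only 'price' and 'city' remain
```

Note the order the steps run in: duplicates are removed first, so after deduplication the `id` column is all-unique and sequential again and gets dropped; the single missing `price` lands in the 5-20% band and is imputed with the median, and the 10,000 value is then capped near the 99th percentile.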