preplify 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. preplify-1.0.0/LICENSE +5 -0
  2. preplify-1.0.0/PKG-INFO +406 -0
  3. preplify-1.0.0/README.md +377 -0
  4. preplify-1.0.0/preplify/__init__.py +69 -0
  5. preplify-1.0.0/preplify/cleaning/__init__.py +0 -0
  6. preplify-1.0.0/preplify/cleaning/remove_duplicates.py +13 -0
  7. preplify-1.0.0/preplify/cleaning/remove_empty_rows.py +13 -0
  8. preplify-1.0.0/preplify/cleaning/standardize_columns.py +25 -0
  9. preplify-1.0.0/preplify/core/__init__.py +0 -0
  10. preplify-1.0.0/preplify/core/auto.py +26 -0
  11. preplify-1.0.0/preplify/core/automl_prep.py +85 -0
  12. preplify-1.0.0/preplify/core/pipeline.py +72 -0
  13. preplify-1.0.0/preplify/datasets/__init__.py +0 -0
  14. preplify-1.0.0/preplify/datasets/demo_datasets.py +74 -0
  15. preplify-1.0.0/preplify/encoding/__init__.py +0 -0
  16. preplify-1.0.0/preplify/encoding/auto_encoder.py +32 -0
  17. preplify-1.0.0/preplify/encoding/label_encoder.py +25 -0
  18. preplify-1.0.0/preplify/encoding/onehot_encoder.py +25 -0
  19. preplify-1.0.0/preplify/feature_engineering/__init__.py +0 -0
  20. preplify-1.0.0/preplify/feature_engineering/auto_feature_engineering.py +27 -0
  21. preplify-1.0.0/preplify/feature_engineering/date_features.py +43 -0
  22. preplify-1.0.0/preplify/feature_engineering/interaction_features.py +33 -0
  23. preplify-1.0.0/preplify/feature_engineering/ratio_features.py +34 -0
  24. preplify-1.0.0/preplify/feature_selection/__init__.py +0 -0
  25. preplify-1.0.0/preplify/feature_selection/correlation_filter.py +29 -0
  26. preplify-1.0.0/preplify/feature_selection/importance_selector.py +37 -0
  27. preplify-1.0.0/preplify/feature_selection/variance_filter.py +34 -0
  28. preplify-1.0.0/preplify/missing/__init__.py +0 -0
  29. preplify-1.0.0/preplify/missing/handle_missing.py +66 -0
  30. preplify-1.0.0/preplify/missing/strategies.py +14 -0
  31. preplify-1.0.0/preplify/missing/validators.py +14 -0
  32. preplify-1.0.0/preplify/outliers/__init__.py +0 -0
  33. preplify-1.0.0/preplify/outliers/iqr_detector.py +30 -0
  34. preplify-1.0.0/preplify/outliers/outlier_removal.py +25 -0
  35. preplify-1.0.0/preplify/outliers/zscore_detector.py +33 -0
  36. preplify-1.0.0/preplify/profiling/__init__.py +0 -0
  37. preplify-1.0.0/preplify/profiling/data_report.py +50 -0
  38. preplify-1.0.0/preplify/profiling/summary_stats.py +21 -0
  39. preplify-1.0.0/preplify/recommender/__init__.py +0 -0
  40. preplify-1.0.0/preplify/recommender/preprocessing_recommender.py +78 -0
  41. preplify-1.0.0/preplify/reduction/__init__.py +0 -0
  42. preplify-1.0.0/preplify/reduction/pca.py +42 -0
  43. preplify-1.0.0/preplify/scaling/__init__.py +0 -0
  44. preplify-1.0.0/preplify/scaling/minmax_scaler.py +7 -0
  45. preplify-1.0.0/preplify/scaling/robust_scaler.py +7 -0
  46. preplify-1.0.0/preplify/scaling/standard_scaler.py +38 -0
  47. preplify-1.0.0/preplify/split/__init__.py +0 -0
  48. preplify-1.0.0/preplify/split/train_test_split.py +22 -0
  49. preplify-1.0.0/preplify/transformation/__init__.py +0 -0
  50. preplify-1.0.0/preplify/transformation/log_transform.py +34 -0
  51. preplify-1.0.0/preplify/transformation/power_transform.py +29 -0
  52. preplify-1.0.0/preplify/utils/__init__.py +0 -0
  53. preplify-1.0.0/preplify/utils/column_detector.py +12 -0
  54. preplify-1.0.0/preplify/utils/dataframe_validator.py +22 -0
  55. preplify-1.0.0/preplify/utils/helpers.py +14 -0
  56. preplify-1.0.0/preplify/utils/logging.py +24 -0
  57. preplify-1.0.0/preplify.egg-info/PKG-INFO +406 -0
  58. preplify-1.0.0/preplify.egg-info/SOURCES.txt +66 -0
  59. preplify-1.0.0/preplify.egg-info/dependency_links.txt +1 -0
  60. preplify-1.0.0/preplify.egg-info/requires.txt +3 -0
  61. preplify-1.0.0/preplify.egg-info/top_level.txt +1 -0
  62. preplify-1.0.0/pyproject.toml +3 -0
  63. preplify-1.0.0/setup.cfg +4 -0
  64. preplify-1.0.0/setup.py +26 -0
  65. preplify-1.0.0/tests/test_cleaning.py +21 -0
  66. preplify-1.0.0/tests/test_feature_engineering.py +20 -0
  67. preplify-1.0.0/tests/test_missing.py +23 -0
  68. preplify-1.0.0/tests/test_pipeline.py +21 -0
preplify-1.0.0/LICENSE ADDED
@@ -0,0 +1,5 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Muhammad Hussnain
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy...
@@ -0,0 +1,406 @@
1
+ Metadata-Version: 2.4
2
+ Name: preplify
3
+ Version: 1.0.0
4
+ Summary: Modular, professional Python library for tabular data preprocessing with auto ML-ready pipelines.
5
+ Home-page: https://github.com/yourusername/preplify
6
+ Author: Muhammad Hussnain
7
+ Author-email: muhammadhussnain1227@gmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Topic :: Software Development :: Libraries
12
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
13
+ Requires-Python: >=3.8
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE
16
+ Requires-Dist: pandas>=1.3.0
17
+ Requires-Dist: numpy>=1.21.0
18
+ Requires-Dist: scikit-learn>=1.0.0
19
+ Dynamic: author
20
+ Dynamic: author-email
21
+ Dynamic: classifier
22
+ Dynamic: description
23
+ Dynamic: description-content-type
24
+ Dynamic: home-page
25
+ Dynamic: license-file
26
+ Dynamic: requires-dist
27
+ Dynamic: requires-python
28
+ Dynamic: summary
29
+
30
+ <div align="center">
31
+
32
+ # 🚀 Preplify
33
+
34
+ ### The Last Preprocessing Library You'll Ever Need
35
+
36
+ **Preprocess any tabular dataset in one line — clean, encode, scale, engineer, and ML-ready.**
37
+
38
+ [![Python](https://img.shields.io/badge/Python-3.8%2B-blue?style=for-the-badge&logo=python)](https://python.org)
39
+ [![License](https://img.shields.io/badge/License-MIT-green?style=for-the-badge)](LICENSE)
40
+ [![Version](https://img.shields.io/badge/Version-1.0.0-orange?style=for-the-badge)](https://github.com/yourusername/preplify)
41
+ [![sklearn](https://img.shields.io/badge/sklearn-compatible-red?style=for-the-badge&logo=scikit-learn)](https://scikit-learn.org)
42
+
43
+ ```python
44
+ # From messy raw data to ML-ready in ONE line
45
+ df_clean = auto_prep(df)
46
+ ```
47
+
48
+ </div>
49
+
50
+ ---
51
+
52
+ ## 📌 What is Preplify?
53
+
54
+ **Preplify** is a modular, professional Python library for **tabular data preprocessing**. Whether you're a beginner who wants everything handled automatically, or an advanced user who wants full control over every step — Preplify has you covered.
55
+
56
+ > No more writing 100 lines of boilerplate preprocessing code. Preplify does it all.
57
+
58
+ ---
59
+
60
+ ## ✨ Features at a Glance
61
+
62
+ | Module | What it does |
63
+ |---|---|
64
+ | 🧹 **Cleaning** | Remove duplicates, empty rows, fix column names |
65
+ | ❓ **Missing Values** | Mean / Median / Mode / Drop / Constant strategies |
66
+ | 📊 **Outlier Removal** | IQR and Z-score based detection |
67
+ | 🔠 **Encoding** | One-Hot and Label encoding |
68
+ | 📏 **Scaling** | Standard, Min-Max, Robust scaling |
69
+ | 🔄 **Transformation** | Log and Power transforms |
70
+ | ⚙️ **Feature Engineering** | Interaction, Ratio, and DateTime features |
71
+ | 🎯 **Feature Selection** | Correlation filter, Variance filter, Importance selector |
72
+ | 📉 **Dimensionality Reduction** | PCA |
73
+ | 🔍 **Profiling** | Full data report + summary stats |
74
+ | 💡 **Recommender** | Auto-suggests best preprocessing steps |
75
+ | 🤖 **AutoML** | Preprocess + train baseline model in one call |
76
+ | 🔧 **Pipeline** | Custom modular pipeline with full control |
77
+
78
+ ---
79
+
80
+ ## ⚡ Installation
81
+
82
+ ```bash
83
+ pip install -e .
84
+ ```
85
+
86
+ **Requirements:**
87
+ ```bash
88
+ pip install pandas numpy scikit-learn
89
+ ```
90
+
91
+ > Python 3.8+ required
92
+
93
+ ---
94
+
95
+ ## 🚀 Quick Start
96
+
97
+ ### One-Line Preprocessing
98
+ ```python
99
+ import pandas as pd
100
+ from preplify import auto_prep
101
+
102
+ df = pd.read_csv("your_dataset.csv")
103
+ df_clean = auto_prep(df)
104
+ print(df_clean.shape)
105
+ ```
106
+
107
+ ### See What Your Data Needs First
108
+ ```python
109
+ from preplify import recommend_preprocessing, data_report
110
+
111
+ _ = recommend_preprocessing(df) # get smart suggestions
112
+ _ = data_report(df) # full data overview
113
+ ```
114
+
115
+ ### Train a Baseline Model Instantly
116
+ ```python
117
+ from preplify import automl_prep
118
+
119
+ X, model, score = automl_prep(df, target="Survived", task="classification")
120
+ print(f"Accuracy: {score:.4f}")
121
+ ```
122
+
123
+ ---
124
+
125
+ ## 🔧 Full API Reference
126
+
127
+ ### 📦 Load Demo Data
128
+ ```python
129
+ from preplify.datasets.demo_datasets import (
130
+ load_sample_classification, # fake classification dataset
131
+ load_sample_regression, # fake regression dataset
132
+ load_titanic # real Titanic CSV (needs internet)
133
+ )
134
+
135
+ df = load_sample_classification()
136
+ df = load_sample_regression()
137
+ df = load_titanic()
138
+ ```
139
+
140
+ ---
141
+
142
+ ### 🔍 Profiling
143
+ ```python
144
+ from preplify import data_report, recommend_preprocessing
145
+
146
+ _ = data_report(df) # shape, missing, dtypes, duplicates
147
+ _ = recommend_preprocessing(df) # smart preprocessing suggestions
148
+ ```
149
+
150
+ ---
151
+
152
+ ### 🧹 Cleaning
153
+ ```python
154
+ from preplify import remove_duplicates, remove_empty_rows, standardize_columns
155
+
156
+ df = standardize_columns(df) # "First Name" → "first_name"
157
+ df = remove_duplicates(df) # drop repeated rows
158
+ df = remove_empty_rows(df) # drop fully empty rows
159
+ ```
160
+
161
+ ---
162
+
163
+ ### ❓ Missing Values
164
+ ```python
165
+ from preplify import handle_missing
166
+
167
+ df = handle_missing(df, strategy="mean") # fill with average
168
+ df = handle_missing(df, strategy="median") # fill with middle value
169
+ df = handle_missing(df, strategy="mode") # fill with most frequent
170
+ df = handle_missing(df, strategy="drop") # drop rows with missing
171
+ df = handle_missing(df, strategy="constant", fill_value=0) # fill with 0
172
+ ```
173
+
174
+ ---
175
+
176
+ ### 📊 Outlier Removal
177
+ ```python
178
+ from preplify import remove_outliers, remove_outliers_iqr, remove_outliers_zscore
179
+
180
+ df = remove_outliers(df, method="iqr") # IQR method (recommended)
181
+ df = remove_outliers(df, method="zscore") # Z-score method
182
+ df = remove_outliers_iqr(df) # direct IQR
183
+ df = remove_outliers_zscore(df, threshold=3) # direct Z-score
184
+ ```
185
+
186
+ ---
187
+
188
+ ### 🔠 Encoding
189
+ ```python
190
+ from preplify import encode_features, onehot_encode, label_encode
191
+
192
+ df = encode_features(df, method="onehot") # "Male/Female" → 2 columns
193
+ df = encode_features(df, method="label") # "Male/Female" → 0/1
194
+ df = onehot_encode(df) # direct one-hot
195
+ df = label_encode(df) # direct label encoding
196
+ ```
197
+
198
+ ---
199
+
200
+ ### 📏 Scaling
201
+ ```python
202
+ from preplify import scale_features, minmax_scale, robust_scale
203
+
204
+ df = scale_features(df, method="standard") # mean=0, std=1
205
+ df = scale_features(df, method="minmax") # values between 0 and 1
206
+ df = scale_features(df, method="robust") # outlier-resistant scaling
207
+ df = minmax_scale(df) # shortcut minmax
208
+ df = robust_scale(df) # shortcut robust
209
+ ```
210
+
211
+ ---
212
+
213
+ ### 🔄 Transformation
214
+ ```python
215
+ from preplify import log_transform, power_transform
216
+
217
+ df = log_transform(df) # log1p — fix skewed data
218
+ df = power_transform(df, method="yeo-johnson") # works with negatives too
219
+ df = power_transform(df, method="box-cox") # positive values only
220
+ ```
221
+
222
+ ---
223
+
224
+ ### ⚙️ Feature Engineering
225
+ ```python
226
+ from preplify import (
227
+ auto_feature_engineering,
228
+ interaction_features,
229
+ ratio_features,
230
+ extract_date_features
231
+ )
232
+
233
+ df = auto_feature_engineering(df) # auto interaction + ratio
234
+ df = interaction_features(df) # age * income → age_x_income
235
+ df = ratio_features(df) # income / age → income_ratio_age
236
+ df = extract_date_features(df, date_columns=["date"]) # year, month, day, weekday, is_weekend
237
+ ```
238
+
239
+ ---
240
+
241
+ ### 🎯 Feature Selection
242
+ ```python
243
+ from preplify import correlation_filter, variance_filter, importance_selector
244
+
245
+ df = correlation_filter(df, threshold=0.9) # drop columns that are 90%+ similar
246
+ df = variance_filter(df, threshold=0.01) # drop near-constant columns
247
+ X = importance_selector(X, y, top_n=10) # keep top 10 by Random Forest importance
248
+ ```
249
+
250
+ ---
251
+
252
+ ### 📉 Dimensionality Reduction
253
+ ```python
254
+ from preplify import apply_pca
255
+
256
+ df = apply_pca(df, n_components=0.95) # keep 95% of variance
257
+ df = apply_pca(df, n_components=3) # reduce to exactly 3 components
258
+ ```
259
+
260
+ ---
261
+
262
+ ### ✂️ Train-Test Split
263
+ ```python
264
+ from preplify import split_dataset
265
+
266
+ X_train, X_test, y_train, y_test = split_dataset(X, y, test_size=0.2)
267
+ ```
268
+
269
+ ---
270
+
271
+ ### 🔧 Custom Pipeline
272
+ Full control over every single step:
273
+ ```python
274
+ from preplify import PreplifyPipeline
275
+
276
+ pipe = PreplifyPipeline(
277
+ missing_strategy="median", # mean / median / mode / drop / constant
278
+ encoding="onehot", # onehot / label
279
+ scaling="robust", # standard / minmax / robust
280
+ outlier_method="iqr", # iqr / zscore / None
281
+ feature_engineering=True # True / False
282
+ )
283
+
284
+ df_clean = pipe.fit_transform(df)
285
+ ```
286
+
287
+ ---
288
+
289
+ ### 🤖 AutoML — Preprocess + Train in One Call
290
+ ```python
291
+ from preplify import automl_prep
292
+
293
+ # Classification
294
+ X, model, score = automl_prep(df, target="Survived", task="classification")
295
+ print(f"Accuracy: {score:.4f}")
296
+
297
+ # Regression
298
+ X, model, score = automl_prep(df, target="Price", task="regression")
299
+ print(f"R² Score: {score:.4f}")
300
+ ```
301
+
302
+ | Task | Model Used | Score Metric |
303
+ |---|---|---|
304
+ | classification | Logistic Regression | Accuracy |
305
+ | regression | Ridge Regression | R² Score |
306
+
307
+ ---
308
+
309
+ ## 🌍 Real Dataset Example (Titanic)
310
+
311
+ ```python
312
+ import pandas as pd
313
+ from preplify import auto_prep, recommend_preprocessing
314
+ from sklearn.ensemble import RandomForestClassifier
315
+ from sklearn.model_selection import train_test_split
316
+ from sklearn.metrics import accuracy_score
317
+
318
+ # Load
319
+ df = pd.read_csv("titanic.csv")
320
+
321
+ # Explore
322
+ _ = recommend_preprocessing(df)
323
+
324
+ # Preprocess
325
+ target = df["Survived"]
326
+ df_clean = auto_prep(df.drop(columns=["Survived"]))
327
+
328
+ # Train Random Forest
329
+ X_train, X_test, y_train, y_test = train_test_split(df_clean, target, test_size=0.2, random_state=42)
330
+ model = RandomForestClassifier()
331
+ model.fit(X_train, y_train)
332
+ print(f"Accuracy: {accuracy_score(y_test, model.predict(X_test)):.4f}")
333
+ ```
334
+
335
+ ---
336
+
337
+ ## 🗂️ Project Structure
338
+
339
+ ```
340
+ preplify/
341
+ │
342
+ ├── preplify/
343
+ │   ├── core/                  # Pipeline, auto_prep, automl_prep
344
+ │   ├── cleaning/              # Duplicates, empty rows, column names
345
+ │   ├── missing/               # Missing value strategies
346
+ │   ├── outliers/              # IQR and Z-score detection
347
+ │   ├── encoding/              # One-hot and label encoding
348
+ │   ├── scaling/               # Standard, MinMax, Robust
349
+ │   ├── transformation/        # Log and Power transforms
350
+ │   ├── feature_engineering/   # Interaction, ratio, date features
351
+ │   ├── feature_selection/     # Correlation, variance, importance
352
+ │   ├── reduction/             # PCA
353
+ │   ├── split/                 # Train-test split
354
+ │   ├── profiling/             # Data report, summary stats
355
+ │   ├── recommender/           # Preprocessing recommendations
356
+ │   ├── datasets/              # Demo datasets
357
+ │   └── utils/                 # Validators, logging, helpers
358
+ │
359
+ ├── examples/                  # Ready-to-run example scripts
360
+ ├── tests/                     # Unit tests
361
+ ├── docs/                      # Documentation
362
+ ├── setup.py
363
+ └── README.md
364
+ ```
365
+
366
+ ---
367
+
368
+ ## 🔕 Disable Logs
369
+
370
+ By default Preplify logs every step. To turn off:
371
+
372
+ ```python
373
+ import logging
374
+ logging.getLogger("preplify").setLevel(logging.WARNING)
375
+ ```
376
+
377
+ ---
378
+
379
+ ## 🧪 Run Tests
380
+
381
+ ```bash
382
+ pip install pytest
383
+ pytest tests/
384
+ ```
385
+
386
+ ---
387
+
388
+ ## 📄 License
389
+
390
+ MIT License — free to use, modify, and distribute.
391
+
392
+ ---
393
+
394
+ ## 👨‍💻 Author
395
+
396
+ **Muhammad Hussnain**
397
+ - 📧 muhammadhussnain1227@gmail.com
398
+ - 🐙 [GitHub](https://github.com/yourusername/preplify)
399
+
400
+ ---
401
+
402
+ <div align="center">
403
+
404
+ **If Preplify saved you time, give it a ⭐ on GitHub!**
405
+
406
+ </div>