masterclean 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- masterclean-1.0.0/LICENSE +11 -0
- masterclean-1.0.0/PKG-INFO +16 -0
- masterclean-1.0.0/README.md +355 -0
- masterclean-1.0.0/masterclean/__init__.py +8 -0
- masterclean-1.0.0/masterclean/cleaner.py +54 -0
- masterclean-1.0.0/masterclean/cli.py +133 -0
- masterclean-1.0.0/masterclean/datatypes.py +78 -0
- masterclean-1.0.0/masterclean/exporter.py +5 -0
- masterclean-1.0.0/masterclean/profiler.py +103 -0
- masterclean-1.0.0/masterclean/reader.py +66 -0
- masterclean-1.0.0/masterclean/report.py +332 -0
- masterclean-1.0.0/masterclean/validator.py +103 -0
- masterclean-1.0.0/masterclean/visualizer.py +102 -0
- masterclean-1.0.0/masterclean.egg-info/PKG-INFO +16 -0
- masterclean-1.0.0/masterclean.egg-info/SOURCES.txt +27 -0
- masterclean-1.0.0/masterclean.egg-info/dependency_links.txt +1 -0
- masterclean-1.0.0/masterclean.egg-info/entry_points.txt +2 -0
- masterclean-1.0.0/masterclean.egg-info/requires.txt +9 -0
- masterclean-1.0.0/masterclean.egg-info/top_level.txt +1 -0
- masterclean-1.0.0/pyproject.toml +28 -0
- masterclean-1.0.0/setup.cfg +4 -0
- masterclean-1.0.0/tests/test_cleaner.py +18 -0
- masterclean-1.0.0/tests/test_datatypes.py +16 -0
- masterclean-1.0.0/tests/test_missing_values.py +17 -0
- masterclean-1.0.0/tests/test_profiler.py +18 -0
- masterclean-1.0.0/tests/test_reader.py +10 -0
- masterclean-1.0.0/tests/test_report.py +16 -0
- masterclean-1.0.0/tests/test_validator.py +16 -0
- masterclean-1.0.0/tests/test_visualizer.py +16 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Mohamed Faisal Maraicar N
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software.
|
|
10
|
+
|
|
11
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: masterclean
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Automated CSV cleaning toolkit
|
|
5
|
+
Author: Mohamed Faisal
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Dist: pandas
|
|
8
|
+
Requires-Dist: numpy
|
|
9
|
+
Requires-Dist: typer
|
|
10
|
+
Requires-Dist: chardet
|
|
11
|
+
Requires-Dist: pytest
|
|
12
|
+
Requires-Dist: openpyxl
|
|
13
|
+
Requires-Dist: matplotlib
|
|
14
|
+
Requires-Dist: plotly
|
|
15
|
+
Requires-Dist: rich
|
|
16
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
# 🚀 MasterClean
|
|
2
|
+
|
|
3
|
+

|
|
4
|
+
|
|
5
|
+

|
|
6
|
+
|
|
7
|
+

|
|
8
|
+
|
|
9
|
+
Automated Data Cleaning, Validation & Analytics Toolkit for Python.
|
|
10
|
+
|
|
11
|
+
MasterClean is a professional Python package that automates dataset cleaning, preprocessing, validation, profiling, visualization, and reporting using a single command.
|
|
12
|
+
|
|
13
|
+
It is designed for:
|
|
14
|
+
|
|
15
|
+
* Data Analysts
|
|
16
|
+
* Data Scientists
|
|
17
|
+
* ML Engineers
|
|
18
|
+
* Researchers
|
|
19
|
+
* Students
|
|
20
|
+
* Automation workflows
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
# ✨ Features
|
|
25
|
+
|
|
26
|
+
## Data Cleaning
|
|
27
|
+
|
|
28
|
+
* Automatic missing value handling
|
|
29
|
+
* Duplicate row removal
|
|
30
|
+
* Column standardization
|
|
31
|
+
* String cleanup
|
|
32
|
+
* Encoding-aware file loading
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Datatype Optimization
|
|
37
|
+
|
|
38
|
+
* Automatic numeric conversion
|
|
39
|
+
* Datetime detection
|
|
40
|
+
* Integer optimization
|
|
41
|
+
* Mixed datatype handling
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Validation Engine
|
|
46
|
+
|
|
47
|
+
* Negative value detection
|
|
48
|
+
* Outlier detection
|
|
49
|
+
* Invalid boolean detection
|
|
50
|
+
* Dataset quality warnings
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## Analytics & Profiling
|
|
55
|
+
|
|
56
|
+
* Automated dataset profiling
|
|
57
|
+
* Numeric statistics
|
|
58
|
+
* Categorical summaries
|
|
59
|
+
* Memory usage analysis
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## Visualization Engine
|
|
64
|
+
|
|
65
|
+
* Interactive Plotly dashboards
|
|
66
|
+
* Histograms
|
|
67
|
+
* Pie charts
|
|
68
|
+
* Boxplots
|
|
69
|
+
* Distribution analysis
|
|
70
|
+
* Category analytics
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## Reporting
|
|
75
|
+
|
|
76
|
+
* Unified HTML analytics dashboard
|
|
77
|
+
* Validation summaries
|
|
78
|
+
* Interactive charts
|
|
79
|
+
* Automated report generation
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## Developer Features
|
|
84
|
+
|
|
85
|
+
* Command Line Interface (CLI)
|
|
86
|
+
* Automated testing with pytest
|
|
87
|
+
* GitHub Actions CI/CD pipeline
|
|
88
|
+
* Modular package architecture
|
|
89
|
+
|
|
90
|
+
---
|
|
91
|
+
|
|
92
|
+
# 📦 Installation
|
|
93
|
+
|
|
94
|
+
## Clone Repository
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
git clone https://github.com/MohamedFaisal-11/masterclean.git
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
cd masterclean
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## Create Virtual Environment
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
python -m venv venv
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## Activate Environment
|
|
115
|
+
|
|
116
|
+
### macOS / Linux
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
source venv/bin/activate
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Windows
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
venv\Scripts\activate
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
---
|
|
129
|
+
|
|
130
|
+
## Install Package
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
pip install -e .
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
# 🚀 CLI Usage
|
|
139
|
+
|
|
140
|
+
## Clean Dataset
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
masterclean clean sample.csv
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
MasterClean automatically:
|
|
147
|
+
|
|
148
|
+
* Reads datasets
|
|
149
|
+
* Cleans missing values
|
|
150
|
+
* Removes duplicates
|
|
151
|
+
* Optimizes datatypes
|
|
152
|
+
* Detects validation issues
|
|
153
|
+
* Generates dashboards
|
|
154
|
+
* Exports cleaned data
|
|
155
|
+
* Creates HTML reports
|
|
156
|
+
|
|
157
|
+
---
|
|
158
|
+
|
|
159
|
+
## Show Version
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
masterclean version
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
# 🐍 Python Usage
|
|
168
|
+
|
|
169
|
+
```python
|
|
170
|
+
from masterclean import (
|
|
171
|
+
read_file,
|
|
172
|
+
clean_data,
|
|
173
|
+
optimize_dtypes,
|
|
174
|
+
validate_data,
|
|
175
|
+
generate_profile,
|
|
176
|
+
generate_charts,
|
|
177
|
+
generate_report,
|
|
178
|
+
export_data
|
|
179
|
+
)
|
|
180
|
+
|
|
181
|
+
df = read_file("sample.csv")
|
|
182
|
+
|
|
183
|
+
df = clean_data(df)
|
|
184
|
+
|
|
185
|
+
df = optimize_dtypes(df)
|
|
186
|
+
|
|
187
|
+
warnings = validate_data(df)
|
|
188
|
+
|
|
189
|
+
profile = generate_profile(df)
|
|
190
|
+
|
|
191
|
+
charts = generate_charts(df)
|
|
192
|
+
|
|
193
|
+
generate_report(
|
|
194
|
+
df=df,
|
|
195
|
+
warnings=warnings,
|
|
196
|
+
profile=profile,
|
|
197
|
+
charts=charts
|
|
198
|
+
)
|
|
199
|
+
|
|
200
|
+
export_data(df)
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
# 📊 Example Validation Output
|
|
206
|
+
|
|
207
|
+
```text
|
|
208
|
+
VALIDATION WARNINGS
|
|
209
|
+
========================================
|
|
210
|
+
⚠ Negative values found in 'age' (1 rows)
|
|
211
|
+
|
|
212
|
+
⚠ Possible outliers detected in 'salary' (1 rows)
|
|
213
|
+
|
|
214
|
+
⚠ Invalid boolean-like values found in 'active': {'maybe'}
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
# 📈 Dashboard Features
|
|
220
|
+
|
|
221
|
+
MasterClean generates a unified interactive HTML dashboard containing:
|
|
222
|
+
|
|
223
|
+
* Dataset summaries
|
|
224
|
+
* Validation warnings
|
|
225
|
+
* Profiling statistics
|
|
226
|
+
* Pie charts
|
|
227
|
+
* Histograms
|
|
228
|
+
* Boxplots
|
|
229
|
+
* Category analytics
|
|
230
|
+
* Interactive Plotly visualizations
|
|
231
|
+
|
|
232
|
+
---
|
|
233
|
+
|
|
234
|
+
# 🏗 Architecture
|
|
235
|
+
|
|
236
|
+
```text
|
|
237
|
+
Read
|
|
238
|
+
↓
|
|
239
|
+
Clean
|
|
240
|
+
↓
|
|
241
|
+
Optimize
|
|
242
|
+
↓
|
|
243
|
+
Validate
|
|
244
|
+
↓
|
|
245
|
+
Profile
|
|
246
|
+
↓
|
|
247
|
+
Visualize
|
|
248
|
+
↓
|
|
249
|
+
Report
|
|
250
|
+
↓
|
|
251
|
+
Export
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
---
|
|
255
|
+
|
|
256
|
+
# 📂 Project Structure
|
|
257
|
+
|
|
258
|
+
```text
|
|
259
|
+
masterclean/
|
|
260
|
+
│
|
|
261
|
+
├── cleaner.py
|
|
262
|
+
├── validator.py
|
|
263
|
+
├── datatypes.py
|
|
264
|
+
├── profiler.py
|
|
265
|
+
├── visualizer.py
|
|
266
|
+
├── report.py
|
|
267
|
+
├── exporter.py
|
|
268
|
+
├── reader.py
|
|
269
|
+
├── cli.py
|
|
270
|
+
│
|
|
271
|
+
tests/
|
|
272
|
+
│
|
|
273
|
+
├── test_cleaner.py
|
|
274
|
+
├── test_validator.py
|
|
275
|
+
├── test_reader.py
|
|
276
|
+
├── test_report.py
|
|
277
|
+
├── test_visualizer.py
|
|
278
|
+
│
|
|
279
|
+
.github/workflows/
|
|
280
|
+
│
|
|
281
|
+
└── tests.yml
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
---
|
|
285
|
+
|
|
286
|
+
# 🧪 Testing
|
|
287
|
+
|
|
288
|
+
Run tests using:
|
|
289
|
+
|
|
290
|
+
```bash
|
|
291
|
+
python -m pytest
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
Current Status:
|
|
295
|
+
|
|
296
|
+
✅ Automated tests passing
|
|
297
|
+
✅ GitHub Actions CI/CD passing
|
|
298
|
+
|
|
299
|
+
---
|
|
300
|
+
|
|
301
|
+
# 🔄 CI/CD
|
|
302
|
+
|
|
303
|
+
MasterClean uses GitHub Actions for:
|
|
304
|
+
|
|
305
|
+
* automated testing
|
|
306
|
+
* dependency validation
|
|
307
|
+
* continuous integration
|
|
308
|
+
|
|
309
|
+
---
|
|
310
|
+
|
|
311
|
+
# 🛣 Roadmap
|
|
312
|
+
|
|
313
|
+
Future improvements planned:
|
|
314
|
+
|
|
315
|
+
* Advanced schema validation
|
|
316
|
+
* Large dataset optimization
|
|
317
|
+
* Plugin architecture
|
|
318
|
+
* AI-powered cleaning suggestions
|
|
319
|
+
* Cloud deployment support
|
|
320
|
+
* Streamlit dashboard integration
|
|
321
|
+
|
|
322
|
+
---
|
|
323
|
+
|
|
324
|
+
# Dashboard Preview
|
|
325
|
+
|
|
326
|
+

|
|
327
|
+
|
|
328
|
+
---
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
# 🤝 Contributing
|
|
332
|
+
|
|
333
|
+
Contributions are welcome.
|
|
334
|
+
|
|
335
|
+
You can:
|
|
336
|
+
|
|
337
|
+
* report bugs
|
|
338
|
+
* suggest features
|
|
339
|
+
* improve documentation
|
|
340
|
+
* submit pull requests
|
|
341
|
+
|
|
342
|
+
---
|
|
343
|
+
|
|
344
|
+
# 📄 License
|
|
345
|
+
|
|
346
|
+
MIT License
|
|
347
|
+
|
|
348
|
+
---
|
|
349
|
+
|
|
350
|
+
# 👨‍💻 Author
|
|
351
|
+
|
|
352
|
+
Mohamed Faisal Maraicar N
|
|
353
|
+
|
|
354
|
+
GitHub:
|
|
355
|
+
https://github.com/MohamedFaisal-11/masterclean
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
from .reader import read_file
|
|
2
|
+
from .cleaner import clean_data
|
|
3
|
+
from .datatypes import optimize_dtypes
|
|
4
|
+
from .validator import validate_data
|
|
5
|
+
from .report import generate_report
|
|
6
|
+
from .exporter import export_data
|
|
7
|
+
from .profiler import generate_profile
|
|
8
|
+
from .visualizer import generate_charts
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
def clean_data(df):
    """Return a cleaned copy of ``df``.

    Steps, in order:

    1. Drop fully duplicated rows.
    2. Fill missing values: ``object`` columns with their most frequent
       value, every other column with its median.
    3. Normalise string column labels (strip, lower-case, spaces -> ``_``).
       Non-string labels are left untouched.
    4. Convert the values of ``object`` columns to stripped strings,
       keeping cells that are still missing as missing.

    Args:
        df: Input ``pandas.DataFrame``. It is copied, never modified.

    Returns:
        A new ``DataFrame`` with the steps above applied.
    """
    df = df.copy()

    # Remove duplicates
    df = df.drop_duplicates()

    # Fill missing values (best effort: a column that cannot be filled is
    # left as-is instead of aborting the whole clean-up)
    for col in df.columns:
        column = df[col]

        try:
            if column.dtype == "object":
                modes = column.mode()
                if modes.empty:
                    # Column holds no non-missing value: nothing to fill with
                    continue
                fill_value = modes.iloc[0]
            else:
                fill_value = column.median()

            df[col] = column.fillna(fill_value)
        except (TypeError, ValueError):
            # e.g. unhashable cells for mode(), or a dtype without a median
            continue

    # Standardize column names; only real strings are rewritten so integer
    # or other non-string labels neither crash nor turn into NaN
    df = df.rename(
        columns=lambda label: (
            label.strip().lower().replace(" ", "_")
            if isinstance(label, str)
            else label
        )
    )

    # Clean string columns
    for col in df.select_dtypes(include="object").columns:
        column = df[col]
        stripped = column.astype(str).str.strip()
        # Keep missing cells missing rather than the literal "nan"/"None"
        df[col] = column.where(column.isna(), stripped)

    print("✅ Data cleaned successfully")

    return df
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
import typer
|
|
2
|
+
|
|
3
|
+
from masterclean import (
|
|
4
|
+
read_file,
|
|
5
|
+
clean_data,
|
|
6
|
+
export_data,
|
|
7
|
+
optimize_dtypes,
|
|
8
|
+
generate_report,
|
|
9
|
+
validate_data,
|
|
10
|
+
generate_profile,
|
|
11
|
+
generate_charts
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
app = typer.Typer()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def process_file(
    file_path: str,
    output: str = "cleaned_data.csv",
    report: bool = True,
    skip_validation: bool = False,
):
    """Run the whole MasterClean pipeline on one input file.

    The stages run in a fixed order: read, clean, optimize dtypes,
    profile, charts, validation, report, export.

    Args:
        file_path: Path handed to ``read_file``.
        output: Destination handed to ``export_data``.
        report: When true, ``generate_report`` is called.
        skip_validation: When true, ``validate_data`` is not called and the
            report receives an empty warning list.
    """
    # Read -> clean -> optimize, each stage feeding the next
    df = optimize_dtypes(clean_data(read_file(file_path)))

    # Profile and charts are always built, even if no report is requested
    profile = generate_profile(df)
    charts = generate_charts(df)

    # Validation is optional
    warnings = [] if skip_validation else validate_data(df)

    # HTML report
    if report:
        generate_report(
            df=df,
            warnings=warnings,
            profile=profile,
            charts=charts,
        )

    # Export cleaned data
    export_data(df, output)

    print("🎉 Cleaning completed successfully")
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@app.command()
def clean(
    file_path: str,
    output: str = "cleaned_data.csv",
    report: bool = True,
    skip_validation: bool = False,
):
    """
    Clean and analyze CSV or Excel datasets.
    """
    # Thin CLI wrapper: the options are forwarded unchanged to process_file.
    # The docstring above is left as-is because it is the command's help text.
    process_file(
        file_path=file_path,
        output=output,
        report=report,
        skip_validation=skip_validation,
    )
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
@app.command()
def version():
    """
    Show current MasterClean version.
    """
    # Imported locally (and aliased) because this command is itself named
    # ``version``.
    from importlib.metadata import PackageNotFoundError
    from importlib.metadata import version as installed_version

    try:
        # Report the version of the installed distribution so the output
        # cannot drift from the package metadata again.
        current = installed_version("masterclean")
    except PackageNotFoundError:
        # Not installed (e.g. run from a source checkout): fall back to the
        # version this release was published as.
        current = "1.0.0"

    print(f"MasterClean v{current}")
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def main():
    """Invoke the Typer ``app``, which dispatches to the registered commands."""
    app()


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def optimize_dtypes(df):
    """Return a copy of ``df`` with ``object`` columns converted where possible.

    For every ``object`` column:

    * if its label contains "date", "time" or "year" (case-insensitive), it
      is parsed with ``pd.to_datetime``; the result is kept when more than
      50% of the rows parse;
    * otherwise (or if date parsing was not kept) it is parsed with
      ``pd.to_numeric``; the result is kept when more than 70% of the rows
      parse.

    Values that fail to parse in a kept conversion become missing
    (``errors="coerce"``). Afterwards, float columns whose non-missing
    values are all whole numbers are cast to the nullable ``Int64`` dtype.

    Args:
        df: Input ``pandas.DataFrame``. It is copied, never modified.

    Returns:
        A new ``DataFrame`` with the converted columns.
    """
    df = df.copy()

    date_keywords = ("date", "time", "year")
    # Failures a conversion attempt may raise; the column is then left alone
    conversion_errors = (TypeError, ValueError, OverflowError)
    row_count = len(df)

    for col in df.columns:

        # Process only object columns (numeric and other dtypes are skipped)
        if df[col].dtype != "object":
            continue

        # -----------------------------------
        # Intelligent Date Detection
        # -----------------------------------

        # str() so that non-string labels (e.g. integer column names) do
        # not raise AttributeError on .lower()
        label = str(col).lower()

        if any(keyword in label for keyword in date_keywords):
            try:
                converted = pd.to_datetime(df[col], errors="coerce")
            except conversion_errors:
                converted = None

            if (
                converted is not None
                and converted.notna().sum() > row_count * 0.5
            ):
                df[col] = converted
                continue

        # -----------------------------------
        # Numeric Conversion
        # -----------------------------------

        try:
            converted = pd.to_numeric(df[col], errors="coerce")
        except conversion_errors:
            continue

        if converted.notna().sum() > row_count * 0.7:
            df[col] = converted

    # Convert float columns into Int64 if possible
    for col in df.select_dtypes(include=["float"]).columns:
        try:
            if (df[col].dropna() % 1 == 0).all():
                df[col] = df[col].astype("Int64")
        except conversion_errors:
            # e.g. values that cannot be represented as Int64: keep floats
            continue

    print("✅ Datatypes optimized")

    return df
|