masterclean 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Mohamed Faisal Maraicar N
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software.
10
+
11
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.4
2
+ Name: masterclean
3
+ Version: 1.0.0
4
+ Summary: Automated CSV cleaning toolkit
5
+ Author: Mohamed Faisal
6
+ License-File: LICENSE
7
+ Requires-Dist: pandas
8
+ Requires-Dist: numpy
9
+ Requires-Dist: typer
10
+ Requires-Dist: chardet
11
+ Requires-Dist: pytest
12
+ Requires-Dist: openpyxl
13
+ Requires-Dist: matplotlib
14
+ Requires-Dist: plotly
15
+ Requires-Dist: rich
16
+ Dynamic: license-file
@@ -0,0 +1,355 @@
1
+ # 🚀 MasterClean
2
+
3
+ ![Python](https://img.shields.io/badge/python-3.10-blue)
4
+
5
+ ![Tests](https://github.com/MohamedFaisal-11/masterclean/actions/workflows/tests.yml/badge.svg)
6
+
7
+ ![License](https://img.shields.io/badge/license-MIT-green)
8
+
9
+ Automated Data Cleaning, Validation & Analytics Toolkit for Python.
10
+
11
+ MasterClean is a professional Python package that automates dataset cleaning, preprocessing, validation, profiling, visualization, and reporting using a single command.
12
+
13
+ It is designed for:
14
+
15
+ * Data Analysts
16
+ * Data Scientists
17
+ * ML Engineers
18
+ * Researchers
19
+ * Students
20
+ * Automation workflows
21
+
22
+ ---
23
+
24
+ # ✨ Features
25
+
26
+ ## Data Cleaning
27
+
28
+ * Automatic missing value handling
29
+ * Duplicate row removal
30
+ * Column standardization
31
+ * String cleanup
32
+ * Encoding-aware file loading
33
+
34
+ ---
35
+
36
+ ## Datatype Optimization
37
+
38
+ * Automatic numeric conversion
39
+ * Datetime detection
40
+ * Integer optimization
41
+ * Mixed datatype handling
42
+
43
+ ---
44
+
45
+ ## Validation Engine
46
+
47
+ * Negative value detection
48
+ * Outlier detection
49
+ * Invalid boolean detection
50
+ * Dataset quality warnings
51
+
52
+ ---
53
+
54
+ ## Analytics & Profiling
55
+
56
+ * Automated dataset profiling
57
+ * Numeric statistics
58
+ * Categorical summaries
59
+ * Memory usage analysis
60
+
61
+ ---
62
+
63
+ ## Visualization Engine
64
+
65
+ * Interactive Plotly dashboards
66
+ * Histograms
67
+ * Pie charts
68
+ * Boxplots
69
+ * Distribution analysis
70
+ * Category analytics
71
+
72
+ ---
73
+
74
+ ## Reporting
75
+
76
+ * Unified HTML analytics dashboard
77
+ * Validation summaries
78
+ * Interactive charts
79
+ * Automated report generation
80
+
81
+ ---
82
+
83
+ ## Developer Features
84
+
85
+ * Command Line Interface (CLI)
86
+ * Automated testing with pytest
87
+ * GitHub Actions CI/CD pipeline
88
+ * Modular package architecture
89
+
90
+ ---
91
+
92
+ # 📦 Installation
93
+
94
+ ## Clone Repository
95
+
96
+ ```bash
97
+ git clone https://github.com/MohamedFaisal-11/masterclean.git
98
+ ```
99
+
100
+ ```bash
101
+ cd masterclean
102
+ ```
103
+
104
+ ---
105
+
106
+ ## Create Virtual Environment
107
+
108
+ ```bash
109
+ python -m venv venv
110
+ ```
111
+
112
+ ---
113
+
114
+ ## Activate Environment
115
+
116
+ ### macOS / Linux
117
+
118
+ ```bash
119
+ source venv/bin/activate
120
+ ```
121
+
122
+ ### Windows
123
+
124
+ ```bash
125
+ venv\Scripts\activate
126
+ ```
127
+
128
+ ---
129
+
130
+ ## Install Package
131
+
132
+ ```bash
133
+ pip install -e .
134
+ ```
135
+
136
+ ---
137
+
138
+ # 🚀 CLI Usage
139
+
140
+ ## Clean Dataset
141
+
142
+ ```bash
143
+ masterclean clean sample.csv
144
+ ```
145
+
146
+ MasterClean automatically:
147
+
148
+ * Reads datasets
149
+ * Cleans missing values
150
+ * Removes duplicates
151
+ * Optimizes datatypes
152
+ * Detects validation issues
153
+ * Generates dashboards
154
+ * Exports cleaned data
155
+ * Creates HTML reports
156
+
157
+ ---
158
+
159
+ ## Show Version
160
+
161
+ ```bash
162
+ masterclean version
163
+ ```
164
+
165
+ ---
166
+
167
+ # 🐍 Python Usage
168
+
169
+ ```python
170
+ from masterclean import (
171
+ read_file,
172
+ clean_data,
173
+ optimize_dtypes,
174
+ validate_data,
175
+ generate_profile,
176
+ generate_charts,
177
+ generate_report,
178
+ export_data
179
+ )
180
+
181
+ df = read_file("sample.csv")
182
+
183
+ df = clean_data(df)
184
+
185
+ df = optimize_dtypes(df)
186
+
187
+ warnings = validate_data(df)
188
+
189
+ profile = generate_profile(df)
190
+
191
+ charts = generate_charts(df)
192
+
193
+ generate_report(
194
+ df=df,
195
+ warnings=warnings,
196
+ profile=profile,
197
+ charts=charts
198
+ )
199
+
200
+ export_data(df)
201
+ ```
202
+
203
+ ---
204
+
205
+ # 📊 Example Validation Output
206
+
207
+ ```text
208
+ VALIDATION WARNINGS
209
+ ========================================
210
+ ⚠ Negative values found in 'age' (1 rows)
211
+
212
+ ⚠ Possible outliers detected in 'salary' (1 rows)
213
+
214
+ ⚠ Invalid boolean-like values found in 'active': {'maybe'}
215
+ ```
216
+
217
+ ---
218
+
219
+ # 📈 Dashboard Features
220
+
221
+ MasterClean generates a unified interactive HTML dashboard containing:
222
+
223
+ * Dataset summaries
224
+ * Validation warnings
225
+ * Profiling statistics
226
+ * Pie charts
227
+ * Histograms
228
+ * Boxplots
229
+ * Category analytics
230
+ * Interactive Plotly visualizations
231
+
232
+ ---
233
+
234
+ # 🏗 Architecture
235
+
236
+ ```text
237
+ Read
238
+
239
+ Clean
240
+
241
+ Optimize
242
+
243
+ Validate
244
+
245
+ Profile
246
+
247
+ Visualize
248
+
249
+ Report
250
+
251
+ Export
252
+ ```
253
+
254
+ ---
255
+
256
+ # 📂 Project Structure
257
+
258
+ ```text
259
+ masterclean/
260
+
261
+ ├── cleaner.py
262
+ ├── validator.py
263
+ ├── datatypes.py
264
+ ├── profiler.py
265
+ ├── visualizer.py
266
+ ├── report.py
267
+ ├── exporter.py
268
+ ├── reader.py
269
+ ├── cli.py
270
+
271
+ tests/
272
+
273
+ ├── test_cleaner.py
274
+ ├── test_validator.py
275
+ ├── test_reader.py
276
+ ├── test_report.py
277
+ ├── test_visualizer.py
278
+
279
+ .github/workflows/
280
+
281
+ └── tests.yml
282
+ ```
283
+
284
+ ---
285
+
286
+ # 🧪 Testing
287
+
288
+ Run tests using:
289
+
290
+ ```bash
291
+ python -m pytest
292
+ ```
293
+
294
+ Current Status:
295
+
296
+ * ✅ Automated tests passing
297
+ * ✅ GitHub Actions CI/CD passing
298
+
299
+ ---
300
+
301
+ # 🔄 CI/CD
302
+
303
+ MasterClean uses GitHub Actions for:
304
+
305
+ * automated testing
306
+ * dependency validation
307
+ * continuous integration
308
+
309
+ ---
310
+
311
+ # 🛣 Roadmap
312
+
313
+ Future improvements planned:
314
+
315
+ * Advanced schema validation
316
+ * Large dataset optimization
317
+ * Plugin architecture
318
+ * AI-powered cleaning suggestions
319
+ * Cloud deployment support
320
+ * Streamlit dashboard integration
321
+
322
+ ---
323
+
324
+ # Dashboard Preview
325
+
326
+ ![Dashboard](assets/dashboard.png)
327
+
328
+ ---
329
+
330
+
331
+ # 🤝 Contributing
332
+
333
+ Contributions are welcome.
334
+
335
+ You can:
336
+
337
+ * report bugs
338
+ * suggest features
339
+ * improve documentation
340
+ * submit pull requests
341
+
342
+ ---
343
+
344
+ # 📄 License
345
+
346
+ MIT License
347
+
348
+ ---
349
+
350
+ # 👨‍💻 Author
351
+
352
+ Mohamed Faisal Maraicar N
353
+
354
+ GitHub:
355
+ https://github.com/MohamedFaisal-11/masterclean
@@ -0,0 +1,8 @@
1
+ from .reader import read_file
2
+ from .cleaner import clean_data
3
+ from .datatypes import optimize_dtypes
4
+ from .validator import validate_data
5
+ from .report import generate_report
6
+ from .exporter import export_data
7
+ from .profiler import generate_profile
8
+ from .visualizer import generate_charts
@@ -0,0 +1,54 @@
1
def clean_data(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Steps, in order:

    1. Drop fully duplicated rows.
    2. Fill missing values: text columns (object or string dtype) get their
       most frequent value, every other column gets its median. A column
       for which no fill value can be computed (for example a column with
       no non-missing values) is left untouched.
    3. Standardize column labels: converted to ``str``, stripped,
       lower-cased, spaces replaced by underscores.
    4. In text columns, convert every non-missing value to ``str`` and strip
       surrounding whitespace. Missing values stay missing instead of being
       turned into the literal strings ``"nan"`` / ``"None"``.

    Args:
        df: pandas DataFrame to clean.

    Returns:
        A new, cleaned DataFrame.
    """
    # Local import: this module has no top-level pandas import.
    from pandas.api.types import is_object_dtype, is_string_dtype

    def _is_text(series):
        # Covers classic object columns as well as dedicated string dtypes.
        return is_object_dtype(series) or is_string_dtype(series)

    # Independent copy so later column assignments never touch the caller's
    # frame.
    df = df.drop_duplicates().copy()

    # Fill missing values
    for col in df.columns:
        series = df[col]

        # Nothing to fill; also avoids computing a statistic needlessly.
        if not series.isna().any():
            continue

        if _is_text(series):
            try:
                modes = series.mode()
            except TypeError:
                # Unhashable cell values (e.g. lists) cannot be counted.
                continue
            if modes.empty:
                # No non-missing value exists to fill with.
                continue
            fill_value = modes.iloc[0]
        else:
            try:
                fill_value = series.median()
            except (TypeError, ValueError):
                # Dtype has no meaningful median; leave the column as is.
                continue

        df[col] = series.fillna(fill_value)

    # Standardize column names. Plain ``str`` methods are used instead of the
    # ``.str`` accessor so non-string labels (e.g. integer headers) work too.
    df.columns = [
        str(label).strip().lower().replace(" ", "_")
        for label in df.columns
    ]

    # Clean string columns
    for col in df.columns:
        series = df[col]
        if not _is_text(series):
            continue
        stripped = series.astype(str).str.strip()
        # Keep the original missing markers where the cell was missing.
        df[col] = stripped.where(series.notna(), series)

    print("✅ Data cleaned successfully")

    return df
@@ -0,0 +1,133 @@
1
+ import typer
2
+
3
+ from masterclean import (
4
+ read_file,
5
+ clean_data,
6
+ export_data,
7
+ optimize_dtypes,
8
+ generate_report,
9
+ validate_data,
10
+ generate_profile,
11
+ generate_charts
12
+ )
13
+
14
+ app = typer.Typer()
15
+
16
+
17
def process_file(
    file_path: str,
    output: str = "cleaned_data.csv",
    report: bool = True,
    skip_validation: bool = False
):
    """Run the whole pipeline on one input file.

    Order of operations: read, clean, optimize datatypes, profile, build
    charts, validate (unless ``skip_validation``), write the report (only
    when ``report`` is true), export.

    Args:
        file_path: Passed to ``read_file``.
        output: Passed to ``export_data`` as the destination.
        report: When false, ``generate_report`` is not called.
        skip_validation: When true, ``validate_data`` is not called and an
            empty warning list is handed to the report.
    """
    # Read -> clean -> optimize datatypes
    frame = optimize_dtypes(clean_data(read_file(file_path)))

    # Profile and charts are computed regardless of the ``report`` flag.
    summary = generate_profile(frame)
    figures = generate_charts(frame)

    # Validation is optional; the report receives [] when it is skipped.
    issues = [] if skip_validation else validate_data(frame)

    if report:
        generate_report(
            df=frame,
            warnings=issues,
            profile=summary,
            charts=figures
        )

    export_data(frame, output)

    print("🎉 Cleaning completed successfully")
89
+
90
+
91
@app.command()
def clean(
    file_path: str,
    output: str = "cleaned_data.csv",
    report: bool = True,
    skip_validation: bool = False
):
    """
    Clean and analyze CSV or Excel datasets.
    """
    # The docstring above is left untouched: Typer shows it as the command's
    # help text. All work is delegated to process_file.
    process_file(
        file_path=file_path,
        output=output,
        report=report,
        skip_validation=skip_validation,
    )
114
+
115
+
116
@app.command()
def version():
    """
    Show current MasterClean version.
    """
    # Aliased on import so it does not clash with this command's own name.
    from importlib.metadata import PackageNotFoundError
    from importlib.metadata import version as _distribution_version

    try:
        # Ask the installed distribution, so the output always matches the
        # released package instead of a hard-coded string that can go stale.
        installed = _distribution_version("masterclean")
    except PackageNotFoundError:
        # Running from a source tree that was never installed.
        installed = "1.0.0"

    print(f"MasterClean v{installed}")
124
+
125
+
126
def main():
    """Invoke the Typer application object ``app``."""
    app()


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,78 @@
1
+ import pandas as pd
2
+
3
+
4
def optimize_dtypes(df, *, date_threshold=0.5, numeric_threshold=0.7):
    """Return a copy of ``df`` with text columns converted to richer dtypes.

    For every non-numeric text column (object or string dtype):

    * If the column label contains "date", "time" or "year"
      (case-insensitive), the column is parsed with ``pd.to_datetime``. The
      parsed result replaces the column when more than ``date_threshold`` of
      all rows parsed successfully.
    * Otherwise, or when the date attempt was rejected, the column is parsed
      with ``pd.to_numeric``. The result replaces the column when more than
      ``numeric_threshold`` of all rows parsed successfully. Values that did
      not parse become missing.

    Finally every float column whose non-missing values are all whole numbers
    is converted to the nullable ``Int64`` dtype.

    Args:
        df: pandas DataFrame to optimize; it is not modified.
        date_threshold: Fraction of rows (exclusive lower bound) that must
            parse as datetimes for the conversion to be accepted.
        numeric_threshold: Fraction of rows (exclusive lower bound) that must
            parse as numbers for the conversion to be accepted.

    Returns:
        A new DataFrame with the accepted conversions applied.
    """
    date_keywords = ("date", "time", "year")

    df = df.copy()
    total_rows = len(df)

    for col in df.columns:
        series = df[col]

        # Skip already numeric columns
        if pd.api.types.is_numeric_dtype(series):
            continue

        # Process only text columns (object or dedicated string dtype)
        is_text = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if not is_text:
            continue

        # -----------------------------------
        # Intelligent Date Detection
        # -----------------------------------

        # str() so non-string labels (e.g. integer headers) do not crash.
        label = str(col).lower()

        if any(keyword in label for keyword in date_keywords):
            try:
                as_dates = pd.to_datetime(series, errors="coerce")
            except (ValueError, TypeError, OverflowError):
                # Best effort: fall through to the numeric attempt.
                as_dates = None

            if (
                as_dates is not None
                and as_dates.notna().sum() > total_rows * date_threshold
            ):
                df[col] = as_dates
                continue

        # -----------------------------------
        # Numeric Conversion
        # -----------------------------------

        try:
            as_numbers = pd.to_numeric(series, errors="coerce")
        except (ValueError, TypeError):
            # Values pandas cannot even attempt to parse; keep the column.
            continue

        if as_numbers.notna().sum() > total_rows * numeric_threshold:
            df[col] = as_numbers

    # Convert float columns into Int64 if possible
    for col in df.select_dtypes(include=["float"]).columns:
        try:
            if (df[col].dropna() % 1 == 0).all():
                df[col] = df[col].astype("Int64")
        except (TypeError, ValueError, OverflowError):
            # Value cannot be represented as Int64; keep the float column.
            continue

    print("✅ Datatypes optimized")

    return df
@@ -0,0 +1,5 @@
1
def export_data(df, output_file="cleaned_data.csv"):
    """Write ``df`` to ``output_file`` as CSV without the index column.

    Args:
        df: DataFrame to write.
        output_file: Destination handed to ``DataFrame.to_csv``.
    """
    destination = output_file
    df.to_csv(path_or_buf=destination, index=False)

    message = "✅ Cleaned CSV exported as {}".format(destination)
    print(message)