masterclean 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- masterclean/__init__.py +8 -0
- masterclean/cleaner.py +54 -0
- masterclean/cli.py +133 -0
- masterclean/datatypes.py +78 -0
- masterclean/exporter.py +5 -0
- masterclean/profiler.py +103 -0
- masterclean/reader.py +66 -0
- masterclean/report.py +332 -0
- masterclean/validator.py +103 -0
- masterclean/visualizer.py +102 -0
- masterclean-1.0.0.dist-info/METADATA +16 -0
- masterclean-1.0.0.dist-info/RECORD +16 -0
- masterclean-1.0.0.dist-info/WHEEL +5 -0
- masterclean-1.0.0.dist-info/entry_points.txt +2 -0
- masterclean-1.0.0.dist-info/licenses/LICENSE +11 -0
- masterclean-1.0.0.dist-info/top_level.txt +1 -0
masterclean/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
from .reader import read_file
|
|
2
|
+
from .cleaner import clean_data
|
|
3
|
+
from .datatypes import optimize_dtypes
|
|
4
|
+
from .validator import validate_data
|
|
5
|
+
from .report import generate_report
|
|
6
|
+
from .exporter import export_data
|
|
7
|
+
from .profiler import generate_profile
|
|
8
|
+
from .visualizer import generate_charts
|
masterclean/cleaner.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
def clean_data(df):
    """Return a cleaned copy of *df*; the input frame is not modified.

    Steps, in order:

    1. Drop fully duplicated rows.
    2. Fill missing values: object columns get their most frequent value,
       every other column gets its median. A column is left untouched when
       no fill value can be computed.
    3. Normalise column labels (stripped, lower-cased, spaces -> ``_``).
    4. Convert the values of object columns to stripped text.

    Args:
        df: pandas DataFrame to clean.

    Returns:
        A new, cleaned DataFrame.
    """
    # .copy() detaches the result from the filtered frame so the column
    # assignments below never write through to (or warn about) the input.
    df = df.drop_duplicates().copy()

    # Fill missing values
    for col in df.columns:
        column = df[col]
        try:
            if column.dtype == "object":
                modes = column.mode()
                # mode() is empty when the column holds no values at all.
                if not modes.empty:
                    df[col] = column.fillna(modes.iloc[0])
            else:
                df[col] = column.fillna(column.median())
        except (TypeError, ValueError, NotImplementedError):
            # Best effort: a dtype without a usable mode/median is skipped.
            continue

    # Standardize column names. str() keeps this working for non-string
    # labels (for example integer headers), where the .str accessor fails.
    df.columns = [
        str(label).strip().lower().replace(" ", "_")
        for label in df.columns
    ]

    # Clean string columns. Missing entries are kept as missing instead of
    # being turned into the literal text "nan" / "None" by astype(str).
    for col in df.select_dtypes(include="object"):
        values = df[col]
        df[col] = values.where(values.isna(), values.astype(str).str.strip())

    print("✅ Data cleaned successfully")

    return df
|
masterclean/cli.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
import typer
|
|
2
|
+
|
|
3
|
+
from masterclean import (
|
|
4
|
+
read_file,
|
|
5
|
+
clean_data,
|
|
6
|
+
export_data,
|
|
7
|
+
optimize_dtypes,
|
|
8
|
+
generate_report,
|
|
9
|
+
validate_data,
|
|
10
|
+
generate_profile,
|
|
11
|
+
generate_charts
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
app = typer.Typer()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def process_file(
    file_path: str,
    output: str = "cleaned_data.csv",
    report: bool = True,
    skip_validation: bool = False
):
    """Run the full pipeline on one file: read, clean, optimise, report, export.

    Args:
        file_path: Path of the dataset to load.
        output: Destination passed to ``export_data`` for the cleaned data.
        report: When true, build the profile and charts and write the HTML
            report.
        skip_validation: When true, ``validate_data`` is not run and the
            report receives an empty warning list.

    Raises:
        SystemExit: With status 1 when the input file could not be loaded.
    """
    # -----------------------------------
    # Read File
    # -----------------------------------

    df = read_file(file_path)

    # read_file can hand back None after printing its own error message;
    # stop here instead of failing later with an unrelated AttributeError.
    if df is None:
        raise SystemExit(1)

    # -----------------------------------
    # Clean Data / Optimize Datatypes
    # -----------------------------------

    df = clean_data(df)
    df = optimize_dtypes(df)

    # -----------------------------------
    # Profile and Interactive Charts
    # -----------------------------------

    # Both are only consumed by the report, so skip the work when it is off.
    profile = generate_profile(df) if report else None
    charts = generate_charts(df) if report else ""

    # -----------------------------------
    # Validation
    # -----------------------------------

    warnings = [] if skip_validation else validate_data(df)

    # -----------------------------------
    # Generate HTML Report
    # -----------------------------------

    if report:
        generate_report(
            df=df,
            warnings=warnings,
            profile=profile,
            charts=charts
        )

    # -----------------------------------
    # Export Cleaned Data
    # -----------------------------------

    export_data(df, output)

    print("🎉 Cleaning completed successfully")
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@app.command()
def clean(
    file_path: str,
    output: str = "cleaned_data.csv",
    report: bool = True,
    skip_validation: bool = False
):
    """
    Clean and analyze CSV or Excel datasets.
    """
    # CLI entry point: forward the parsed options unchanged to the pipeline.
    process_file(
        file_path=file_path,
        output=output,
        report=report,
        skip_validation=skip_validation,
    )
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
@app.command()
def version():
    """
    Show current MasterClean version.
    """
    # Imported under another name because this command itself is `version`.
    from importlib.metadata import PackageNotFoundError
    from importlib.metadata import version as installed_version

    # Ask the installed distribution for its version rather than printing a
    # hard-coded string that drifts from the released package metadata.
    try:
        current = installed_version("masterclean")
    except PackageNotFoundError:
        # Running from a source tree that was never installed.
        current = "unknown"

    print(f"MasterClean v{current}")
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def main():
    """Invoke the Typer application object."""
    app()


# Allow running the module directly as a script.
if __name__ == "__main__":
    main()
|
masterclean/datatypes.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def optimize_dtypes(df, date_threshold=0.5, numeric_threshold=0.7):
    """Return a copy of *df* with object columns converted to better dtypes.

    For every object column:

    * if its label contains "date", "time" or "year", it is parsed as
      datetimes and the result is kept when more than ``date_threshold`` of
      the rows parsed;
    * otherwise (or when the date attempt was rejected) it is parsed as
      numbers and kept when more than ``numeric_threshold`` of the rows
      parsed.

    Values that fail to parse in an accepted conversion become missing.
    Finally, float columns whose non-missing values are all whole numbers
    are converted to the nullable ``Int64`` dtype.

    Args:
        df: pandas DataFrame to optimise; it is not modified.
        date_threshold: Minimum fraction of rows (exclusive) that must parse
            as datetimes for the conversion to be accepted.
        numeric_threshold: Minimum fraction of rows (exclusive) that must
            parse as numbers for the conversion to be accepted.

    Returns:
        A new DataFrame with the converted columns.
    """
    df = df.copy()

    date_keywords = ("date", "time", "year")
    row_count = len(df)

    for col in df.columns:

        # Numeric columns need no parsing; only object columns are processed.
        if pd.api.types.is_numeric_dtype(df[col]):
            continue
        if df[col].dtype != "object":
            continue

        # -----------------------------------
        # Intelligent Date Detection
        # -----------------------------------

        # str() so that non-string labels (e.g. integers) do not crash here.
        label = str(col).lower()

        if any(keyword in label for keyword in date_keywords):
            try:
                as_dates = pd.to_datetime(df[col], errors="coerce")
            except (TypeError, ValueError, OverflowError):
                as_dates = None

            if (
                as_dates is not None
                and as_dates.notna().sum() > row_count * date_threshold
            ):
                df[col] = as_dates
                continue

        # -----------------------------------
        # Numeric Conversion
        # -----------------------------------

        try:
            as_numbers = pd.to_numeric(df[col], errors="coerce")
        except (TypeError, ValueError):
            continue

        if as_numbers.notna().sum() > row_count * numeric_threshold:
            df[col] = as_numbers

    # Convert float columns into Int64 if possible
    for col in df.select_dtypes(include=['float']):
        present = df[col].dropna()

        # An all-missing column carries no evidence of being integral, so it
        # is left alone (an empty .all() would otherwise be vacuously true).
        if present.empty:
            continue

        try:
            if (present % 1 == 0).all():
                df[col] = df[col].astype("Int64")
        except (TypeError, ValueError, OverflowError):
            # Values that cannot be represented as Int64 stay float.
            continue

    print("✅ Datatypes optimized")

    return df
|
masterclean/exporter.py
ADDED
masterclean/profiler.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def _rounded_stat(value, digits=2):
    """Round a scalar statistic to *digits* places as a plain Python number."""
    if pd.isna(value):
        return float("nan")
    # NumPy scalars expose .item(); unwrap them to builtin int/float.
    if hasattr(value, "item"):
        value = value.item()
    return round(value, digits)


def generate_profile(df):
    """Build a summary dictionary describing *df*.

    Args:
        df: pandas DataFrame to profile.

    Returns:
        A dict with the keys ``rows``, ``columns``, ``duplicate_rows``,
        ``memory_usage_mb`` and ``column_profiles``. ``column_profiles`` maps
        each column label to a dict holding ``dtype``, ``missing_values`` and
        ``unique_values``, plus ``mean``/``median``/``min``/``max``/``std``
        for numeric columns or ``top_value``/``top_frequency`` for all other
        columns. Counts and statistics are builtin Python numbers; a
        statistic that is missing is reported as NaN.
    """
    # -----------------------------------
    # Dataset Summary
    # -----------------------------------

    profile = {
        "rows": df.shape[0],
        "columns": df.shape[1],
        # int()/float() keep NumPy scalars out of the result, matching the
        # per-column counts below.
        "duplicate_rows": int(df.duplicated().sum()),
        "memory_usage_mb": float(
            round(df.memory_usage(deep=True).sum() / 1024 / 1024, 2)
        ),
    }

    # -----------------------------------
    # Column Profiles
    # -----------------------------------

    column_profiles = {}

    for col in df.columns:
        series = df[col]

        column_data = {
            "dtype": str(series.dtype),
            "missing_values": int(series.isnull().sum()),
            "unique_values": int(series.nunique()),
        }

        if pd.api.types.is_numeric_dtype(series):
            # Numeric Statistics
            for stat_name in ("mean", "median", "min", "max", "std"):
                column_data[stat_name] = _rounded_stat(
                    getattr(series, stat_name)()
                )
        else:
            # Categorical Statistics
            try:
                modes = series.mode()
                top_value = modes.iloc[0]
                top_frequency = int((series == top_value).sum())
            except (IndexError, TypeError, ValueError):
                # No mode available (e.g. the column holds no values).
                column_data["top_value"] = "N/A"
                column_data["top_frequency"] = 0
            else:
                column_data["top_value"] = str(top_value)
                column_data["top_frequency"] = top_frequency

        column_profiles[col] = column_data

    profile["column_profiles"] = column_profiles

    print("✅ Data profile generated")

    return profile
|
masterclean/reader.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import chardet
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def detect_encoding(file_path, sample_size=100000):
    """Guess the text encoding of *file_path* from its leading bytes.

    Args:
        file_path: Path of the file to inspect.
        sample_size: Number of bytes read from the start of the file and
            handed to chardet.

    Returns:
        The encoding name reported by chardet, or ``"utf-8"`` when chardet
        reports no encoding or reports plain ASCII.
    """
    with open(file_path, "rb") as file:
        sample = file.read(sample_size)

    guess = chardet.detect(sample)["encoding"]

    # Only the head of the file is sampled, so an "ascii" verdict says
    # nothing about later bytes. UTF-8 decodes every ASCII file identically
    # and also copes with non-ASCII text further down; it is likewise the
    # fallback when no guess is available (e.g. an empty file).
    if guess is None or guess.lower() == "ascii":
        return "utf-8"

    return guess
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _read_csv_with_fallback(file_path, encoding):
    """Read a CSV with *encoding*, retrying with UTF-8 and then Latin-1."""
    decode_error = None

    for candidate in (encoding, "utf-8", "latin-1"):
        try:
            return pd.read_csv(file_path, encoding=candidate)
        except (UnicodeDecodeError, LookupError) as error:
            # Wrong guess, or a codec name Python does not know: try the next.
            decode_error = error

    raise decode_error


def read_file(file_path):
    """Load a CSV or Excel file into a DataFrame.

    The format is chosen from the file extension: ``.csv`` is read with the
    detected encoding (falling back to UTF-8, then Latin-1, if decoding
    fails); ``.xlsx`` and ``.xls`` are read as Excel workbooks.

    Args:
        file_path: Path of the file to load.

    Returns:
        The loaded DataFrame, or ``None`` when the file could not be read or
        has an unsupported extension. In that case the error is printed
        rather than raised, so callers must check for ``None``.
    """
    try:
        file_extension = Path(file_path).suffix.lower()

        # -----------------------------------
        # CSV Support
        # -----------------------------------

        if file_extension == ".csv":
            encoding = detect_encoding(file_path)
            print(f"📄 Detected Encoding: {encoding}")

            df = _read_csv_with_fallback(file_path, encoding)
            print("✅ CSV loaded successfully")
            return df

        # -----------------------------------
        # Excel Support
        # -----------------------------------

        if file_extension in (".xlsx", ".xls"):
            df = pd.read_excel(file_path)
            print("✅ Excel file loaded successfully")
            return df

        # -----------------------------------
        # Unsupported Files
        # -----------------------------------

        raise ValueError(
            "Unsupported file format. Use CSV or Excel files."
        )

    except Exception as e:
        # Boundary handler: report the problem and signal failure with None.
        print(f"❌ Error reading file: {e}")
        return None
|
masterclean/report.py
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
# Stylesheet embedded in the report's <head>.
_REPORT_STYLE = """
body {
    font-family: Arial;
    margin: 40px;
    background-color: #111827;
    color: white;
}

h1 {
    color: #00E5FF;
}

h2 {
    color: #A5F3FC;
    margin-top: 40px;
}

table {
    border-collapse: collapse;
    width: 95%;
    margin-bottom: 30px;
    background-color: #1F2937;
}

th, td {
    border: 1px solid #374151;
    padding: 12px;
    text-align: left;
}

th {
    background-color: #06B6D4;
    color: white;
}

tr:nth-child(even) {
    background-color: #111827;
}

.warning {
    color: #FCA5A5;
    font-weight: bold;
}

.section {
    margin-bottom: 50px;
}
"""


def _html_table(headers, rows):
    """Render *headers* and *rows* as an HTML table, escaping every cell."""
    from html import escape

    def render_row(cells, tag):
        rendered = "".join(
            f"<{tag}>{escape(str(cell))}</{tag}>" for cell in cells
        )
        return f"<tr>{rendered}</tr>"

    lines = ["<table>", render_row(headers, "th")]
    lines.extend(render_row(row, "td") for row in rows)
    lines.append("</table>")
    return "\n".join(lines)


def _html_section(title, content, title_class=None):
    """Wrap already-rendered *content* in a titled ``section`` div."""
    heading = f'<h2 class="{title_class}">' if title_class else "<h2>"
    return f'<div class="section">\n{heading}{title}</h2>\n{content}\n</div>'


def generate_report(
    df,
    warnings=None,
    profile=None,
    charts="",
    output_file="report.html"
):
    """Write an HTML dashboard describing *df* to *output_file*.

    Args:
        df: pandas DataFrame the report describes.
        warnings: Optional iterable of warning messages; rendered as a table
            when non-empty.
        profile: Optional dict as produced by the profiler. When given, the
            duplicate-row count, memory usage and the per-column numeric and
            categorical statistics are included.
        charts: Optional pre-rendered HTML fragment; it is inserted verbatim
            (not escaped), so it must come from a trusted producer.
        output_file: Path of the HTML file to write.

    All text taken from the data (column labels, statistic values, warning
    messages) is HTML-escaped before being written.
    """
    column_profiles = profile.get("column_profiles", {}) if profile else {}

    # -----------------------------------
    # Dataset Summary
    # -----------------------------------

    summary_rows = [("Rows", df.shape[0]), ("Columns", df.shape[1])]

    if profile:
        summary_rows.append(
            ("Duplicate Rows", profile.get("duplicate_rows", 0))
        )
        summary_rows.append(
            ("Memory Usage (MB)", profile.get("memory_usage_mb", 0))
        )

    sections = [
        _html_section(
            "Dataset Summary",
            _html_table(("Metric", "Value"), summary_rows),
        ),
        _html_section(
            "Column Types",
            _html_table(("Column", "Datatype"), df.dtypes.items()),
        ),
        _html_section(
            "Missing Values",
            _html_table(
                ("Column", "Missing Values"),
                df.isnull().sum().items(),
            ),
        ),
    ]

    # -----------------------------------
    # Numeric / Categorical Statistics
    # -----------------------------------

    if profile:
        numeric_keys = ("mean", "median", "min", "max", "std")
        numeric_rows = [
            (col, *(stats.get(key, "N/A") for key in numeric_keys))
            for col, stats in column_profiles.items()
            if "mean" in stats
        ]
        sections.append(_html_section(
            "Numeric Statistics",
            _html_table(
                ("Column", "Mean", "Median", "Min", "Max", "Std"),
                numeric_rows,
            ),
        ))

        categorical_keys = ("top_value", "top_frequency", "unique_values")
        categorical_rows = [
            (col, *(stats.get(key, "N/A") for key in categorical_keys))
            for col, stats in column_profiles.items()
            if "top_value" in stats
        ]
        sections.append(_html_section(
            "Categorical Statistics",
            _html_table(
                ("Column", "Top Value", "Frequency", "Unique Values"),
                categorical_rows,
            ),
        ))

    # -----------------------------------
    # Validation Warnings
    # -----------------------------------

    if warnings:
        sections.append(_html_section(
            "⚠ Validation Warnings",
            _html_table(("Warning",), ((message,) for message in warnings)),
            title_class="warning",
        ))

    # -----------------------------------
    # Interactive Charts
    # -----------------------------------

    if charts:
        sections.append(
            _html_section("📊 Interactive Visual Analytics", charts)
        )

    document = "\n".join([
        "<html>",
        "<head>",
        # Declared so browsers decode the emoji and any non-ASCII data.
        '<meta charset="utf-8">',
        "<title>MasterClean Report</title>",
        f"<style>{_REPORT_STYLE}</style>",
        "</head>",
        "<body>",
        "<h1>🚀 MasterClean Analytics Dashboard</h1>",
        *sections,
        "</body>",
        "</html>",
    ])

    # Explicit UTF-8: the platform default encoding may be unable to encode
    # the emoji in the headings.
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(document)

    print(f"✅ Unified dashboard generated as {output_file}")
|
masterclean/validator.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
def validate_data(df):
    """Run heuristic data-quality checks on *df* and report the findings.

    Checks, in order:

    * negative values in int64 / float64 / Int64 columns;
    * values outside 1.5 x IQR of the quartiles in those same columns;
    * object columns that contain some of "true"/"false"/"yes"/"no"
      (case-insensitive) mixed with other values.

    The findings are also printed to the console.

    Args:
        df: pandas DataFrame to check.

    Returns:
        A list of warning strings; empty when nothing was found.
    """
    warnings = []

    numeric_columns = list(
        df.select_dtypes(include=["int64", "float64", "Int64"]).columns
    )

    # -----------------------------------
    # Negative Values
    # -----------------------------------

    for col in numeric_columns:
        try:
            negative_count = int((df[col] < 0).sum())
        except (TypeError, ValueError):
            # Best effort: a column that cannot be compared is skipped.
            continue

        if negative_count > 0:
            warnings.append(
                f"⚠ Negative values found in '{col}' ({negative_count} rows)"
            )

    # -----------------------------------
    # Outlier Detection
    # -----------------------------------

    for col in numeric_columns:
        try:
            q1 = df[col].quantile(0.25)
            q3 = df[col].quantile(0.75)
            iqr = q3 - q1

            outside = (
                (df[col] < q1 - 1.5 * iqr)
                | (df[col] > q3 + 1.5 * iqr)
            )
            # Count on the mask directly; no filtered copy of the frame.
            outlier_count = int(outside.sum())
        except (TypeError, ValueError):
            continue

        if outlier_count > 0:
            warnings.append(
                f"⚠ Possible outliers detected in '{col}' ({outlier_count} rows)"
            )

    # -----------------------------------
    # Boolean Validation
    # -----------------------------------

    valid_boolean_values = {"true", "false", "yes", "no"}

    for col in df.select_dtypes(include="object"):
        unique_values = set(
            df[col].dropna().astype(str).str.lower().unique()
        )
        invalid_values = unique_values - valid_boolean_values

        # Flag only columns that look boolean (share a value with the valid
        # set) yet also carry something else.
        if unique_values & valid_boolean_values and invalid_values:
            # sorted() makes the message deterministic across runs.
            warnings.append(
                f"⚠ Invalid boolean-like values found in '{col}': "
                f"{sorted(invalid_values)}"
            )

    # -----------------------------------
    # Display Warnings
    # -----------------------------------

    if warnings:
        print("\nVALIDATION WARNINGS")
        print("=" * 40)

        for warning in warnings:
            print(warning)
    else:
        print("✅ No major validation issues found")

    return warnings
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import plotly.express as px
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def generate_charts(df):
    """Render interactive Plotly charts for *df* as one HTML fragment.

    int64 / float64 / Int64 columns get a histogram and a box plot; object
    columns get a bar chart and a pie chart of their ten most frequent
    values. A column whose charts cannot be built is skipped with a console
    message.

    Args:
        df: pandas DataFrame to visualise.

    Returns:
        The concatenated HTML fragments (an empty string when no chart was
        produced). plotly.js is referenced from the CDN by the first fragment
        only.
    """
    fragments = []

    def embed(fig):
        # The first chart emitted, whatever its kind, must pull in plotly.js;
        # otherwise a dataset without numeric columns would produce charts
        # that have no library to render them. Later charts reuse it.
        include_js = "cdn" if not fragments else False
        fragments.append(
            fig.to_html(full_html=False, include_plotlyjs=include_js)
        )

    # -----------------------------------
    # Numeric Charts
    # -----------------------------------

    numeric_columns = df.select_dtypes(
        include=["int64", "float64", "Int64"]
    ).columns

    for col in numeric_columns:
        try:
            # Histogram
            embed(px.histogram(
                df,
                x=col,
                title=f"{col} Distribution",
                template="plotly_dark"
            ))

            # Boxplot
            embed(px.box(
                df,
                y=col,
                title=f"{col} Boxplot",
                template="plotly_dark"
            ))
        except Exception as error:
            # Best effort: one bad column must not sink the whole report.
            print(f"⚠ Skipped charts for '{col}': {error}")

    # -----------------------------------
    # Categorical Charts
    # -----------------------------------

    categorical_columns = df.select_dtypes(
        include="object"
    ).columns

    for col in categorical_columns:
        try:
            value_counts = df[col].value_counts().head(10)

            # Bar Chart
            embed(px.bar(
                x=value_counts.index,
                y=value_counts.values,
                title=f"{col} Top Categories",
                labels={
                    "x": col,
                    "y": "Count"
                },
                template="plotly_dark"
            ))

            # Pie Chart
            embed(px.pie(
                values=value_counts.values,
                names=value_counts.index,
                title=f"{col} Distribution",
                template="plotly_dark"
            ))
        except Exception as error:
            print(f"⚠ Skipped charts for '{col}': {error}")

    print("✅ Unified interactive charts generated")

    return "".join(fragments)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: masterclean
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Automated CSV cleaning toolkit
|
|
5
|
+
Author: Mohamed Faisal
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Dist: pandas
|
|
8
|
+
Requires-Dist: numpy
|
|
9
|
+
Requires-Dist: typer
|
|
10
|
+
Requires-Dist: chardet
|
|
11
|
+
Requires-Dist: pytest
|
|
12
|
+
Requires-Dist: openpyxl
|
|
13
|
+
Requires-Dist: matplotlib
|
|
14
|
+
Requires-Dist: plotly
|
|
15
|
+
Requires-Dist: rich
|
|
16
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
masterclean/__init__.py,sha256=DA2aftVukkKpam_I6jtFvTP52y8e6fkK_pTyj-yj0mM,286
|
|
2
|
+
masterclean/cleaner.py,sha256=bZHXg1gb2o-SdkOExOAjxJFSiGuF4io6XZB_0cV1L6s,968
|
|
3
|
+
masterclean/cli.py,sha256=oO-bRIaCpCRfBY1lg2gZbFKLkRxjzf-MeHWBRw78Z3Q,2206
|
|
4
|
+
masterclean/datatypes.py,sha256=GagqNIJDJSlZgieMbWz74EY5RxT9UNQBV_zetehrKww,1734
|
|
5
|
+
masterclean/exporter.py,sha256=frLLVxOYSMv1jBDPl5Gnwqn4jUEVvouM1PCsqcaFebA,150
|
|
6
|
+
masterclean/profiler.py,sha256=XV_S5uS7fKBGhvnyvfKR7P4aI6wAjcYjuBWdKZxiyP4,2241
|
|
7
|
+
masterclean/reader.py,sha256=dY1izmAnXHAiIlAh_TV4onKgyQdndDznoPwi2y7wKpU,1392
|
|
8
|
+
masterclean/report.py,sha256=rBrGPTg3RxQy6GwCSzY-pOKKbiNwnHg2P3UH6GF7l9Y,6241
|
|
9
|
+
masterclean/validator.py,sha256=JsRJB5fG3yUbK25CR04Dy1MPRyzT5i4346lc2YtvjKE,2311
|
|
10
|
+
masterclean/visualizer.py,sha256=gg3czfSrG8MEchnCrOSMuopMZEGKP_q9LOQ-fJAUiqg,2303
|
|
11
|
+
masterclean-1.0.0.dist-info/licenses/LICENSE,sha256=l92G-jngF3KP4hAIeGW5VMKLRJpwjI7rogm_786SVLs,453
|
|
12
|
+
masterclean-1.0.0.dist-info/METADATA,sha256=ojJDFvrlmMUdUC6EigZnHVlRipfiZeppXaZKiV44NxA,363
|
|
13
|
+
masterclean-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
14
|
+
masterclean-1.0.0.dist-info/entry_points.txt,sha256=VpZvrE-sBOw2weiy0E_0m8oO-2-vAeHqLNMV1jww7Is,53
|
|
15
|
+
masterclean-1.0.0.dist-info/top_level.txt,sha256=HGlOlIyGokt9u34mtY_noOUBqygDWx9149XfNoWnYDw,12
|
|
16
|
+
masterclean-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Mohamed Faisal Maraicar N
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software.
|
|
10
|
+
|
|
11
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
masterclean
|