data-autoeda 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_autoeda-0.1.1/PKG-INFO +60 -0
- data_autoeda-0.1.1/README.md +42 -0
- data_autoeda-0.1.1/autoeda/__init__.py +3 -0
- data_autoeda-0.1.1/autoeda/analyzer.py +288 -0
- data_autoeda-0.1.1/autoeda/cleaner.py +104 -0
- data_autoeda-0.1.1/autoeda/cli.py +94 -0
- data_autoeda-0.1.1/autoeda/insights.py +52 -0
- data_autoeda-0.1.1/autoeda/utils.py +17 -0
- data_autoeda-0.1.1/autoeda/validator.py +33 -0
- data_autoeda-0.1.1/autoeda/visualizer.py +125 -0
- data_autoeda-0.1.1/data_autoeda.egg-info/PKG-INFO +60 -0
- data_autoeda-0.1.1/data_autoeda.egg-info/SOURCES.txt +16 -0
- data_autoeda-0.1.1/data_autoeda.egg-info/dependency_links.txt +1 -0
- data_autoeda-0.1.1/data_autoeda.egg-info/entry_points.txt +2 -0
- data_autoeda-0.1.1/data_autoeda.egg-info/requires.txt +7 -0
- data_autoeda-0.1.1/data_autoeda.egg-info/top_level.txt +1 -0
- data_autoeda-0.1.1/pyproject.toml +29 -0
- data_autoeda-0.1.1/setup.cfg +4 -0
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: data-autoeda
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Automatic Exploratory Data Analysis, Cleaning, Validation, Visualization, and Smart Insights on ANY CSV dataset.
|
|
5
|
+
Author: shubham kumar
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Shu40/AutoEDA/
|
|
8
|
+
Project-URL: Repository, https://github.com/Shu40/AutoEDA/
|
|
9
|
+
Requires-Python: >=3.8
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: pandas>=1.3.0
|
|
12
|
+
Requires-Dist: numpy>=1.20.0
|
|
13
|
+
Requires-Dist: matplotlib>=3.4.0
|
|
14
|
+
Requires-Dist: seaborn>=0.11.0
|
|
15
|
+
Requires-Dist: scipy>=1.7.0
|
|
16
|
+
Requires-Dist: click>=8.0.0
|
|
17
|
+
Requires-Dist: rich>=10.0.0
|
|
18
|
+
|
|
19
|
+
# AutoEDA
|
|
20
|
+
|
|
21
|
+
A production-ready Python package that performs automatic Exploratory Data Analysis (EDA), Data Cleaning, Data Validation, Visualization, and Smart Insights on ANY CSV dataset.
|
|
22
|
+
|
|
23
|
+
## Features
|
|
24
|
+
- **Dynamic Analysis**: Automatically detects numerical, categorical, boolean, and datetime columns.
|
|
25
|
+
- **Smart Cleaning**: Handles missing values, removes duplicates, and optimizes datatypes automatically.
|
|
26
|
+
- **Visualizations**: Automatically generates relevant charts using Matplotlib and Seaborn.
|
|
27
|
+
- **Smart Insights**: Generates English observations based on statistical findings.
|
|
28
|
+
- **Rich Terminal UI**: Beautiful, organized CLI reports.
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install .
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Usage
|
|
37
|
+
|
|
38
|
+
### Python API
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from autoeda import analyze
|
|
42
|
+
|
|
43
|
+
# Complete analysis
|
|
44
|
+
analyze("data.csv")
|
|
45
|
+
|
|
46
|
+
# Only clean data
|
|
47
|
+
# analyze("data.csv", clean=True)
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### CLI
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
# Complete analysis
|
|
54
|
+
autoeda data.csv
|
|
55
|
+
|
|
56
|
+
# Specific modes
|
|
57
|
+
autoeda data.csv --clean
|
|
58
|
+
autoeda data.csv --visualize
|
|
59
|
+
autoeda data.csv --all
|
|
60
|
+
```
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# AutoEDA
|
|
2
|
+
|
|
3
|
+
A production-ready Python package that performs automatic Exploratory Data Analysis (EDA), Data Cleaning, Data Validation, Visualization, and Smart Insights on ANY CSV dataset.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
- **Dynamic Analysis**: Automatically detects numerical, categorical, boolean, and datetime columns.
|
|
7
|
+
- **Smart Cleaning**: Handles missing values, removes duplicates, and optimizes datatypes automatically.
|
|
8
|
+
- **Visualizations**: Automatically generates relevant charts using Matplotlib and Seaborn.
|
|
9
|
+
- **Smart Insights**: Generates English observations based on statistical findings.
|
|
10
|
+
- **Rich Terminal UI**: Beautiful, organized CLI reports.
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install .
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Usage
|
|
19
|
+
|
|
20
|
+
### Python API
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
from autoeda import analyze
|
|
24
|
+
|
|
25
|
+
# Complete analysis
|
|
26
|
+
analyze("data.csv")
|
|
27
|
+
|
|
28
|
+
# Only clean data
|
|
29
|
+
# analyze("data.csv", clean=True)
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### CLI
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
# Complete analysis
|
|
36
|
+
autoeda data.csv
|
|
37
|
+
|
|
38
|
+
# Specific modes
|
|
39
|
+
autoeda data.csv --clean
|
|
40
|
+
autoeda data.csv --visualize
|
|
41
|
+
autoeda data.csv --all
|
|
42
|
+
```
|
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
from rich.table import Table
|
|
4
|
+
from rich.progress import track
|
|
5
|
+
from .utils import console, print_section, print_warning
|
|
6
|
+
|
|
7
|
+
def run_analysis(df: pd.DataFrame, smart_mode: bool = False):
    """Run every analysis module against ``df`` in a fixed order.

    Each module prints its own report section. If a module raises, the error
    is printed in red and the remaining modules still run.

    Args:
        df: Dataset to analyse.
        smart_mode: Forwarded to the modules that accept it (unique values,
            value counts, correlation).
    """
    # (module, display name, whether the module takes ``smart_mode``)
    pipeline = [
        (_dataset_overview, "Dataset Overview", False),
        (_column_analysis, "Column Analysis", False),
        (_null_analysis, "Null Value Analysis", False),
        (_duplicate_analysis, "Duplicate Analysis", False),
        (_unique_value_analysis, "Unique Value Analysis", True),
        (_value_counts_analysis, "Value Counts Analysis", True),
        (_numerical_analysis, "Numerical Analysis", False),
        (_outlier_analysis, "Outlier Analysis", False),
        (_correlation_analysis, "Correlation Analysis", True),
        (_datetime_analysis, "Datetime Analysis", False),
    ]

    if smart_mode:
        console.print("[yellow]Running analysis in Smart Mode... Some heavy computations may be optimized or skipped.[/yellow]")

    for module, label, takes_smart_mode in track(pipeline, description="Running Analysis Modules..."):
        call_args = (df, smart_mode) if takes_smart_mode else (df,)
        try:
            module(*call_args)
        except Exception as e:
            # Best effort: one failing section must not abort the whole report.
            console.print(f"[red]Error in {label}: {e}[/red]")
|
|
32
|
+
|
|
33
|
+
def _dataset_overview(df: pd.DataFrame):
    """Print shape, column names, deep memory usage and the first/last 5 rows."""
    print_section("Dataset Overview")

    n_rows, n_cols = df.shape

    overview = Table(show_header=True, header_style="bold magenta")
    overview.add_column("Metric", style="dim", width=20)
    overview.add_column("Value")

    overview.add_row("Dataset Shape", f"{n_rows} Rows, {n_cols} Columns")
    overview.add_row("Total Rows", str(n_rows))
    overview.add_row("Total Columns", str(n_cols))
    column_names = df.columns.astype(str).tolist()
    overview.add_row("Column Names", ", ".join(column_names))

    # deep=True also counts the payload of object columns.
    bytes_used = df.memory_usage(deep=True).sum()
    memory_mb = bytes_used / (1024 * 1024)
    overview.add_row("Memory Usage", f"{memory_mb:.2f} MB")

    console.print(overview)

    for heading, take in (("First 5 Rows", df.head), ("Last 5 Rows", df.tail)):
        print_section(heading)
        console.print(take(5).to_string())
|
|
54
|
+
|
|
55
|
+
def _column_analysis(df: pd.DataFrame):
    """Print columns grouped by dtype family and a per-column summary table.

    The table lists dtype, non-null count, null count, null percentage and
    number of distinct values for every column.

    Args:
        df: Dataset to describe.
    """
    print_section("Column Analysis")

    dtype_groups = (
        ("Numerical", ['number']),
        ("Categorical", ['object', 'category']),
        ("Boolean", ['bool']),
        ("Datetime", ['datetime']),
    )
    for label, dtypes in dtype_groups:
        names = df.select_dtypes(include=dtypes).columns.tolist()
        console.print(f"[bold]{label} Columns:[/bold] {len(names)} -> {names}")

    table = Table(show_header=True, header_style="bold magenta")
    for heading in (
        "Column Name",
        "Data Type",
        "Total Values",
        "Null Values",
        "Null %",
        "Unique Values",
    ):
        table.add_column(heading)

    total_rows = len(df)
    for col in df.columns:
        column = df[col]
        null_count = column.isnull().sum()
        # A frame with no rows (e.g. after dropping every row that had a
        # null) would otherwise divide 0 by 0 and report "nan%".
        null_pct = (null_count / total_rows) * 100 if total_rows else 0.0
        table.add_row(
            str(col),
            str(column.dtype),
            str(total_rows - null_count),
            str(null_count),
            f"{null_pct:.2f}%",
            str(column.nunique()),
        )
    console.print(table)
|
|
89
|
+
|
|
90
|
+
def _null_analysis(df: pd.DataFrame):
    """Print the total null count and, when non-zero, a per-column breakdown."""
    print_section("Null Value Analysis")
    total_nulls = df.isnull().sum().sum()
    console.print(f"[bold]Total Null Values:[/bold] {total_nulls}")

    if not total_nulls > 0:
        return

    breakdown = Table(show_header=True, header_style="bold magenta")
    for heading in ("Column Name", "Null Values", "Null Percentage"):
        breakdown.add_column(heading)

    row_count = len(df)
    for col in df.columns:
        missing = df[col].isnull().sum()
        if missing > 0:
            share = (missing / row_count) * 100
            breakdown.add_row(str(col), str(missing), f"{share:.2f}%")
    console.print(breakdown)
|
|
106
|
+
|
|
107
|
+
def _duplicate_analysis(df: pd.DataFrame):
    """Print the number and percentage of fully duplicated rows.

    When duplicates exist, up to five of the rows involved are shown
    (``keep=False`` marks every member of a duplicate group).

    Args:
        df: Dataset to inspect.
    """
    print_section("Duplicate Analysis")
    total_rows = len(df)
    dup_count = df.duplicated().sum()
    # Guard the empty-frame case, which would otherwise be a 0/0 division.
    dup_pct = (dup_count / total_rows) * 100 if total_rows else 0.0
    console.print(f"[bold]Total Duplicate Rows:[/bold] {dup_count}")
    console.print(f"[bold]Duplicate Percentage:[/bold] {dup_pct:.2f}%")

    if dup_count > 0:
        console.print("Sample Duplicates:")
        console.print(df[df.duplicated(keep=False)].head(5).to_string())
|
|
117
|
+
|
|
118
|
+
def _unique_value_analysis(df: pd.DataFrame, smart_mode: bool = False):
    """Print, per column, the distinct non-null count and up to five samples.

    Args:
        df: Dataset to inspect.
        smart_mode: When true, object-dtype columns are listed as skipped
            instead of being scanned.
    """
    print_section("Unique Value Analysis")
    report = Table(show_header=True, header_style="bold magenta")
    for heading in ("Column Name", "Unique Count", "Sample Unique Values"):
        report.add_column(heading)

    for col in df.columns:
        column = df[col]
        if smart_mode and str(column.dtype) == 'object':
            report.add_row(str(col), "Skipped (Smart Mode)", "...")
            continue

        distinct = column.dropna().unique()
        preview = ", ".join(str(value) for value in distinct[:5])
        if len(distinct) > 5:
            # Signal that the preview is truncated.
            preview = preview + "..."
        report.add_row(str(col), str(len(distinct)), preview)
    console.print(report)
|
|
136
|
+
|
|
137
|
+
def _value_counts_analysis(df: pd.DataFrame, smart_mode: bool = False):
    """Print the ten most frequent values of every object/category column.

    Args:
        df: Dataset to inspect.
        smart_mode: When true, columns with more than 100 distinct values
            are reported as skipped.
    """
    print_section("Value Counts Analysis (Categorical)")
    for col in df.select_dtypes(include=['object', 'category']).columns:
        if smart_mode and df[col].nunique() > 100:
            console.print(f"\n[bold underline]{col}[/bold underline] - [yellow]Skipped (Too many categories for Smart Mode)[/yellow]")
            continue

        console.print(f"\n[bold underline]{col}[/bold underline]")
        counts = df[col].value_counts()
        # Percentages are derived from the counts already in hand: this
        # avoids a second full value_counts pass and a label lookup that can
        # be misread as a positional one on non-integer indexes.
        total = counts.sum()

        table = Table(show_header=True, header_style="bold magenta")
        table.add_column("Value")
        table.add_column("Count")
        table.add_column("Frequency %")

        for val, count in counts.head(10).items():
            pct = (count / total) * 100 if total else 0.0
            table.add_row(str(val), str(count), f"{pct:.2f}%")

        if len(counts) > 10:
            table.add_row("...", "...", "...")
        console.print(table)
|
|
161
|
+
|
|
162
|
+
def _numerical_analysis(df: pd.DataFrame):
    """Print descriptive statistics for every numeric column.

    Nulls are dropped before computing; a column with no remaining values
    only gets its heading printed.
    """
    print_section("Numerical Analysis")
    numeric_names = df.select_dtypes(include=['number']).columns
    if len(numeric_names) == 0:
        console.print("No numerical columns found.")
        return

    for col in numeric_names:
        console.print(f"\n[bold underline]{col}[/bold underline]")
        values = df[col].dropna()
        if values.empty:
            continue

        modes = values.mode()
        lowest, highest = values.min(), values.max()
        stats = [
            ("Count", str(len(values))),
            ("Mean", f"{values.mean():.4f}"),
            ("Median", f"{values.median():.4f}"),
            ("Mode", str(modes.iloc[0]) if not modes.empty else "N/A"),
            ("Std Dev", f"{values.std():.4f}"),
            ("Variance", f"{values.var():.4f}"),
            ("Min", f"{lowest:.4f}"),
            ("Max", f"{highest:.4f}"),
            ("Range", f"{(highest - lowest):.4f}"),
            ("Q1 (25%)", f"{values.quantile(0.25):.4f}"),
            ("Q3 (75%)", f"{values.quantile(0.75):.4f}"),
            ("Skewness", f"{values.skew():.4f}"),
            ("Kurtosis", f"{values.kurtosis():.4f}"),
        ]

        summary = Table(show_header=False)
        summary.add_column("Stat", style="dim")
        summary.add_column("Value")
        for stat_name, stat_value in stats:
            summary.add_row(stat_name, stat_value)

        console.print(summary)
|
|
195
|
+
|
|
196
|
+
def _outlier_analysis(df: pd.DataFrame):
    """Report numeric columns that have values outside the 1.5 * IQR fences."""
    print_section("Outlier Analysis (IQR Method)")

    report = Table(show_header=True, header_style="bold magenta")
    for heading in ("Column", "Outliers Count", "Outliers Percentage"):
        report.add_column(heading)

    flagged_columns = 0
    for col in df.select_dtypes(include=['number']).columns:
        values = df[col].dropna()
        if values.empty:
            continue

        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        low_fence = q1 - 1.5 * iqr
        high_fence = q3 + 1.5 * iqr

        outside = (values < low_fence) | (values > high_fence)
        n_outside = int(outside.sum())
        if n_outside > 0:
            flagged_columns += 1
            share = (n_outside / len(values)) * 100
            report.add_row(str(col), str(n_outside), f"{share:.2f}%")

    if flagged_columns:
        console.print(report)
    else:
        console.print("No outliers detected based on IQR method.")
|
|
225
|
+
|
|
226
|
+
def _correlation_analysis(df: pd.DataFrame, smart_mode: bool = False):
    """Print the correlation matrix of numeric columns and list strong pairs.

    Pairs with a coefficient of at least 0.7, or at most -0.7, are listed
    separately below the matrix.

    Args:
        df: Dataset to inspect.
        smart_mode: When true and there are more than 50 numeric columns,
            the matrix is not computed at all.
    """
    print_section("Correlation Analysis")
    numeric = df.select_dtypes(include=['number'])
    n_numeric = len(numeric.columns)

    if smart_mode and n_numeric > 50:
        console.print("[yellow]Skipping full correlation matrix calculation for >50 columns in Smart Mode.[/yellow]")
        return

    if n_numeric < 2:
        console.print("Not enough numerical columns for correlation.")
        return

    corr = numeric.corr()
    console.print(corr.to_string())

    # Walk the upper triangle only, so each pair is reported once.
    positives = []
    negatives = []
    names = corr.columns
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            coeff = corr.iloc[i, j]
            if pd.isna(coeff):
                continue
            if coeff >= 0.7:
                positives.append(f"{names[i]} & {names[j]}: {coeff:.2f}")
            elif coeff <= -0.7:
                negatives.append(f"{names[i]} & {names[j]}: {coeff:.2f}")

    if positives:
        console.print("\n[bold green]Strong Positive Correlations (>0.7):[/bold green]")
        for line in positives:
            console.print(f"- {line}")

    if negatives:
        console.print("\n[bold red]Strong Negative Correlations (<-0.7):[/bold red]")
        for line in negatives:
            console.print(f"- {line}")
|
|
263
|
+
|
|
264
|
+
def _datetime_analysis(df: pd.DataFrame):
    """Print min, max, range and missing count for datetime-like columns.

    Columns that already have a datetime dtype are used as they are. Object
    columns are treated as dates when their first ten non-null values parse
    with ``pd.to_datetime``. The parsed values are kept in a local mapping:
    ``df`` itself is never modified, so text that merely looks like dates in
    its first rows is not overwritten with ``NaT`` for later stages.

    Args:
        df: Dataset to inspect (read-only).
    """
    print_section("Datetime Analysis")

    # Column name -> datetime Series, in the frame's column order.
    datetime_columns = {}
    for col in df.columns:
        column = df[col]
        if pd.api.types.is_datetime64_any_dtype(column):
            datetime_columns[col] = column
            continue
        if not pd.api.types.is_object_dtype(column):
            continue

        sample = column.dropna().head(10)
        if sample.empty:
            continue
        try:
            # Strict parse of the sample decides whether this is a date column.
            pd.to_datetime(sample)
            parsed = pd.to_datetime(column, errors='coerce')
        except (ValueError, TypeError, OverflowError):
            continue
        if pd.api.types.is_datetime64_any_dtype(parsed):
            datetime_columns[col] = parsed

    if not datetime_columns:
        console.print("No datetime columns found.")
        return

    for col, column in datetime_columns.items():
        console.print(f"\n[bold underline]{col}[/bold underline]")
        present = column.dropna()
        if present.empty:
            continue
        earliest, latest = present.min(), present.max()
        console.print(f"Minimum Date: {earliest}")
        console.print(f"Maximum Date: {latest}")
        console.print(f"Date Range: {latest - earliest}")
        # For inferred columns this also counts values that failed to parse.
        console.print(f"Missing Dates: {column.isnull().sum()}")
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from rich.prompt import IntPrompt, Prompt, Confirm
|
|
3
|
+
from .utils import console, print_section, print_info
|
|
4
|
+
|
|
5
|
+
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return an automatically cleaned copy of ``df``.

    Steps, in order:

    1. Strip surrounding whitespace from string cells in object columns
       (non-string cells are left untouched).
    2. Drop exact duplicate rows. This runs after stripping, so rows that
       differ only by padding are treated as duplicates.
    3. Fill missing values: median for numeric columns, most frequent value
       for object/categorical columns.
    4. Downcast ``int64``/``float64`` columns to a smaller numeric dtype.
    5. Convert an object column to datetime only when every non-null value
       parses, so no value is silently replaced by ``NaT``.

    Shape, missing-value and duplicate counts are printed before and after.

    Args:
        df: Input dataset; it is copied and never modified.

    Returns:
        The cleaned DataFrame.
    """
    cleaned_df = df.copy()

    print_section("Data Cleaning")
    console.print(f"Before Cleaning Shape: {cleaned_df.shape}")
    console.print(f"Before Cleaning Missing Values: {cleaned_df.isnull().sum().sum()}")
    console.print(f"Before Cleaning Duplicate Count: {cleaned_df.duplicated().sum()}")

    # 1. Whitespace. `.str.strip()` would turn non-string cells into NaN, so
    #    only real strings are touched and the column stays object dtype.
    for col in cleaned_df.select_dtypes(include=['object']).columns:
        column = cleaned_df[col]
        cleaned_df[col] = pd.Series(
            [cell.strip() if isinstance(cell, str) else cell for cell in column],
            index=column.index,
            dtype=object,
        )

    # 2. Duplicates.
    cleaned_df = cleaned_df.drop_duplicates()

    # 3. Imputation. The result is assigned back to the frame: calling
    #    fillna(inplace=True) on `cleaned_df[col]` acts on a temporary and
    #    does not update the frame under pandas Copy-on-Write.
    for col in cleaned_df.columns:
        column = cleaned_df[col]
        if not column.isnull().any():
            continue
        if pd.api.types.is_numeric_dtype(column):
            cleaned_df[col] = column.fillna(column.median())
        elif pd.api.types.is_object_dtype(column) or isinstance(column.dtype, pd.CategoricalDtype):
            modes = column.mode()
            if not modes.empty:
                cleaned_df[col] = column.fillna(modes.iloc[0])

    # 4. Downcasting (best effort: a column that cannot be downcast is kept).
    for col in cleaned_df.select_dtypes(include=['float64', 'int64']).columns:
        target = 'integer' if 'int' in str(cleaned_df[col].dtype) else 'float'
        try:
            cleaned_df[col] = pd.to_numeric(cleaned_df[col], downcast=target)
        except (ValueError, TypeError):
            continue

    # 5. Datetime detection. The whole column is parsed strictly, so the
    #    outcome is deterministic and never discards values.
    for col in cleaned_df.select_dtypes(include=['object']).columns:
        column = cleaned_df[col]
        if column.dropna().empty:
            continue
        try:
            parsed = pd.to_datetime(column)
        except (ValueError, TypeError, OverflowError):
            continue
        cleaned_df[col] = parsed

    console.print(f"\nAfter Cleaning Shape: {cleaned_df.shape}")
    console.print(f"After Cleaning Missing Values: {cleaned_df.isnull().sum().sum()}")
    console.print(f"After Cleaning Duplicate Count: {cleaned_df.duplicated().sum()}")

    return cleaned_df
|
|
56
|
+
|
|
57
|
+
def interactive_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Run a menu-driven cleaning session on a copy of ``df``.

    The menu repeats until option 6 is chosen. The user is then asked whether
    to write the result to a CSV file.

    Returns:
        The cleaned copy; the input frame is not modified.
    """
    df = df.copy()
    menu_lines = (
        "1. Remove Null Values",
        "2. Remove Duplicate Values",
        "3. Remove Categorical Columns",
        "4. Remove Specific Column",
        "5. Remove Specific Row",
        "6. Finish Cleaning",
    )

    while True:
        print_section("Interactive Data Cleaning")
        for line in menu_lines:
            console.print(line)

        choice = IntPrompt.ask("Select an option", choices=["1", "2", "3", "4", "5", "6"])

        if choice == 6:
            break

        if choice == 1:
            df.dropna(inplace=True)
            console.print("[green]✔ Null values removed.[/green]")
        elif choice == 2:
            df.drop_duplicates(inplace=True)
            console.print("[green]✔ Duplicates removed.[/green]")
        elif choice == 3:
            cat_cols = df.select_dtypes(include=['object', 'category']).columns
            df.drop(columns=cat_cols, inplace=True)
            console.print(f"[green]✔ Categorical columns removed: {list(cat_cols)}[/green]")
        elif choice == 4:
            col_name = Prompt.ask("Enter column name to remove")
            if col_name not in df.columns:
                console.print(f"[red]✖ Column '{col_name}' not found.[/red]")
            else:
                df.drop(columns=[col_name], inplace=True)
                console.print(f"[green]✔ Column '{col_name}' removed.[/green]")
        elif choice == 5:
            row_idx = IntPrompt.ask("Enter row index to remove")
            try:
                # The value is an index label, not a position.
                df.drop(index=row_idx, inplace=True)
                console.print(f"[green]✔ Row {row_idx} removed.[/green]")
            except Exception as e:
                console.print(f"[red]✖ Failed to remove row: {e}[/red]")

    if Confirm.ask("Do you want to save this new cleaned data to a file?"):
        save_path = Prompt.ask("Enter filename", default="cleaned_data.csv")
        df.to_csv(save_path, index=False)
        console.print(f"[green]✔ New data saved to {save_path}[/green]")

    return df
|
|
104
|
+
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import click
|
|
2
|
+
import sys
|
|
3
|
+
import time
|
|
4
|
+
from .validator import load_and_validate
|
|
5
|
+
from .analyzer import run_analysis
|
|
6
|
+
from .cleaner import clean_data
|
|
7
|
+
from .visualizer import generate_visualizations
|
|
8
|
+
from .insights import generate_insights
|
|
9
|
+
from .utils import print_header, print_section, print_info
|
|
10
|
+
|
|
11
|
+
@click.command()
@click.argument('file_path', type=click.Path(exists=True))
@click.option('--clean', is_flag=True, help='Only perform data cleaning')
@click.option('--visualize', is_flag=True, help='Only perform data visualization')
@click.option('--all', 'run_all', is_flag=True, help='Run all modules (default behavior)')
def cli(file_path, clean, visualize, run_all):
    """AutoEDA: Automatic Exploratory Data Analysis & Cleaning"""
    # The docstring above doubles as the command's help text, so it is kept
    # verbatim. All work is delegated to analyze().
    analyze(file_path, clean=clean, visualize=visualize, run_all=run_all)
|
|
19
|
+
|
|
20
|
+
def analyze(file_path, clean=False, visualize=False, run_all=False):
    """Load a CSV file and run the requested AutoEDA stages.

    Args:
        file_path: Path of the CSV file to load.
        clean: Run the automatic cleaning stage. On its own, the cleaned
            frame is returned straight after cleaning.
        visualize: Run the visualization stage. Combined with ``clean``,
            the charts are drawn from the cleaned data.
        run_all: Run analysis, insights and visualization, then offer
            interactive cleaning and an optional re-run. This is also the
            behaviour when neither ``clean`` nor ``visualize`` is set.

    Returns:
        The DataFrame in its final state.

    Note:
        If the file cannot be loaded, the error is printed and the process
        exits with status 1.
    """
    # Function-scope imports, as in the rest of this function's history:
    # they are only needed once a run is actually under way.
    from rich.prompt import Confirm
    from .cleaner import interactive_clean
    from .utils import console

    # If no flags are provided, run all by default
    if not clean and not visualize:
        run_all = True

    print_header("AutoEDA Complete Analysis Report")
    start_time = time.time()

    try:
        df, smart_mode = load_and_validate(file_path)
    except Exception as e:
        click.secho(f"Error loading file: {e}", fg='red')
        sys.exit(1)

    if smart_mode:
        console.print("\n[bold yellow]Large Dataset Detected[/bold yellow]")
        console.print(f"Rows: {len(df):,}")
        console.print(f"Columns: {len(df.columns)}")
        console.print("[bold green]Optimization Mode: Enabled[/bold green]")

    if clean and not run_all:
        # clean_data() prints its own "Data Cleaning" section header.
        df = clean_data(df)
        print_info("Cleaning complete.")
        if not visualize:
            return df
        # `clean` together with `visualize`: fall through so the requested
        # charts are still produced, now from the cleaned data.

    while True:
        if run_all:
            print_info("Analyzing dataset...")
            run_analysis(df, smart_mode)

            # Insights
            print_section("Insights")
            for idx, insight in enumerate(generate_insights(df), 1):
                print_info(f"{idx}. {insight}")

        if run_all or visualize:
            print_section("Visualization")
            generate_visualizations(df, smart_mode)
            print_info("Visualizations generated.")

        # Only a full run offers interactive cleaning and a re-run.
        if not run_all:
            break
        if not Confirm.ask("\nDo you want to proceed to interactive data cleaning?"):
            break
        df = interactive_clean(df)
        if not Confirm.ask("\nDo you want to run the complete analysis again on the new data?"):
            break

    end_time = time.time()
    console.print(f"\n[bold cyan]Processing Time:[/bold cyan] {end_time - start_time:.2f} seconds")

    # Track Memory Usage
    mem_usage_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
    console.print(f"[bold cyan]Final Memory Usage:[/bold cyan] {mem_usage_mb:.2f} MB")

    console.print("\n[bold green]Thank you for using this tool![/bold green]")
    return df
|
|
92
|
+
|
|
93
|
+
if __name__ == '__main__':
|
|
94
|
+
cli()
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from typing import List
|
|
3
|
+
|
|
4
|
+
def generate_insights(df: pd.DataFrame) -> List[str]:
    """Build a list of plain-English observations about ``df``.

    Covers, in order: overall and per-column (>20%) missing data, duplicate
    rows, strongly correlated numeric pairs (|r| > 0.8), object/category
    columns dominated by one value (>80%), and a coarse machine-learning
    readiness note.

    Args:
        df: Dataset to describe. A frame with no rows or no columns is
            accepted and reported as complete.

    Returns:
        The observations, in the order listed above.
    """
    insights = []
    n_rows, n_cols = df.shape
    null_mask = df.isnull()

    # Missing values
    total_cells = n_rows * n_cols
    # An empty frame has no cells; report 0% rather than dividing by zero.
    missing_pct = (null_mask.sum().sum() / total_cells) * 100 if total_cells else 0.0
    if missing_pct > 0:
        insights.append(f"Dataset contains {missing_pct:.2f}% missing values overall.")
    else:
        insights.append("Dataset is complete with 0% missing values.")

    # Iterating the per-column means (instead of df[col]) also works when
    # two columns share a name.
    for col, col_missing in (null_mask.mean() * 100).items():
        if col_missing > 20:
            insights.append(f"Column '{col}' has significant missing data ({col_missing:.2f}%).")

    # Duplicates
    dup_pct = (df.duplicated().sum() / n_rows) * 100 if n_rows else 0.0
    if dup_pct > 0:
        insights.append(f"Dataset contains {dup_pct:.2f}% duplicate rows.")

    # Correlations (numerical): upper triangle only, one entry per pair.
    num_df = df.select_dtypes(include=['number'])
    if len(num_df.columns) > 1:
        corr = num_df.corr()
        names = corr.columns
        for i in range(len(names)):
            for j in range(i + 1, len(names)):
                c_val = corr.iloc[i, j]
                if pd.isna(c_val):
                    continue
                if c_val > 0.8:
                    insights.append(f"Strong positive correlation ({c_val:.2f}) exists between '{names[i]}' and '{names[j]}'.")
                elif c_val < -0.8:
                    insights.append(f"Strong negative correlation ({c_val:.2f}) exists between '{names[i]}' and '{names[j]}'.")

    # Categorical dominance
    for col, column in df.select_dtypes(include=['object', 'category']).items():
        val_counts = column.value_counts(normalize=True)
        if not val_counts.empty and val_counts.iloc[0] > 0.8:
            insights.append(f"Category '{val_counts.index[0]}' dominates the '{col}' column ({val_counts.iloc[0]*100:.2f}%).")

    # ML suitability
    if missing_pct < 5 and len(num_df.columns) >= 2 and n_rows > 100:
        insights.append("Dataset appears suitable for Machine Learning with standard preprocessing.")
    else:
        insights.append("Recommended preprocessing steps: Extensive data cleaning and imputation required before Machine Learning.")

    return insights
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from rich.console import Console
|
|
2
|
+
from rich.panel import Panel
|
|
3
|
+
|
|
4
|
+
console = Console()
|
|
5
|
+
|
|
6
|
+
def print_header(title: str):
    """Print a blank line, then ``title`` upper-cased inside a cyan panel."""
    console.print()
    banner = Panel(title.upper(), style="bold cyan", expand=False)
    console.print(banner)
|
|
9
|
+
|
|
10
|
+
def print_section(title: str):
    """Print ``title`` as a bold yellow section divider after a blank line."""
    divider = f"\n[bold yellow]--- {title} ---[/bold yellow]"
    console.print(divider)
|
|
12
|
+
|
|
13
|
+
def print_info(msg: str):
    """Print ``msg`` prefixed with a green check mark."""
    line = f"[green]✔[/green] {msg}"
    console.print(line)
|
|
15
|
+
|
|
16
|
+
def print_warning(msg: str):
    """Print ``msg`` prefixed with a bold red exclamation mark."""
    line = f"[bold red]![/bold red] {msg}"
    console.print(line)
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from typing import Tuple
|
|
4
|
+
from .utils import print_warning
|
|
5
|
+
|
|
6
|
+
def load_and_validate(file_path: str) -> Tuple[pd.DataFrame, bool]:
    """Read a CSV file and decide whether "smart mode" should be enabled.

    Args:
        file_path: Path of the file to read. A name that does not end in
            ``.csv`` only triggers a warning; reading is still attempted.

    Returns:
        ``(df, smart_mode)``, where ``smart_mode`` is true when the file is
        larger than 100 MB or holds more than 100,000 rows.

    Raises:
        FileNotFoundError: ``file_path`` does not exist.
        ValueError: The file cannot be parsed, has no columns, or has no rows.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    if not file_path.lower().endswith(".csv"):
        print_warning("File does not end with .csv, attempting to read as CSV anyway.")

    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)

    try:
        df = pd.read_csv(file_path, low_memory=False)
    except Exception as e:
        # Callers get one exception type; the cause stays attached.
        raise ValueError(f"Failed to read CSV file. Error: {e}") from e

    # The column check must come first: `df.empty` is also true for a frame
    # without columns, which would make this message unreachable.
    if len(df.columns) == 0:
        raise ValueError("No columns found in the dataset.")

    if df.empty:
        raise ValueError("The dataset is empty.")

    smart_mode = file_size_mb > 100 or len(df) > 100000

    return df, smart_mode
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
import matplotlib.pyplot as plt
|
|
2
|
+
import seaborn as sns
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import numpy as np
|
|
5
|
+
from rich.prompt import Confirm
|
|
6
|
+
from .utils import console, print_section
|
|
7
|
+
|
|
8
|
+
def generate_visualizations(df: pd.DataFrame, smart_mode: bool = False):
    """Interactively select EDA plots for ``df`` and show them in one figure.

    The user is asked (via ``Confirm.ask``) which plot families to include:
    missing-value graphs, categorical count/pie charts, numerical
    histogram/KDE/box plots, a correlation heatmap and datetime line charts.
    All selected plots are laid out on a three-column grid and displayed
    with a blocking ``plt.show``.

    Args:
        df: Data to visualize. Datetime charts are only produced for columns
            whose dtype is already a datetime dtype.
        smart_mode: When true and ``df`` has more than 50,000 rows, a fixed
            random sample of 50,000 rows is plotted instead of the full data.

    Returns:
        None. If nothing was selected or nothing can be plotted, a message
        is printed and the function returns without opening a window.
    """
    sns.set_theme(style="whitegrid")

    if smart_mode and len(df) > 50000:
        console.print(f"[yellow]Smart Mode: Sampling dataset from {len(df):,} to 50,000 rows for faster visualizations.[/yellow]")
        df = df.sample(50000, random_state=42)

    print_section("Interactive Visualization Menu")
    console.print("Select which graphs you want to generate:")

    show_missing = Confirm.ask("Show Missing Value Graphs?")
    show_categorical = Confirm.ask("Show Categorical Feature Graphs?")
    show_numerical = Confirm.ask("Show Numerical Feature Distributions?")
    show_correlation = Confirm.ask("Show Correlation Heatmap?")
    show_datetime = Confirm.ask("Show Datetime Charts?")

    # Each entry describes one subplot; extra keys carry precomputed data.
    plots = []

    if show_missing:
        # Compute the per-column counts once and reuse them when drawing.
        missing_counts = df.isnull().sum()
        missing_counts = missing_counts[missing_counts > 0]
        if not missing_counts.empty:
            plots.append({'type': 'missing_heatmap'})
            plots.append({'type': 'missing_bar', 'counts': missing_counts})

    if show_categorical:
        cat_cols = df.select_dtypes(include=['object', 'category']).columns
        for col in cat_cols:
            vc = df[col].value_counts().head(10)
            if not vc.empty:
                plots.append({'type': 'cat_count', 'col': col, 'vc': vc})
                # NOTE(review): always true because of head(10) above; kept
                # so the set of generated plots is unchanged.
                if len(vc) <= 10:
                    plots.append({'type': 'cat_pie', 'col': col, 'vc': vc})

    if show_numerical:
        num_cols = df.select_dtypes(include=['number']).columns
        for col in num_cols:
            plots.append({'type': 'num_hist', 'col': col})
            plots.append({'type': 'num_kde', 'col': col})
            plots.append({'type': 'num_box', 'col': col})

    if show_correlation:
        num_df = df.select_dtypes(include=['number'])
        # A correlation matrix needs at least two numeric columns.
        if len(num_df.columns) > 1:
            plots.append({'type': 'correlation'})

    if show_datetime:
        dt_cols = df.select_dtypes(include=['datetime']).columns
        for col in dt_cols:
            plots.append({'type': 'datetime', 'col': col})

    n_plots = len(plots)
    if n_plots == 0:
        console.print("No plots selected or no data available to plot.")
        return

    cols = 3
    rows = (n_plots + cols - 1) // cols  # ceiling division

    # squeeze=False always yields a 2D array of Axes, so flatten() gives a
    # 1D sequence of individual Axes for any plot count. Wrapping the 1x3
    # result in another array (as done previously for a single plot) made
    # axes[0] an array instead of an Axes and broke the first plot call.
    fig, axes = plt.subplots(rows, cols, figsize=(18, 5 * rows), squeeze=False)
    axes = axes.flatten()

    for i, plot_info in enumerate(plots):
        ax = axes[i]
        ptype = plot_info['type']

        if ptype == 'missing_heatmap':
            sns.heatmap(df.isnull(), cbar=False, cmap='viridis', yticklabels=False, ax=ax)
            ax.set_title('Missing Value Heatmap')
        elif ptype == 'missing_bar':
            plot_info['counts'].plot(kind='bar', color='salmon', ax=ax)
            ax.set_title('Missing Values per Column')
            ax.tick_params(axis='x', rotation=45)
        elif ptype == 'cat_count':
            col = plot_info['col']
            sns.countplot(y=col, data=df, order=plot_info['vc'].index, palette='Set2', ax=ax)
            ax.set_title(f'Count Plot: {col}')
        elif ptype == 'cat_pie':
            col = plot_info['col']
            vc = plot_info['vc']
            ax.pie(vc.values, labels=vc.index, autopct='%1.1f%%', startangle=90, colors=sns.color_palette('Set2'))
            ax.set_title(f'Pie Chart: {col}')
        elif ptype == 'num_hist':
            col = plot_info['col']
            sns.histplot(df[col].dropna(), kde=False, ax=ax, color='skyblue')
            ax.set_title(f'Histogram: {col}')
        elif ptype == 'num_kde':
            col = plot_info['col']
            sns.kdeplot(df[col].dropna(), ax=ax, color='orange', fill=True)
            ax.set_title(f'KDE Plot: {col}')
        elif ptype == 'num_box':
            col = plot_info['col']
            sns.boxplot(x=df[col].dropna(), ax=ax, color='lightgreen')
            ax.set_title(f'Boxplot: {col}')
        elif ptype == 'correlation':
            corr = df.select_dtypes(include=['number']).corr()
            sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', square=True, ax=ax)
            ax.set_title('Correlation Heatmap')
        elif ptype == 'datetime':
            col = plot_info['col']
            # Count records per calendar date and plot them in date order.
            vc = df[col].dropna().dt.date.value_counts().sort_index()
            vc.plot(kind='line', color='purple', ax=ax)
            ax.set_title(f'Time Series: {col}')
            ax.tick_params(axis='x', rotation=45)

    # Remove grid cells that did not receive a plot.
    for unused_ax in axes[n_plots:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    console.print("[green]Opening all selected plots in a single window. Close the window to continue...[/green]")
    plt.show(block=True)
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: data-autoeda
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Automatic Exploratory Data Analysis, Cleaning, Validation, Visualization, and Smart Insights on ANY CSV dataset.
|
|
5
|
+
Author: shubham kumar
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Shu40/AutoEDA/
|
|
8
|
+
Project-URL: Repository, https://github.com/Shu40/AutoEDA/
|
|
9
|
+
Requires-Python: >=3.8
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: pandas>=1.3.0
|
|
12
|
+
Requires-Dist: numpy>=1.20.0
|
|
13
|
+
Requires-Dist: matplotlib>=3.4.0
|
|
14
|
+
Requires-Dist: seaborn>=0.11.0
|
|
15
|
+
Requires-Dist: scipy>=1.7.0
|
|
16
|
+
Requires-Dist: click>=8.0.0
|
|
17
|
+
Requires-Dist: rich>=10.0.0
|
|
18
|
+
|
|
19
|
+
# AutoEDA
|
|
20
|
+
|
|
21
|
+
A production-ready Python package that performs automatic Exploratory Data Analysis (EDA), Data Cleaning, Data Validation, Visualization, and Smart Insights on ANY CSV dataset.
|
|
22
|
+
|
|
23
|
+
## Features
|
|
24
|
+
- **Dynamic Analysis**: Automatically detects numerical, categorical, boolean, and datetime columns.
|
|
25
|
+
- **Smart Cleaning**: Handles missing values, removes duplicates, and optimizes datatypes automatically.
|
|
26
|
+
- **Visualizations**: Automatically generates relevant charts using Matplotlib and Seaborn.
|
|
27
|
+
- **Smart Insights**: Generates English observations based on statistical findings.
|
|
28
|
+
- **Rich Terminal UI**: Beautiful, organized CLI reports.
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install .
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Usage
|
|
37
|
+
|
|
38
|
+
### Python API
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from autoeda import analyze
|
|
42
|
+
|
|
43
|
+
# Complete analysis
|
|
44
|
+
analyze("data.csv")
|
|
45
|
+
|
|
46
|
+
# Only clean data
|
|
47
|
+
# analyze("data.csv", mode="clean")
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### CLI
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
# Complete analysis
|
|
54
|
+
autoeda data.csv
|
|
55
|
+
|
|
56
|
+
# Specific modes
|
|
57
|
+
autoeda data.csv --clean
|
|
58
|
+
autoeda data.csv --visualize
|
|
59
|
+
autoeda data.csv --all
|
|
60
|
+
```
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
autoeda/__init__.py
|
|
4
|
+
autoeda/analyzer.py
|
|
5
|
+
autoeda/cleaner.py
|
|
6
|
+
autoeda/cli.py
|
|
7
|
+
autoeda/insights.py
|
|
8
|
+
autoeda/utils.py
|
|
9
|
+
autoeda/validator.py
|
|
10
|
+
autoeda/visualizer.py
|
|
11
|
+
data_autoeda.egg-info/PKG-INFO
|
|
12
|
+
data_autoeda.egg-info/SOURCES.txt
|
|
13
|
+
data_autoeda.egg-info/dependency_links.txt
|
|
14
|
+
data_autoeda.egg-info/entry_points.txt
|
|
15
|
+
data_autoeda.egg-info/requires.txt
|
|
16
|
+
data_autoeda.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
autoeda
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "data-autoeda"
|
|
7
|
+
version = "0.1.1"
|
|
8
|
+
description = "Automatic Exploratory Data Analysis, Cleaning, Validation, Visualization, and Smart Insights on ANY CSV dataset."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "shubham kumar" }
|
|
14
|
+
]
|
|
15
|
+
dependencies = [
|
|
16
|
+
"pandas>=1.3.0",
|
|
17
|
+
"numpy>=1.20.0",
|
|
18
|
+
"matplotlib>=3.4.0",
|
|
19
|
+
"seaborn>=0.11.0",
|
|
20
|
+
"scipy>=1.7.0",
|
|
21
|
+
"click>=8.0.0",
|
|
22
|
+
"rich>=10.0.0"
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[project.scripts]
|
|
26
|
+
autoeda = "autoeda.cli:cli"
|
|
27
|
+
[project.urls]
|
|
28
|
+
Homepage = "https://github.com/Shu40/AutoEDA/"
|
|
29
|
+
Repository = "https://github.com/Shu40/AutoEDA/"
|