data-autoeda 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,60 @@
1
+ Metadata-Version: 2.4
2
+ Name: data-autoeda
3
+ Version: 0.1.1
4
+ Summary: Automatic Exploratory Data Analysis, Cleaning, Validation, Visualization, and Smart Insights on ANY CSV dataset.
5
+ Author: shubham kumar
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/Shu40/AutoEDA/
8
+ Project-URL: Repository, https://github.com/Shu40/AutoEDA/
9
+ Requires-Python: >=3.8
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: pandas>=1.3.0
12
+ Requires-Dist: numpy>=1.20.0
13
+ Requires-Dist: matplotlib>=3.4.0
14
+ Requires-Dist: seaborn>=0.11.0
15
+ Requires-Dist: scipy>=1.7.0
16
+ Requires-Dist: click>=8.0.0
17
+ Requires-Dist: rich>=10.0.0
18
+
19
+ # AutoEDA
20
+
21
+ A production-ready Python package that performs automatic Exploratory Data Analysis (EDA), Data Cleaning, Data Validation, Visualization, and Smart Insights on ANY CSV dataset.
22
+
23
+ ## Features
24
+ - **Dynamic Analysis**: Automatically detects numerical, categorical, boolean, and datetime columns.
25
+ - **Smart Cleaning**: Handles missing values, removes duplicates, and optimizes datatypes automatically.
26
+ - **Visualizations**: Automatically generates relevant charts using Matplotlib and Seaborn.
27
+ - **Smart Insights**: Generates English observations based on statistical findings.
28
+ - **Rich Terminal UI**: Beautiful, organized CLI reports.
29
+
30
+ ## Installation
31
+
32
+ ```bash
33
+ pip install .
34
+ ```
35
+
36
+ ## Usage
37
+
38
+ ### Python API
39
+
40
+ ```python
41
+ from autoeda import analyze
42
+
43
+ # Complete analysis
44
+ analyze("data.csv")
45
+
46
+ # Only clean data
47
+ # analyze("data.csv", clean=True)
48
+ ```
49
+
50
+ ### CLI
51
+
52
+ ```bash
53
+ # Complete analysis
54
+ autoeda data.csv
55
+
56
+ # Specific modes
57
+ autoeda data.csv --clean
58
+ autoeda data.csv --visualize
59
+ autoeda data.csv --all
60
+ ```
@@ -0,0 +1,42 @@
1
+ # AutoEDA
2
+
3
+ A production-ready Python package that performs automatic Exploratory Data Analysis (EDA), Data Cleaning, Data Validation, Visualization, and Smart Insights on ANY CSV dataset.
4
+
5
+ ## Features
6
+ - **Dynamic Analysis**: Automatically detects numerical, categorical, boolean, and datetime columns.
7
+ - **Smart Cleaning**: Handles missing values, removes duplicates, and optimizes datatypes automatically.
8
+ - **Visualizations**: Automatically generates relevant charts using Matplotlib and Seaborn.
9
+ - **Smart Insights**: Generates English observations based on statistical findings.
10
+ - **Rich Terminal UI**: Beautiful, organized CLI reports.
11
+
12
+ ## Installation
13
+
14
+ ```bash
15
+ pip install .
16
+ ```
17
+
18
+ ## Usage
19
+
20
+ ### Python API
21
+
22
+ ```python
23
+ from autoeda import analyze
24
+
25
+ # Complete analysis
26
+ analyze("data.csv")
27
+
28
+ # Only clean data
29
+ # analyze("data.csv", clean=True)
30
+ ```
31
+
32
+ ### CLI
33
+
34
+ ```bash
35
+ # Complete analysis
36
+ autoeda data.csv
37
+
38
+ # Specific modes
39
+ autoeda data.csv --clean
40
+ autoeda data.csv --visualize
41
+ autoeda data.csv --all
42
+ ```
@@ -0,0 +1,3 @@
1
+ from .cli import analyze
2
+
3
+ __all__ = ["analyze"]
@@ -0,0 +1,288 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from rich.table import Table
4
+ from rich.progress import track
5
+ from .utils import console, print_section, print_warning
6
+
7
def run_analysis(df: pd.DataFrame, smart_mode: bool = False):
    """Run every analysis module against ``df`` in a fixed order.

    Each module prints its own report section. If a module raises, the error
    is printed in red and the remaining modules still run.

    Args:
        df: Dataset to analyse.
        smart_mode: Forwarded to the modules that accept it (unique values,
            value counts, correlation) so they can skip or trim heavy work.
    """
    # Modules called as module(df, smart_mode); every other module takes only df.
    smart_aware = [_unique_value_analysis, _value_counts_analysis, _correlation_analysis]

    pipeline = [
        (_dataset_overview, "Dataset Overview"),
        (_column_analysis, "Column Analysis"),
        (_null_analysis, "Null Value Analysis"),
        (_duplicate_analysis, "Duplicate Analysis"),
        (_unique_value_analysis, "Unique Value Analysis"),
        (_value_counts_analysis, "Value Counts Analysis"),
        (_numerical_analysis, "Numerical Analysis"),
        (_outlier_analysis, "Outlier Analysis"),
        (_correlation_analysis, "Correlation Analysis"),
        (_datetime_analysis, "Datetime Analysis"),
    ]

    if smart_mode:
        console.print("[yellow]Running analysis in Smart Mode... Some heavy computations may be optimized or skipped.[/yellow]")

    for module, label in track(pipeline, description="Running Analysis Modules..."):
        call_args = (df, smart_mode) if module in smart_aware else (df,)
        try:
            module(*call_args)
        except Exception as exc:
            # One failing section must not abort the rest of the report.
            console.print(f"[red]Error in {label}: {exc}[/red]")
32
+
33
def _dataset_overview(df: pd.DataFrame):
    """Print shape, column names, memory usage and a head/tail preview of ``df``.

    Column names and the previews come straight from the data, so they are
    rendered as literal text. With Rich markup enabled, bracketed text such
    as ``[red]`` inside a cell would be consumed as a style tag, and a stray
    ``[/x]`` would raise ``MarkupError``.

    Args:
        df: Dataset to summarise.
    """
    from rich.markup import escape

    print_section("Dataset Overview")

    n_rows, n_cols = df.shape

    table = Table(show_header=True, header_style="bold magenta")
    table.add_column("Metric", style="dim", width=20)
    table.add_column("Value")

    table.add_row("Dataset Shape", f"{n_rows} Rows, {n_cols} Columns")
    table.add_row("Total Rows", str(n_rows))
    table.add_row("Total Columns", str(n_cols))
    # Table cells are parsed as markup, so escape the user-supplied names.
    table.add_row("Column Names", escape(", ".join(df.columns.astype(str).tolist())))

    # deep=True asks pandas to inspect object columns instead of counting
    # only the pointer array.
    memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
    table.add_row("Memory Usage", f"{memory_mb:.2f} MB")

    console.print(table)

    print_section("First 5 Rows")
    console.print(df.head(5).to_string(), markup=False)
    print_section("Last 5 Rows")
    console.print(df.tail(5).to_string(), markup=False)
54
+
55
def _column_analysis(df: pd.DataFrame):
    """Print the columns grouped by kind, then a per-column summary table.

    The table lists dtype, non-null count, null count, null percentage and
    number of distinct values for every column.

    Args:
        df: Dataset to summarise.
    """
    from rich.markup import escape

    print_section("Column Analysis")

    num_cols = df.select_dtypes(include=['number']).columns.tolist()
    cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    bool_cols = df.select_dtypes(include=['bool']).columns.tolist()
    # 'datetime' alone only matches naive datetime64 columns; timezone-aware
    # columns need the separate 'datetimetz' selector.
    dt_cols = df.select_dtypes(include=['datetime', 'datetimetz']).columns.tolist()

    # The list reprs embed user-supplied column names, so escape them before
    # they are interpolated into a markup string.
    console.print(f"[bold]Numerical Columns:[/bold] {len(num_cols)} -> {escape(str(num_cols))}")
    console.print(f"[bold]Categorical Columns:[/bold] {len(cat_cols)} -> {escape(str(cat_cols))}")
    console.print(f"[bold]Boolean Columns:[/bold] {len(bool_cols)} -> {escape(str(bool_cols))}")
    console.print(f"[bold]Datetime Columns:[/bold] {len(dt_cols)} -> {escape(str(dt_cols))}")

    table = Table(show_header=True, header_style="bold magenta")
    for heading in ("Column Name", "Data Type", "Total Values", "Null Values", "Null %", "Unique Values"):
        table.add_column(heading)

    total_rows = len(df)
    for col in df.columns:
        column = df[col]
        null_count = column.isnull().sum()
        # A frame with zero rows has no meaningful percentage; report 0
        # instead of dividing by zero.
        null_pct = (null_count / total_rows) * 100 if total_rows else 0.0
        table.add_row(
            escape(str(col)),
            str(column.dtype),
            str(total_rows - null_count),
            str(null_count),
            f"{null_pct:.2f}%",
            str(column.nunique()),
        )
    console.print(table)
89
+
90
def _null_analysis(df: pd.DataFrame):
    """Print the total null count and, if any exist, a per-column breakdown.

    Only columns that contain at least one null appear in the table.

    Args:
        df: Dataset to inspect.
    """
    print_section("Null Value Analysis")

    grand_total = df.isnull().sum().sum()
    console.print(f"[bold]Total Null Values:[/bold] {grand_total}")

    # Nothing to tabulate when the dataset is complete.
    if not grand_total > 0:
        return

    breakdown = Table(show_header=True, header_style="bold magenta")
    breakdown.add_column("Column Name")
    breakdown.add_column("Null Values")
    breakdown.add_column("Null Percentage")

    for name in df.columns:
        missing = df[name].isnull().sum()
        if missing > 0:
            share = (missing / len(df)) * 100
            breakdown.add_row(str(name), str(missing), f"{share:.2f}%")

    console.print(breakdown)
106
+
107
def _duplicate_analysis(df: pd.DataFrame):
    """Print the number and percentage of fully duplicated rows, with a sample.

    Args:
        df: Dataset to inspect.
    """
    print_section("Duplicate Analysis")

    total_rows = len(df)
    dup_count = int(df.duplicated().sum())
    # Guard the empty frame: 0 / 0 would otherwise be reported as "nan%".
    dup_pct = (dup_count / total_rows) * 100 if total_rows else 0.0

    console.print(f"[bold]Total Duplicate Rows:[/bold] {dup_count}")
    console.print(f"[bold]Duplicate Percentage:[/bold] {dup_pct:.2f}%")

    if dup_count > 0:
        console.print("Sample Duplicates:")
        # keep=False selects every member of a duplicate group, not just the
        # repeats. Raw cell text is printed without markup parsing so that
        # brackets in the data are shown verbatim.
        console.print(df[df.duplicated(keep=False)].head(5).to_string(), markup=False)
117
+
118
def _unique_value_analysis(df: pd.DataFrame, smart_mode: bool = False):
    """Print, per column, the count of distinct non-null values and a sample.

    Args:
        df: Dataset to inspect.
        smart_mode: When True, object-dtype columns are skipped and shown as
            "Skipped (Smart Mode)".
    """
    from rich.markup import escape

    print_section("Unique Value Analysis")

    table = Table(show_header=True, header_style="bold magenta")
    table.add_column("Column Name")
    table.add_column("Unique Count")
    table.add_column("Sample Unique Values")

    for col in df.columns:
        col_label = escape(str(col))

        if smart_mode and str(df[col].dtype) == 'object':
            table.add_row(col_label, "Skipped (Smart Mode)", "...")
            continue

        uniques = df[col].dropna().unique()
        sample = ", ".join(str(x) for x in uniques[:5])
        if len(uniques) > 5:
            sample += "..."
        # Sample values are raw data: escape them so bracketed text is not
        # interpreted (and dropped) as Rich markup.
        table.add_row(col_label, str(len(uniques)), escape(sample))

    console.print(table)
136
+
137
def _value_counts_analysis(df: pd.DataFrame, smart_mode: bool = False):
    """Print the ten most frequent values of every categorical column.

    Args:
        df: Dataset to inspect.
        smart_mode: When True, columns with more than 100 distinct values are
            skipped.
    """
    from rich.markup import escape

    print_section("Value Counts Analysis (Categorical)")

    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        series = df[col]
        col_label = escape(str(col))

        if smart_mode and series.nunique() > 100:
            console.print(f"\n[bold underline]{col_label}[/bold underline] - [yellow]Skipped (Too many categories for Smart Mode)[/yellow]")
            continue

        console.print(f"\n[bold underline]{col_label}[/bold underline]")

        # Count once and derive the percentage arithmetically. This equals
        # value_counts(normalize=True), which also divides by the non-null
        # total, and it avoids a second pass and a label lookup per row.
        vc = series.value_counts()
        non_null_total = vc.sum()

        table = Table(show_header=True, header_style="bold magenta")
        table.add_column("Value")
        table.add_column("Count")
        table.add_column("Frequency %")

        for val, count in vc.head(10).items():
            pct = count / non_null_total * 100
            table.add_row(escape(str(val)), str(count), f"{pct:.2f}%")

        if len(vc) > 10:
            table.add_row("...", "...", "...")
        console.print(table)
161
+
162
def _numerical_analysis(df: pd.DataFrame):
    """Print descriptive statistics for every numeric column.

    Nulls are dropped before computing. A column with no remaining values
    gets only its heading.

    Args:
        df: Dataset to inspect.
    """
    print_section("Numerical Analysis")

    numeric_names = df.select_dtypes(include=['number']).columns
    if len(numeric_names) == 0:
        console.print("No numerical columns found.")
        return

    for name in numeric_names:
        console.print(f"\n[bold underline]{name}[/bold underline]")

        values = df[name].dropna()
        if values.empty:
            continue

        modes = values.mode()
        mode_text = "N/A" if modes.empty else str(modes.iloc[0])

        # (label, formatted value) pairs in display order.
        rows = [
            ("Count", str(len(values))),
            ("Mean", f"{values.mean():.4f}"),
            ("Median", f"{values.median():.4f}"),
            ("Mode", mode_text),
            ("Std Dev", f"{values.std():.4f}"),
            ("Variance", f"{values.var():.4f}"),
            ("Min", f"{values.min():.4f}"),
            ("Max", f"{values.max():.4f}"),
            ("Range", f"{(values.max() - values.min()):.4f}"),
            ("Q1 (25%)", f"{values.quantile(0.25):.4f}"),
            ("Q3 (75%)", f"{values.quantile(0.75):.4f}"),
            ("Skewness", f"{values.skew():.4f}"),
            ("Kurtosis", f"{values.kurtosis():.4f}"),
        ]

        stats_table = Table(show_header=False)
        stats_table.add_column("Stat", style="dim")
        stats_table.add_column("Value")
        for label, text in rows:
            stats_table.add_row(label, text)

        console.print(stats_table)
195
+
196
def _outlier_analysis(df: pd.DataFrame):
    """Report, per numeric column, how many values fall outside the IQR fences.

    A value counts as an outlier when it is below Q1 - 1.5*IQR or above
    Q3 + 1.5*IQR. Columns without outliers are left out of the table.

    Args:
        df: Dataset to inspect.
    """
    print_section("Outlier Analysis (IQR Method)")

    report = Table(show_header=True, header_style="bold magenta")
    report.add_column("Column")
    report.add_column("Outliers Count")
    report.add_column("Outliers Percentage")

    found_any = False
    for name in df.select_dtypes(include=['number']).columns:
        values = df[name].dropna()
        if values.empty:
            continue

        first_q = values.quantile(0.25)
        third_q = values.quantile(0.75)
        spread = third_q - first_q
        low_fence = first_q - 1.5 * spread
        high_fence = third_q + 1.5 * spread

        flagged = values[(values < low_fence) | (values > high_fence)]
        n_flagged = len(flagged)
        if n_flagged == 0:
            continue

        found_any = True
        report.add_row(str(name), str(n_flagged), f"{(n_flagged / len(values)) * 100:.2f}%")

    if not found_any:
        console.print("No outliers detected based on IQR method.")
        return
    console.print(report)
225
+
226
def _correlation_analysis(df: pd.DataFrame, smart_mode: bool = False):
    """Print the correlation matrix of numeric columns and list strong pairs.

    A pair is listed as strong when its coefficient is >= 0.7 or <= -0.7.
    Each unordered pair is checked once.

    Args:
        df: Dataset to inspect.
        smart_mode: When True, the whole section is skipped if there are more
            than 50 numeric columns.
    """
    print_section("Correlation Analysis")

    numeric = df.select_dtypes(include=['number'])
    n_numeric = len(numeric.columns)

    if smart_mode and n_numeric > 50:
        console.print("[yellow]Skipping full correlation matrix calculation for >50 columns in Smart Mode.[/yellow]")
        return

    if n_numeric < 2:
        console.print("Not enough numerical columns for correlation.")
        return

    corr = numeric.corr()
    console.print(corr.to_string())

    names = corr.columns
    positives = []
    negatives = []

    # Walk the strict upper triangle so that (a, b) and (b, a) are not both reported.
    for i, left in enumerate(names):
        for j in range(i + 1, len(names)):
            coeff = corr.iloc[i, j]
            if pd.isna(coeff):
                continue
            if coeff >= 0.7:
                positives.append(f"{left} & {names[j]}: {coeff:.2f}")
            elif coeff <= -0.7:
                negatives.append(f"{left} & {names[j]}: {coeff:.2f}")

    if positives:
        console.print("\n[bold green]Strong Positive Correlations (>0.7):[/bold green]")
        for line in positives:
            console.print(f"- {line}")

    if negatives:
        console.print("\n[bold red]Strong Negative Correlations (<-0.7):[/bold red]")
        for line in negatives:
            console.print(f"- {line}")
263
+
264
def _datetime_analysis(df: pd.DataFrame):
    """Detect date-like text columns, convert them, and print their date range.

    An object column is treated as a date column when its first ten non-null
    values all parse with ``pd.to_datetime``. The whole column is then
    converted with ``errors='coerce'``, so later values that do not parse
    become ``NaT``.

    NOTE: the conversion is written back into ``df`` itself, so the caller's
    frame is modified and later steps see the converted columns.

    Args:
        df: Dataset to inspect; date-like object columns are converted in place.
    """
    import warnings

    print_section("Datetime Analysis")

    for col in df.select_dtypes(include=['object']).columns:
        sample = df[col].dropna().head(10)
        if sample.empty:
            continue
        try:
            with warnings.catch_warnings():
                # pandas may emit a UserWarning when it cannot infer a single
                # format; silence it for this probe so it does not clutter
                # the report.
                warnings.simplefilter("ignore", UserWarning)
                pd.to_datetime(sample)
                df[col] = pd.to_datetime(df[col], errors='coerce')
        except (ValueError, TypeError, OverflowError):
            # Not a date column (parse errors and out-of-bounds dates are
            # ValueError subclasses); leave it untouched.
            continue

    # Text with UTC offsets converts to timezone-aware columns, which the
    # plain 'datetime' selector does not match.
    dt_cols = df.select_dtypes(include=['datetime', 'datetimetz']).columns
    if len(dt_cols) == 0:
        console.print("No datetime columns found.")
        return

    for col in dt_cols:
        console.print(f"\n[bold underline]{col}[/bold underline]")
        s = df[col].dropna()
        if s.empty:
            continue
        earliest, latest = s.min(), s.max()
        console.print(f"Minimum Date: {earliest}")
        console.print(f"Maximum Date: {latest}")
        console.print(f"Date Range: {latest - earliest}")
        console.print(f"Missing Dates: {df[col].isnull().sum()}")
@@ -0,0 +1,104 @@
1
+ import pandas as pd
2
+ from rich.prompt import IntPrompt, Prompt, Confirm
3
+ from .utils import console, print_section, print_info
4
+
5
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df`` and print before/after statistics.

    Steps, in order:
      1. Strip surrounding whitespace from string cells.
      2. Drop duplicate rows (after stripping, so rows that differ only by
         whitespace are treated as duplicates).
      3. Fill nulls: median for numeric columns, most frequent value for
         object/categorical columns.
      4. Downcast int64/float64 columns to the smallest fitting dtype.
      5. Convert object columns whose sampled values parse as dates.

    Args:
        df: Input dataset; it is not modified.

    Returns:
        The cleaned DataFrame.
    """
    import warnings

    cleaned_df = df.copy()

    print_section("Data Cleaning")
    console.print(f"Before Cleaning Shape: {cleaned_df.shape}")
    console.print(f"Before Cleaning Missing Values: {cleaned_df.isnull().sum().sum()}")
    console.print(f"Before Cleaning Duplicate Count: {cleaned_df.duplicated().sum()}")

    for col in cleaned_df.select_dtypes(include=['object']).columns:
        try:
            stripped = cleaned_df[col].str.strip()
        except AttributeError:
            # The .str accessor refuses columns that hold no strings at all.
            continue
        # .str.strip() yields NaN for non-string cells; restore the original
        # value there so mixed-type columns do not lose data.
        cleaned_df[col] = stripped.where(stripped.notna(), cleaned_df[col])

    cleaned_df = cleaned_df.drop_duplicates()

    for col in cleaned_df.columns:
        column = cleaned_df[col]
        if not column.isnull().any():
            continue
        # Assign the result back instead of calling fillna(inplace=True) on a
        # column selection, which does not update the frame under pandas
        # copy-on-write.
        if pd.api.types.is_numeric_dtype(column):
            cleaned_df[col] = column.fillna(column.median())
        elif pd.api.types.is_object_dtype(column) or isinstance(column.dtype, pd.CategoricalDtype):
            modes = column.mode()
            if not modes.empty:
                cleaned_df[col] = column.fillna(modes.iloc[0])

    for col in cleaned_df.select_dtypes(include=['float64', 'int64']).columns:
        target = 'integer' if pd.api.types.is_integer_dtype(cleaned_df[col]) else 'float'
        try:
            cleaned_df[col] = pd.to_numeric(cleaned_df[col], downcast=target)
        except (ValueError, TypeError):
            # Downcasting is only an optimisation; keep the original dtype.
            continue

    for col in cleaned_df.select_dtypes(include=['object']).columns:
        non_null = cleaned_df[col].dropna()
        if non_null.empty:
            continue
        # A fixed seed makes date detection reproducible between runs.
        sample = non_null.sample(min(100, len(non_null)), random_state=42)
        try:
            with warnings.catch_warnings():
                # pandas may warn when it cannot infer a single date format.
                warnings.simplefilter("ignore", UserWarning)
                pd.to_datetime(sample)
                cleaned_df[col] = pd.to_datetime(cleaned_df[col], errors='coerce')
        except (ValueError, TypeError, OverflowError):
            # Not a date column; leave it as text.
            continue

    console.print(f"\nAfter Cleaning Shape: {cleaned_df.shape}")
    console.print(f"After Cleaning Missing Values: {cleaned_df.isnull().sum().sum()}")
    console.print(f"After Cleaning Duplicate Count: {cleaned_df.duplicated().sum()}")

    return cleaned_df
56
+
57
def interactive_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Let the user clean a copy of ``df`` through a repeating numbered menu.

    The menu is shown until the user picks option 6. Afterwards the user may
    save the result as CSV (without the index).

    Args:
        df: Input dataset; the operations are applied to a copy.

    Returns:
        The DataFrame after all chosen operations.
    """
    menu = (
        "1. Remove Null Values",
        "2. Remove Duplicate Values",
        "3. Remove Categorical Columns",
        "4. Remove Specific Column",
        "5. Remove Specific Row",
        "6. Finish Cleaning",
    )

    working = df.copy()

    while True:
        print_section("Interactive Data Cleaning")
        for entry in menu:
            console.print(entry)

        selected = IntPrompt.ask("Select an option", choices=["1", "2", "3", "4", "5", "6"])

        if selected == 6:
            break

        if selected == 1:
            working = working.dropna()
            console.print("[green]✔ Null values removed.[/green]")
        elif selected == 2:
            working = working.drop_duplicates()
            console.print("[green]✔ Duplicates removed.[/green]")
        elif selected == 3:
            text_cols = working.select_dtypes(include=['object', 'category']).columns
            working = working.drop(columns=text_cols)
            console.print(f"[green]✔ Categorical columns removed: {list(text_cols)}[/green]")
        elif selected == 4:
            col_name = Prompt.ask("Enter column name to remove")
            if col_name not in working.columns:
                console.print(f"[red]✖ Column '{col_name}' not found.[/red]")
            else:
                working = working.drop(columns=[col_name])
                console.print(f"[green]✔ Column '{col_name}' removed.[/green]")
        elif selected == 5:
            row_idx = IntPrompt.ask("Enter row index to remove")
            try:
                # drop(index=...) matches index labels, not positions.
                working = working.drop(index=row_idx)
                console.print(f"[green]✔ Row {row_idx} removed.[/green]")
            except Exception as e:
                console.print(f"[red]✖ Failed to remove row: {e}[/red]")

    if Confirm.ask("Do you want to save this new cleaned data to a file?"):
        save_path = Prompt.ask("Enter filename", default="cleaned_data.csv")
        working.to_csv(save_path, index=False)
        console.print(f"[green]✔ New data saved to {save_path}[/green]")

    return working
104
+
@@ -0,0 +1,94 @@
1
+ import click
2
+ import sys
3
+ import time
4
+ from .validator import load_and_validate
5
+ from .analyzer import run_analysis
6
+ from .cleaner import clean_data
7
+ from .visualizer import generate_visualizations
8
+ from .insights import generate_insights
9
+ from .utils import print_header, print_section, print_info
10
+
11
# Console entry point: parses the command line and hands the flags to analyze().
@click.command()
@click.argument('file_path', type=click.Path(exists=True))
@click.option('--clean', is_flag=True, help='Only perform data cleaning')
@click.option('--visualize', is_flag=True, help='Only perform data visualization')
@click.option('--all', 'run_all', is_flag=True, help='Run all modules (default behavior)')
def cli(file_path, clean, visualize, run_all):
    """AutoEDA: Automatic Exploratory Data Analysis & Cleaning"""
    # The docstring above is the --help text, so it is kept verbatim.
    analyze(file_path, clean=clean, visualize=visualize, run_all=run_all)
19
+
20
def analyze(file_path, clean=False, visualize=False, run_all=False):
    """Load a CSV file and run the requested AutoEDA stages on it.

    With no flags, or with ``run_all=True``, the full report runs: analysis,
    insights, visualization, then an optional interactive cleaning loop that
    can re-run the report on the cleaned data. ``clean=True`` alone performs
    only automatic cleaning. ``visualize=True`` alone performs only
    visualization. ``clean=True`` together with ``visualize=True`` cleans
    first and then visualizes the cleaned data.

    Args:
        file_path: Path of the CSV file to load.
        clean: Run automatic cleaning.
        visualize: Run the visualization menu.
        run_all: Run every stage; also implied when no other flag is set.

    Returns:
        The DataFrame in its final state (cleaned if cleaning ran).

    Raises:
        SystemExit: With exit code 1 when the file cannot be loaded.
    """
    # Imported lazily, as in the rest of this function's original code path.
    from rich.prompt import Confirm
    from .cleaner import interactive_clean
    from .utils import console

    # If no flags are provided, run all by default.
    if not clean and not visualize:
        run_all = True

    print_header("AutoEDA Complete Analysis Report")
    start_time = time.perf_counter()

    try:
        df, smart_mode = load_and_validate(file_path)
    except Exception as e:
        click.secho(f"Error loading file: {e}", fg='red')
        sys.exit(1)

    if smart_mode:
        console.print("\n[bold yellow]Large Dataset Detected[/bold yellow]")
        console.print(f"Rows: {len(df):,}")
        console.print(f"Columns: {len(df.columns)}")
        console.print("[bold green]Optimization Mode: Enabled[/bold green]")

    if clean and not run_all:
        # clean_data() prints its own "Data Cleaning" section header.
        df = clean_data(df)
        print_info("Cleaning complete.")
        if not visualize:
            return df
        # Both flags given: continue below to visualize the cleaned data.

    while True:
        if run_all:
            print_info("Analyzing dataset...")
            run_analysis(df, smart_mode)

            print_section("Insights")
            for idx, insight in enumerate(generate_insights(df), 1):
                print_info(f"{idx}. {insight}")

        if run_all or visualize:
            print_section("Visualization")
            generate_visualizations(df, smart_mode)
            print_info("Visualizations generated.")

        # The interactive clean-and-repeat loop is only offered for the full report.
        if not run_all:
            break
        if not Confirm.ask("\nDo you want to proceed to interactive data cleaning?"):
            break
        df = interactive_clean(df)
        if not Confirm.ask("\nDo you want to run the complete analysis again on the new data?"):
            break

    elapsed = time.perf_counter() - start_time
    console.print(f"\n[bold cyan]Processing Time:[/bold cyan] {elapsed:.2f} seconds")

    mem_usage_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
    console.print(f"[bold cyan]Final Memory Usage:[/bold cyan] {mem_usage_mb:.2f} MB")

    console.print("\n[bold green]Thank you for using this tool![/bold green]")
    return df
92
+
93
# Run the click command when this module is executed as a script.
if __name__ == '__main__':
    cli()
@@ -0,0 +1,52 @@
1
+ import pandas as pd
2
+ from typing import List
3
+
4
def generate_insights(df: pd.DataFrame) -> List[str]:
    """Build a list of plain-English observations about ``df``.

    Checks, in order: overall and per-column missing data (columns above 20%
    are called out), duplicate rows, numeric column pairs with |r| > 0.8,
    categorical columns where one value exceeds 80%, and a final
    machine-learning readiness remark.

    Args:
        df: Dataset to describe.

    Returns:
        The observations in the order listed above. A frame with no rows or
        no columns yields a single message saying so.
    """
    from itertools import combinations

    # Every percentage below divides by the row or cell count, so an empty
    # frame is handled up front.
    if df.empty:
        return ["Dataset is empty; no insights can be generated."]

    insights = []
    null_mask = df.isnull()

    # Missing values
    total_cells = df.shape[0] * df.shape[1]
    missing_pct = (int(null_mask.sum().sum()) / total_cells) * 100
    if missing_pct > 0:
        insights.append(f"Dataset contains {missing_pct:.2f}% missing values overall.")
    else:
        insights.append("Dataset is complete with 0% missing values.")

    for col, col_missing in (null_mask.mean() * 100).items():
        if col_missing > 20:
            insights.append(f"Column '{col}' has significant missing data ({col_missing:.2f}%).")

    # Duplicates
    dup_pct = (int(df.duplicated().sum()) / df.shape[0]) * 100
    if dup_pct > 0:
        insights.append(f"Dataset contains {dup_pct:.2f}% duplicate rows.")

    # Correlations (numerical)
    num_df = df.select_dtypes(include=['number'])
    if len(num_df.columns) > 1:
        corr = num_df.corr()
        # combinations() visits each unordered pair once, in the same order
        # as a nested upper-triangle loop.
        for (i, left), (j, right) in combinations(enumerate(corr.columns), 2):
            c_val = corr.iat[i, j]
            if pd.isna(c_val):
                continue
            if c_val > 0.8:
                insights.append(f"Strong positive correlation ({c_val:.2f}) exists between '{left}' and '{right}'.")
            elif c_val < -0.8:
                insights.append(f"Strong negative correlation ({c_val:.2f}) exists between '{left}' and '{right}'.")

    # Categorical dominance
    for col in df.select_dtypes(include=['object', 'category']).columns:
        val_counts = df[col].value_counts(normalize=True)
        # value_counts sorts descending, so position 0 is the most frequent value.
        if not val_counts.empty and val_counts.iloc[0] > 0.8:
            insights.append(f"Category '{val_counts.index[0]}' dominates the '{col}' column ({val_counts.iloc[0]*100:.2f}%).")

    # ML suitability
    if missing_pct < 5 and len(num_df.columns) >= 2 and len(df) > 100:
        insights.append("Dataset appears suitable for Machine Learning with standard preprocessing.")
    else:
        insights.append("Recommended preprocessing steps: Extensive data cleaning and imputation required before Machine Learning.")

    return insights
@@ -0,0 +1,17 @@
1
+ from rich.console import Console
2
+ from rich.panel import Panel
3
+
4
+ console = Console()
5
+
6
def print_header(title: str):
    """Print a blank line, then ``title`` upper-cased in a compact bold-cyan panel."""
    console.print()
    banner = Panel(title.upper(), style="bold cyan", expand=False)
    console.print(banner)
9
+
10
def print_section(title: str):
    """Print ``title`` as a bold-yellow ``--- title ---`` heading after a blank line."""
    heading = f"\n[bold yellow]--- {title} ---[/bold yellow]"
    console.print(heading)
12
+
13
def print_info(msg: str):
    """Print ``msg`` prefixed with a green check mark."""
    line = f"[green]✔[/green] {msg}"
    console.print(line)
15
+
16
def print_warning(msg: str):
    """Print ``msg`` prefixed with a bold-red exclamation mark."""
    line = f"[bold red]![/bold red] {msg}"
    console.print(line)
@@ -0,0 +1,33 @@
1
+ import os
2
+ import pandas as pd
3
+ from typing import Tuple
4
+ from .utils import print_warning
5
+
6
def load_and_validate(file_path: str) -> Tuple[pd.DataFrame, bool]:
    """Read a CSV file and decide whether "smart mode" should be enabled.

    Smart mode is switched on when the file is larger than 100 MB or the
    loaded frame has more than 100,000 rows.

    Args:
        file_path: Location of the CSV file, as a string or an
            ``os.PathLike`` object.

    Returns:
        A ``(dataframe, smart_mode)`` tuple.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the file cannot be parsed as CSV (the original error
            is attached as ``__cause__``), or the result has no columns or
            no rows.
    """
    # Normalise PathLike inputs so the string checks below work for both
    # str and pathlib.Path.
    path = os.fspath(file_path)

    if not os.path.exists(path):
        raise FileNotFoundError(f"File not found: {file_path}")

    if not path.lower().endswith(".csv"):
        print_warning("File does not end with .csv, attempting to read as CSV anyway.")

    file_size_mb = os.path.getsize(path) / (1024 * 1024)
    smart_mode = file_size_mb > 100

    try:
        df = pd.read_csv(path, low_memory=False)
    except Exception as e:
        raise ValueError(f"Failed to read CSV file. Error: {e}") from e

    # Check columns first: a frame without columns is also "empty", so the
    # more specific message would otherwise never be reached.
    if len(df.columns) == 0:
        raise ValueError("No columns found in the dataset.")

    if df.empty:
        raise ValueError("The dataset is empty.")

    if len(df) > 100000:
        smart_mode = True

    return df, smart_mode
@@ -0,0 +1,125 @@
1
+ import matplotlib.pyplot as plt
2
+ import seaborn as sns
3
+ import pandas as pd
4
+ import numpy as np
5
+ from rich.prompt import Confirm
6
+ from .utils import console, print_section
7
+
8
def generate_visualizations(df: pd.DataFrame, smart_mode: bool = False):
    """Ask which chart groups to draw and show them all in one figure.

    The user answers five yes/no prompts (missing values, categorical,
    numerical, correlation, datetime). The selected charts are laid out on a
    three-column grid and shown in a blocking window.

    Args:
        df: Dataset to plot.
        smart_mode: When True and ``df`` has more than 50,000 rows, a fixed
            random sample of 50,000 rows is plotted instead.
    """
    sns.set_theme(style="whitegrid")

    if smart_mode and len(df) > 50000:
        console.print(f"[yellow]Smart Mode: Sampling dataset from {len(df):,} to 50,000 rows for faster visualizations.[/yellow]")
        df = df.sample(50000, random_state=42)

    print_section("Interactive Visualization Menu")
    console.print("Select which graphs you want to generate:")

    show_missing = Confirm.ask("Show Missing Value Graphs?")
    show_categorical = Confirm.ask("Show Categorical Feature Graphs?")
    show_numerical = Confirm.ask("Show Numerical Feature Distributions?")
    show_correlation = Confirm.ask("Show Correlation Heatmap?")
    show_datetime = Confirm.ask("Show Datetime Charts?")

    # Each entry describes one subplot: its type plus whatever it needs to draw.
    plots = []

    if show_missing and df.isnull().sum().sum() > 0:
        plots.append({'type': 'missing_heatmap'})
        plots.append({'type': 'missing_bar'})

    if show_categorical:
        for col in df.select_dtypes(include=['object', 'category']).columns:
            counts = df[col].value_counts()
            if counts.empty:
                continue
            top = counts.head(10)
            plots.append({'type': 'cat_count', 'col': col, 'vc': top})
            # Test the full cardinality: a length check on the already
            # truncated top-10 would always pass. A pie chart is only drawn
            # when it can show every category.
            if len(counts) <= 10:
                plots.append({'type': 'cat_pie', 'col': col, 'vc': top})

    if show_numerical:
        for col in df.select_dtypes(include=['number']).columns:
            plots.append({'type': 'num_hist', 'col': col})
            plots.append({'type': 'num_kde', 'col': col})
            plots.append({'type': 'num_box', 'col': col})

    if show_correlation and len(df.select_dtypes(include=['number']).columns) > 1:
        plots.append({'type': 'correlation'})

    if show_datetime:
        # 'datetimetz' is needed to pick up timezone-aware columns as well.
        for col in df.select_dtypes(include=['datetime', 'datetimetz']).columns:
            plots.append({'type': 'datetime', 'col': col})

    n_plots = len(plots)
    if n_plots == 0:
        console.print("No plots selected or no data available to plot.")
        return

    cols = 3
    rows = (n_plots + cols - 1) // cols

    # squeeze=False always returns a 2-D array of Axes, so flatten() gives a
    # flat list of individual Axes for any plot count, including exactly one.
    fig, axes = plt.subplots(rows, cols, figsize=(18, 5 * rows), squeeze=False)
    axes = axes.flatten()

    for ax, plot_info in zip(axes, plots):
        ptype = plot_info['type']

        if ptype == 'missing_heatmap':
            sns.heatmap(df.isnull(), cbar=False, cmap='viridis', yticklabels=False, ax=ax)
            ax.set_title('Missing Value Heatmap')
        elif ptype == 'missing_bar':
            missing_counts = df.isnull().sum()
            missing_counts = missing_counts[missing_counts > 0]
            missing_counts.plot(kind='bar', color='salmon', ax=ax)
            ax.set_title('Missing Values per Column')
            ax.tick_params(axis='x', rotation=45)
        elif ptype == 'cat_count':
            col = plot_info['col']
            sns.countplot(y=col, data=df, order=plot_info['vc'].index, palette='Set2', ax=ax)
            ax.set_title(f'Count Plot: {col}')
        elif ptype == 'cat_pie':
            col = plot_info['col']
            vc = plot_info['vc']
            ax.pie(vc.values, labels=vc.index, autopct='%1.1f%%', startangle=90, colors=sns.color_palette('Set2'))
            ax.set_title(f'Pie Chart: {col}')
        elif ptype == 'num_hist':
            col = plot_info['col']
            sns.histplot(df[col].dropna(), kde=False, ax=ax, color='skyblue')
            ax.set_title(f'Histogram: {col}')
        elif ptype == 'num_kde':
            col = plot_info['col']
            sns.kdeplot(df[col].dropna(), ax=ax, color='orange', fill=True)
            ax.set_title(f'KDE Plot: {col}')
        elif ptype == 'num_box':
            col = plot_info['col']
            sns.boxplot(x=df[col].dropna(), ax=ax, color='lightgreen')
            ax.set_title(f'Boxplot: {col}')
        elif ptype == 'correlation':
            corr = df.select_dtypes(include=['number']).corr()
            sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', square=True, ax=ax)
            ax.set_title('Correlation Heatmap')
        elif ptype == 'datetime':
            col = plot_info['col']
            # Number of records per calendar day, in chronological order.
            vc = df[col].dropna().dt.date.value_counts().sort_index()
            vc.plot(kind='line', color='purple', ax=ax)
            ax.set_title(f'Time Series: {col}')
            ax.tick_params(axis='x', rotation=45)

    # Hide any unused subplots in the last grid row.
    for unused in axes[n_plots:]:
        fig.delaxes(unused)

    plt.tight_layout()
    console.print("[green]Opening all selected plots in a single window. Close the window to continue...[/green]")
    plt.show(block=True)
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)
@@ -0,0 +1,60 @@
1
+ Metadata-Version: 2.4
2
+ Name: data-autoeda
3
+ Version: 0.1.1
4
+ Summary: Automatic Exploratory Data Analysis, Cleaning, Validation, Visualization, and Smart Insights on ANY CSV dataset.
5
+ Author: shubham kumar
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/Shu40/AutoEDA/
8
+ Project-URL: Repository, https://github.com/Shu40/AutoEDA/
9
+ Requires-Python: >=3.8
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: pandas>=1.3.0
12
+ Requires-Dist: numpy>=1.20.0
13
+ Requires-Dist: matplotlib>=3.4.0
14
+ Requires-Dist: seaborn>=0.11.0
15
+ Requires-Dist: scipy>=1.7.0
16
+ Requires-Dist: click>=8.0.0
17
+ Requires-Dist: rich>=10.0.0
18
+
19
+ # AutoEDA
20
+
21
+ A production-ready Python package that performs automatic Exploratory Data Analysis (EDA), Data Cleaning, Data Validation, Visualization, and Smart Insights on ANY CSV dataset.
22
+
23
+ ## Features
24
+ - **Dynamic Analysis**: Automatically detects numerical, categorical, boolean, and datetime columns.
25
+ - **Smart Cleaning**: Handles missing values, removes duplicates, and optimizes datatypes automatically.
26
+ - **Visualizations**: Automatically generates relevant charts using Matplotlib and Seaborn.
27
+ - **Smart Insights**: Generates English observations based on statistical findings.
28
+ - **Rich Terminal UI**: Beautiful, organized CLI reports.
29
+
30
+ ## Installation
31
+
32
+ ```bash
33
+ pip install .
34
+ ```
35
+
36
+ ## Usage
37
+
38
+ ### Python API
39
+
40
+ ```python
41
+ from autoeda import analyze
42
+
43
+ # Complete analysis
44
+ analyze("data.csv")
45
+
46
+ # Only clean data
47
+ # analyze("data.csv", clean=True)
48
+ ```
49
+
50
+ ### CLI
51
+
52
+ ```bash
53
+ # Complete analysis
54
+ autoeda data.csv
55
+
56
+ # Specific modes
57
+ autoeda data.csv --clean
58
+ autoeda data.csv --visualize
59
+ autoeda data.csv --all
60
+ ```
@@ -0,0 +1,16 @@
1
+ README.md
2
+ pyproject.toml
3
+ autoeda/__init__.py
4
+ autoeda/analyzer.py
5
+ autoeda/cleaner.py
6
+ autoeda/cli.py
7
+ autoeda/insights.py
8
+ autoeda/utils.py
9
+ autoeda/validator.py
10
+ autoeda/visualizer.py
11
+ data_autoeda.egg-info/PKG-INFO
12
+ data_autoeda.egg-info/SOURCES.txt
13
+ data_autoeda.egg-info/dependency_links.txt
14
+ data_autoeda.egg-info/entry_points.txt
15
+ data_autoeda.egg-info/requires.txt
16
+ data_autoeda.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ autoeda = autoeda.cli:cli
@@ -0,0 +1,7 @@
1
+ pandas>=1.3.0
2
+ numpy>=1.20.0
3
+ matplotlib>=3.4.0
4
+ seaborn>=0.11.0
5
+ scipy>=1.7.0
6
+ click>=8.0.0
7
+ rich>=10.0.0
@@ -0,0 +1 @@
1
+ autoeda
@@ -0,0 +1,29 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "data-autoeda"
7
+ version = "0.1.1"
8
+ description = "Automatic Exploratory Data Analysis, Cleaning, Validation, Visualization, and Smart Insights on ANY CSV dataset."
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ license = { text = "MIT" }
12
+ authors = [
13
+ { name = "shubham kumar" }
14
+ ]
15
+ dependencies = [
16
+ "pandas>=1.3.0",
17
+ "numpy>=1.20.0",
18
+ "matplotlib>=3.4.0",
19
+ "seaborn>=0.11.0",
20
+ "scipy>=1.7.0",
21
+ "click>=8.0.0",
22
+ "rich>=10.0.0"
23
+ ]
24
+
25
+ [project.scripts]
26
+ autoeda = "autoeda.cli:cli"
27
+ [project.urls]
28
+ Homepage = "https://github.com/Shu40/AutoEDA/"
29
+ Repository = "https://github.com/Shu40/AutoEDA/"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+