tom-analytics 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,46 @@
1
+ Metadata-Version: 2.4
2
+ Name: tom-analytics
3
+ Version: 0.1.0
4
+ Summary: A one-line autonomous exploratory data analysis (EDA) and reporting library.
5
+ License: MIT
6
+ Requires-Python: >=3.8
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: pandas
9
+ Requires-Dist: numpy
10
+ Requires-Dist: matplotlib
11
+ Requires-Dist: seaborn
12
+ Requires-Dist: scipy
13
+ Requires-Dist: scikit-learn
14
+ Requires-Dist: rich
15
+ Requires-Dist: plotly
16
+ Requires-Dist: kaleido
17
+ Requires-Dist: jinja2
18
+ Requires-Dist: tabulate
19
+
20
+ # 🚀 tom — One-Line Data Analytics Library
21
+
22
+ `tom` is a highly autonomous, zero-configuration Python library that delivers end-to-end exploratory data analysis (EDA) in a single line.
23
+
24
+ ## ✨ Features
25
+ - **Smart Loading (`om.file()`)**: Automatic format, encoding, and delimiter detection for `.csv`, `.xlsx`, `.xls`, `.json`, `.parquet`, `.tsv`, and `.txt`.
26
+ - **Auto Data Preprocessing (`om.clean_report()`)**: Type coercion (dates, numerical objects), median/mode missing value imputation, and duplicate handling.
27
+ - **Statistical Analysis Engine (`om.stats`)**: Normality tests, correlation matrices, Chi-Square associations, and ANOVA tests.
28
+ - **Dazzling Data Visualizations (`om.charts`)**: Static high-res Seaborn charts + portable, interactive Plotly dashboard.
29
+ - **NLP Insights (`om.insights`)**: Readable plain-English suggestions and red flags.
30
+ - **Premium Reporting (`om.describe()`)**: Rich terminal prints and self-contained glassmorphism HTML pages.
31
+
32
+ ## 📦 Installation
33
+ ```bash
34
+ pip install -e .
35
+ ```
36
+
37
+ ## 🛠️ Usage
38
+ ```python
39
+ import tom as om
40
+
41
+ # Load a dataset by path (call om.file() with no path to be prompted for one in the terminal)
42
+ om.file("dataset.csv")
43
+
44
+ # Generate beautiful, high-quality, comprehensive reports in one go!
45
+ om.describe()
46
+ ```
@@ -0,0 +1,27 @@
1
+ # 🚀 tom — One-Line Data Analytics Library
2
+
3
+ `tom` is a highly autonomous, zero-configuration Python library that delivers end-to-end exploratory data analysis (EDA) in a single line.
4
+
5
+ ## ✨ Features
6
+ - **Smart Loading (`om.file()`)**: Automatic format, encoding, and delimiter detection for `.csv`, `.xlsx`, `.xls`, `.json`, `.parquet`, `.tsv`, and `.txt`.
7
+ - **Auto Data Preprocessing (`om.clean_report()`)**: Type coercion (dates, numerical objects), median/mode missing value imputation, and duplicate handling.
8
+ - **Statistical Analysis Engine (`om.stats`)**: Normality tests, correlation matrices, Chi-Square associations, and ANOVA tests.
9
+ - **Dazzling Data Visualizations (`om.charts`)**: Static high-res Seaborn charts + portable, interactive Plotly dashboard.
10
+ - **NLP Insights (`om.insights`)**: Readable plain-English suggestions and red flags.
11
+ - **Premium Reporting (`om.describe()`)**: Rich terminal prints and self-contained glassmorphism HTML pages.
12
+
13
+ ## 📦 Installation
14
+ ```bash
15
+ pip install -e .
16
+ ```
17
+
18
+ ## 🛠️ Usage
19
+ ```python
20
+ import tom as om
21
+
22
+ # Load a dataset by path (call om.file() with no path to be prompted for one in the terminal)
23
+ om.file("dataset.csv")
24
+
25
+ # Generate beautiful, high-quality, comprehensive reports in one go!
26
+ om.describe()
27
+ ```
@@ -0,0 +1,28 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "tom-analytics"
7
+ version = "0.1.0"
8
+ description = "A one-line autonomous exploratory data analysis (EDA) and reporting library."
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ license = {text = "MIT"}
12
+ dependencies = [
13
+ "pandas",
14
+ "numpy",
15
+ "matplotlib",
16
+ "seaborn",
17
+ "scipy",
18
+ "scikit-learn",
19
+ "rich",
20
+ "plotly",
21
+ "kaleido",
22
+ "jinja2",
23
+ "tabulate"
24
+ ]
25
+
26
+ [tool.setuptools.packages.find]
27
+ where = ["."]
28
+ include = ["tom*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,277 @@
1
+ import os
2
+ import sys
3
+
4
# Switch the standard output streams to UTF-8 so that emoji and other
# non-ASCII characters do not trigger encoding errors on Windows consoles.
for _stream in (sys.stdout, sys.stderr):
    if hasattr(_stream, 'reconfigure'):
        try:
            _stream.reconfigure(encoding='utf-8')
        except Exception:
            # Best effort only: a stream that refuses to be reconfigured
            # is left with its current encoding.
            pass
del _stream
15
+
16
+ import pandas as pd
17
+ import numpy as np
18
+ from rich import print as rprint
19
+ from rich.console import Console
20
+ from rich.panel import Panel
21
+ from rich.table import Table
22
+
23
# Shared Rich console used by the helpers in this module to print tables and panels.
console = Console()

# Global package state: both stay None until set_active_df() stores a dataset.
_active_df = _active_path = None
28
+
29
def set_active_df(df: pd.DataFrame, path: str = None):
    """Store ``df`` as the package-wide active dataset.

    The recorded source path is replaced only when a truthy ``path`` is
    given; otherwise whatever path was stored before is left untouched.
    """
    global _active_df, _active_path
    if path:
        _active_path = path
    _active_df = df
34
+
35
def get_active_df() -> pd.DataFrame:
    """Return the dataset currently stored as active (``None`` if nothing was set)."""
    # Reading a module-level name needs no ``global`` declaration.
    return _active_df
38
+
39
def get_active_path() -> str:
    """Return the path recorded for the active dataset (``None`` if none was stored)."""
    # Reading a module-level name needs no ``global`` declaration.
    return _active_path
42
+
43
+ # Public API Functions
44
+ from tom.loader import file
45
+ from tom.reporter import describe
46
+ from tom.charts import chart
47
+
48
def clean_report(df: pd.DataFrame = None):
    """Print only the data-cleaning summary for a dataset.

    When ``df`` is omitted the active dataset is used. If neither is
    available an error message is printed and nothing else happens.
    """
    frame = _active_df if df is None else df
    if frame is None:
        rprint("[bold red]❌ Error: No dataset loaded. Run om.file() first.[/bold red]")
        return

    from tom.cleaner import clean_data, print_clean_report

    # clean_data() yields four values; the cleaned frame itself is not
    # needed for the printed report, so it is discarded.
    _, summary_info, warnings, dup_rows_removed = clean_data(frame)
    print_clean_report(summary_info, warnings, dup_rows_removed)
59
+
60
def correlation(df: pd.DataFrame = None):
    """Print the correlation matrix of the numerical columns and save a heatmap.

    Args:
        df: Dataset to analyze. Defaults to the active dataset.

    Returns:
        None. The matrix is printed as a Rich table and the heatmap is
        written to ``./tom_report/charts/single_correlation_heatmap.png``.
        If no dataset is available, or fewer than two numerical columns
        are detected, a message is printed and nothing is saved.
    """
    if df is None:
        df = _active_df
    if df is None:
        rprint("[bold red]❌ Error: No dataset loaded. Run om.file() first.[/bold red]")
        return

    import tom.utils as utils

    col_types = utils.detect_column_types(df)
    num_cols = [c for c, t in col_types.items() if t == 'numerical']

    if len(num_cols) < 2:
        rprint("[bold yellow]⚠️ Not enough numerical columns to compute correlations.[/bold yellow]")
        return

    corr_matrix = df[num_cols].corr()

    rprint("\n[bold cyan]🔗 Pearson Correlation Matrix[/bold cyan]")
    table = Table(show_header=True, header_style="bold magenta", box=None)
    table.add_column("Feature", style="bold white")
    for col in num_cols:
        # Rich tables only accept strings/renderables, so labels that are
        # not strings (e.g. integer column names) are converted first.
        table.add_column(str(col), justify="right")

    for index, row in corr_matrix.iterrows():
        cells = [f"{val:.2f}" if pd.notna(val) else "N/A" for val in row]
        table.add_row(str(index), *cells)

    console.print(table)

    # Save correlation heatmap plot
    import matplotlib.pyplot as plt
    import seaborn as sns

    fig, ax = plt.subplots(figsize=(6, 5))
    try:
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1, square=True, ax=ax)
        ax.set_title("Pearson Correlation Heatmap")

        utils.ensure_dir("./tom_report/charts")
        path = "./tom_report/charts/single_correlation_heatmap.png"
        fig.savefig(path, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)
    rprint(f"[bold green]✅ Heatmap saved to: {os.path.abspath(path)}[/bold green]")
102
+
103
def outliers(df: pd.DataFrame = None):
    """Print an IQR-based outlier report for every numerical column.

    For each numerical column the limits are ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR``; values outside them are counted as outliers.
    Columns with no non-null values, or whose IQR is not positive, are
    left out of the table.

    Args:
        df: Dataset to analyze. Defaults to the active dataset.

    Returns:
        None. The report is printed to the terminal.
    """
    if df is None:
        df = _active_df
    if df is None:
        rprint("[bold red]❌ Error: No dataset loaded. Run om.file() first.[/bold red]")
        return

    import tom.utils as utils

    col_types = utils.detect_column_types(df)
    num_cols = [c for c, t in col_types.items() if t == 'numerical']

    if not num_cols:
        rprint("[bold yellow]⚠️ No numerical columns found in the dataset.[/bold yellow]")
        return

    rprint("\n[bold cyan]🚨 Detailed Outlier Analysis (IQR Method)[/bold cyan]")
    table = Table(show_header=True, header_style="bold magenta", box=None)
    table.add_column("Column", style="bold white")
    table.add_column("Lower Limit", justify="right")
    table.add_column("Upper Limit", justify="right")
    table.add_column("Min Outlier", justify="right", style="red")
    table.add_column("Max Outlier", justify="right", style="red")
    table.add_column("Outlier Count", justify="right", style="magenta")
    table.add_column("Outliers %", justify="right", style="bold red")

    for col in num_cols:
        series = df[col].dropna()
        if len(series) == 0:
            continue
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        if not iqr > 0:
            # With a zero IQR both limits equal Q1, so no row is reported.
            continue

        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        outlier_series = series[(series < lower) | (series > upper)]
        count = len(outlier_series)
        pct = (count / len(series)) * 100

        min_out = f"{outlier_series.min():.2f}" if count > 0 else "-"
        max_out = f"{outlier_series.max():.2f}" if count > 0 else "-"

        table.add_row(
            # Rich rejects non-string cells, so integer or other
            # non-string column labels are converted explicitly.
            str(col),
            f"{lower:.2f}",
            f"{upper:.2f}",
            min_out,
            max_out,
            str(count),
            f"{pct:.1f}%",
        )

    console.print(table)
158
+
159
def suggest(df: pd.DataFrame = None):
    """Print heuristic ML model suggestions for the active or provided dataset.

    The target column is guessed as the first column that
    ``tom.utils.detect_column_types`` labels ``'categorical'`` and that is
    not the first column of the frame; if there is none, the last column
    is used. A ``'categorical'`` or ``'text_id'`` target with at most 20
    distinct values produces classification advice; any other target
    produces regression advice.

    Args:
        df: Dataset to analyze. Defaults to the active dataset.

    Returns:
        None. The suggestion card is printed to the terminal.
    """
    if df is None:
        df = _active_df
    if df is None:
        rprint("[bold red]❌ Error: No dataset loaded. Run om.file() first.[/bold red]")
        return
    if len(df.columns) == 0:
        # Picking df.columns[-1] below would raise IndexError on such a frame.
        rprint("[bold yellow]⚠️ The dataset has no columns to analyze.[/bold yellow]")
        return

    rprint("\n[bold cyan]🤖 Autonomous Machine Learning Model Suggestions[/bold cyan]")

    import tom.utils as utils

    col_types = utils.detect_column_types(df)

    num_c = sum(1 for t in col_types.values() if t == 'numerical')
    cat_c = sum(1 for t in col_types.values() if t == 'categorical')

    # Simple target heuristic: first categorical column other than the
    # first column of the frame, otherwise the last column.
    target_col = df.columns[-1]
    for col, ctype in col_types.items():
        if ctype == 'categorical' and col != df.columns[0]:
            target_col = col
            break

    target_type = col_types.get(target_col, 'numerical')

    # Collect the card text piece by piece and join once at the end.
    parts = [
        f"[bold white]Target Column Detected:[/bold white] `{target_col}` (Type: [bold yellow]{target_type.upper()}[/bold yellow])\n\n"
    ]

    if target_type in ['categorical', 'text_id'] and df[target_col].nunique() <= 20:
        parts += [
            "[bold green]🎯 Task Classification Suggested[/bold green]\n",
            "Since your target column is categorical with discrete classes.\n\n",
            "[bold white]Candidate Models to Try:[/bold white]\n",
            " 1. [bold cyan]Random Forest Classifier[/bold cyan] (Great baseline, robust to scaling & outliers)\n",
            " 2. [bold cyan]XGBoost / LightGBM[/bold cyan] (State-of-the-art for tabular data, handles missing values)\n",
            " 3. [bold cyan]Logistic Regression[/bold cyan] (Good for linear interpretability)\n\n",
            "[bold white]Pre-processing Advice:[/bold white]\n",
            f" - Apply Target Encoding or One-Hot Encoding to the {cat_c} categorical features.\n",
        ]
        if num_c > 0:
            parts.append(" - Standardize/Normalize numeric fields if utilizing Logistic Regression.")
    else:
        parts += [
            "[bold green]📈 Task Regression Suggested[/bold green]\n",
            "Since your target column is continuous / numerical.\n\n",
            "[bold white]Candidate Models to Try:[/bold white]\n",
            " 1. [bold cyan]Random Forest Regressor[/bold cyan] (Captures non-linear dependencies cleanly)\n",
            " 2. [bold cyan]Ridge Regression / Lasso[/bold cyan] (Excellent linear model with regularization)\n",
            " 3. [bold cyan]Gradient Boosting Regressor[/bold cyan] (High prediction accuracy)\n\n",
            "[bold white]Pre-processing Advice:[/bold white]\n",
            " - Address severe right/left-skewed features with log/Box-Cox transform.\n",
            " - Remove outliers or use robust scaling techniques.",
        ]

    console.print(Panel("".join(parts), title="🧠 ML Model Suggestions", border_style="purple"))
211
+
212
def compare(df2: pd.DataFrame, df1: pd.DataFrame = None):
    """Print a side-by-side comparison of two DataFrames.

    Reports shape, total missing values, duplicate rows, and how many
    column names are shared or unique to each dataset, followed by a
    preview of up to eight column names per group.

    Args:
        df2: Dataset to compare against.
        df1: Reference dataset. Defaults to the active dataset.

    Returns:
        None. The comparison is printed to the terminal.
    """
    if df1 is None:
        df1 = _active_df
    if df1 is None:
        rprint("[bold red]❌ Error: Active dataset is missing. Run om.file() first.[/bold red]")
        return

    def _preview(columns, limit=8):
        # List at most ``limit`` names; the ellipsis is added only when
        # some names were actually left out.
        shown = ', '.join(f'`{c}`' for c in columns[:limit])
        return f"{shown}..." if len(columns) > limit else shown

    rprint("\n[bold cyan]🔄 Side-by-Side Dataset Comparison[/bold cyan]")

    table = Table(box=None, header_style="bold magenta")
    table.add_column("Metric", style="bold white")
    table.add_column("Dataset 1 (Active)", style="cyan")
    table.add_column("Dataset 2", style="green")

    table.add_row("Dimensions (Rows, Cols)", f"{df1.shape}", f"{df2.shape}")
    table.add_row("Total Missing Values", f"{df1.isnull().sum().sum()}", f"{df2.isnull().sum().sum()}")
    table.add_row("Duplicate Rows", f"{df1.duplicated().sum()}", f"{df2.duplicated().sum()}")

    # De-duplicated column labels in their original order, so the previews
    # are deterministic instead of following set iteration order.
    ordered1 = list(dict.fromkeys(df1.columns))
    ordered2 = list(dict.fromkeys(df2.columns))
    cols1 = set(ordered1)
    cols2 = set(ordered2)

    overlapping = [c for c in ordered1 if c in cols2]
    unique_to_1 = [c for c in ordered1 if c not in cols2]
    unique_to_2 = [c for c in ordered2 if c not in cols1]

    table.add_row("Overlapping Columns", str(len(overlapping)), str(len(overlapping)))
    table.add_row("Unique Columns", str(len(unique_to_1)), str(len(unique_to_2)))

    console.print(table)

    if overlapping:
        rprint(f"[bold yellow]Common Columns:[/bold yellow] {_preview(overlapping)}")
    if unique_to_1:
        rprint(f"[bold yellow]Unique to Dataset 1:[/bold yellow] {_preview(unique_to_1)}")
    if unique_to_2:
        rprint(f"[bold yellow]Unique to Dataset 2:[/bold yellow] {_preview(unique_to_2)}")
249
+
250
def export(format_type: str = "pdf"):
    """Export the generated HTML report to another format.

    Only ``"pdf"`` is supported. The conversion reads
    ``./tom_report/report.html`` and writes ``./tom_report/report.pdf``
    through the optional ``pdfkit`` package.

    Args:
        format_type: Target format name; compared case-insensitively.

    Returns:
        None. Success and failure are both reported by printing to the
        terminal; no exception is propagated from the conversion step.
    """
    format_type = format_type.lower()
    html_path = "./tom_report/report.html"

    if not os.path.exists(html_path):
        rprint("[bold red]❌ Error: Generate a report first by running om.describe().[/bold red]")
        return

    if format_type == "pdf":
        try:
            import pdfkit
            pdf_path = "./tom_report/report.pdf"
            rprint("[bold cyan]Generating PDF Report using pdfkit...[/bold cyan]")
            pdfkit.from_file(html_path, pdf_path)
            rprint(f"[bold green]🎉 Success! Report exported to PDF: {os.path.abspath(pdf_path)}[/bold green]")
        except ImportError:
            rprint("[bold red]❌ Error: 'pdfkit' is not installed.[/bold red]")
            rprint(" [yellow]Please run: pip install pdfkit[/yellow]")
        except Exception as e:
            from rich.markup import escape

            rprint("[bold red]❌ Failed to export PDF.[/bold red]")
            # Show the underlying error instead of discarding it; escape it
            # so brackets in the message are not parsed as Rich markup.
            rprint(f" [yellow]Reason: {escape(str(e))}[/yellow]")
            rprint(" [yellow]Note: pdfkit requires wkhtmltopdf binary installed on your system PATH.[/yellow]")
            rprint(" [yellow]You can alternatively print the HTML report in your browser (Ctrl+P) and Save as PDF![/yellow]")
    else:
        rprint(f"[bold red]❌ Unsupported export format: '{format_type}'[/bold red]")