eda-k 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
eda_k/__init__.py ADDED
@@ -0,0 +1,218 @@
1
+ """
2
+ eda_k
3
+ =====
4
+ A local, no-API-key automated EDA toolkit.
5
+
6
+ Quick start
7
+ -----------
8
+ import eda_k
9
+
10
+ result = eda_k.analyze("data.csv")
11
+ result.summary() # quick text summary
12
+ result.to_html("report.html") # full self-contained HTML report
13
+ result.to_csv_zip("tables.zip") # all summary tables as a ZIP of CSVs
14
+ result.ask("which columns have missing values?")
15
+
16
+ Lower-level building blocks (`eda_engine`, `charts`, `chat_assistant`,
17
+ `report_builder`) remain available as submodules for anyone who wants
18
+ finer-grained control, e.g.:
19
+
20
+ from eda_k import eda_engine, charts
21
+ df = eda_engine.load_file(open("data.csv", "rb"), "data.csv")
22
+ results = eda_engine.run_full_eda(df)
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import io
28
+ import zipfile
29
+ from pathlib import Path
30
+ from typing import Optional, Union
31
+
32
+ import pandas as pd
33
+
34
+ from . import charts, chat_assistant, eda_engine, report_builder
35
+ from .chat_assistant import SUGGESTED_QUESTIONS, answer_question
36
+ from .eda_engine import load_file, run_full_eda
37
+
38
+ __all__ = [
39
+ "analyze",
40
+ "EDAResult",
41
+ "eda_engine",
42
+ "charts",
43
+ "chat_assistant",
44
+ "report_builder",
45
+ "load_file",
46
+ "run_full_eda",
47
+ "answer_question",
48
+ "SUGGESTED_QUESTIONS",
49
+ ]
50
+
51
+ __version__ = "0.1.0"
52
+
53
+
54
class EDAResult:
    """
    Convenience wrapper bundling a DataFrame with its EDA results.

    Returned by `eda_k.analyze(...)`. Wraps the same `results` dict
    produced by `eda_engine.run_full_eda`, plus helpers for reporting,
    exporting, and Q&A — so you don't need Streamlit to get value out
    of this library.

    Attributes
    ----------
    df : pd.DataFrame
        The analysed data.
    results : dict
        The EDA results dict; the methods below read the keys
        ``overview``, ``missing_summary``, ``correlation``,
        ``dtype_table``, ``numeric_summary``, ``outliers``,
        ``top_correlations`` and ``categorical_summary`` from it.
    name : str
        Display name used in ``repr`` and as the report filename.
    """

    def __init__(self, df: pd.DataFrame, results: dict, name: str = "dataset"):
        self.df = df
        self.results = results
        self.name = name

    # -- introspection -----------------------------------------------------

    def __repr__(self) -> str:
        """Return a one-line description built from ``results["overview"]``."""
        ov = self.results["overview"]
        return (
            f"<EDAResult '{self.name}' rows={ov['n_rows']:,} "
            f"cols={ov['n_cols']:,} missing={ov['missing_pct']}% "
            f"duplicates={ov['duplicate_rows']:,}>"
        )

    def summary(self) -> str:
        """Return a short human-readable text summary."""
        # The summary is produced by asking the Q&A helper a fixed question.
        return answer_question("give me a summary of this dataset", self.df, self.results)

    def ask(self, question: str) -> str:
        """Ask a natural-language question about the dataset."""
        return answer_question(question, self.df, self.results)

    # -- figures -------------------------------------------------------------

    def build_figures(self, top_n_cats: int = 10) -> dict:
        """
        Build the full set of Plotly figures used in the HTML report.

        Returns a dict with the keys ``missing_bar``, ``histograms``,
        ``boxplots``, ``categorical_bars`` and ``corr_heatmap``. The
        per-column entries are dicts keyed by column name. Individual
        values may be ``None`` when the chart helper has nothing to plot;
        ``corr_heatmap`` is ``None`` when the correlation table is empty.
        """
        ov = self.results["overview"]
        correlation = self.results["correlation"]
        return {
            "missing_bar": charts.missing_values_bar(self.results["missing_summary"]),
            "histograms": {c: charts.histogram(self.df, c) for c in ov["numeric_cols"]},
            "boxplots": {c: charts.boxplot(self.df, c) for c in ov["numeric_cols"]},
            "categorical_bars": {
                c: charts.bar_categorical(self.df, c, top_n=top_n_cats)
                for c in ov["categorical_cols"]
            },
            "corr_heatmap": (
                charts.correlation_heatmap(correlation)
                if not correlation.empty
                else None
            ),
        }

    # -- export --------------------------------------------------------------

    def to_html(
        self,
        path: Optional[Union[str, Path]] = None,
        include_advanced_stats: bool = True,
        top_n_cats: int = 10,
    ) -> str:
        """
        Build the self-contained HTML report.

        If `path` is given, also writes the report to disk (UTF-8).
        Always returns the HTML string.
        """
        figures = self.build_figures(top_n_cats=top_n_cats)
        html_str = report_builder.build_html_report(
            self.df,
            self.results,
            figures,
            filename=self.name,
            include_advanced_stats=include_advanced_stats,
        )
        if path is not None:
            Path(path).write_text(html_str, encoding="utf-8")
        return html_str

    def to_csv_zip(self, path: Optional[Union[str, Path]] = None) -> bytes:
        """
        Build a ZIP archive of every summary table as CSV.

        One ``categorical_<column>.csv`` entry is written per categorical
        column. Column labels are converted to text and every
        non-alphanumeric character is replaced by ``_``. When two columns
        map to the same entry name, later ones get a numeric suffix
        (``_2``, ``_3``, ...) so no table is shadowed inside the archive.

        If `path` is given, also writes the ZIP to disk.
        Always returns the ZIP bytes.
        """
        results = self.results
        buf = io.BytesIO()
        with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
            zf.writestr("dtype_summary.csv", results["dtype_table"].to_csv(index=False))
            zf.writestr("missing_values.csv", results["missing_summary"].to_csv(index=False))
            zf.writestr("numeric_summary.csv", results["numeric_summary"].to_csv(index=False))
            zf.writestr("outliers.csv", results["outliers"].to_csv(index=False))
            if not results["correlation"].empty:
                # The correlation matrix keeps its index: the row labels are
                # the column names being correlated.
                zf.writestr("correlation_matrix.csv", results["correlation"].to_csv())
                zf.writestr("top_correlations.csv", results["top_correlations"].to_csv(index=False))

            used_entries = set()
            for col, info in results["categorical_summary"].items():
                # str() first: column labels are not guaranteed to be strings.
                safe = "".join(ch if ch.isalnum() else "_" for ch in str(col))
                entry = f"categorical_{safe}.csv"
                suffix = 2
                # Different labels can sanitise to the same text ("a b" vs
                # "a_b"); keep every table by giving later ones a suffix.
                while entry in used_entries:
                    entry = f"categorical_{safe}_{suffix}.csv"
                    suffix += 1
                used_entries.add(entry)
                zf.writestr(entry, info["top_values"].to_csv(index=False))
        data = buf.getvalue()
        if path is not None:
            Path(path).write_bytes(data)
        return data
157
+
158
+
159
def analyze(
    source: Union[str, Path, "io.IOBase", pd.DataFrame],
    filename: Optional[str] = None,
    outlier_method: str = "Both",
    correlation_method: str = "pearson",
    max_sample_size: int = 5000,
) -> EDAResult:
    """
    Run a full automated EDA pipeline on a file path, file-like object,
    or an existing DataFrame, and return an `EDAResult`.

    Parameters
    ----------
    source : str | Path | os.PathLike | file-like | pd.DataFrame
        A path to a CSV/TSV/TXT/XLSX/XLS/JSON/Parquet file, an open
        file-like object, or an already-loaded DataFrame. A DataFrame is
        copied, so the caller's object is not shared with the result.
    filename : str, optional
        Used for display/report naming and (when `source` is a path-like
        string/Path) to infer the file type. Required when `source` is a
        file-like object without a `.name` attribute and not a DataFrame.
    outlier_method : {"IQR", "Z-score", "Both"}
    correlation_method : {"pearson", "spearman", "kendall"}
    max_sample_size : int
        Max rows sampled for the Shapiro-Wilk normality test.

    Raises
    ------
    ValueError
        If a file-like `source` has no usable name and `filename` is not
        given, or if the loaded DataFrame is empty.

    Examples
    --------
    >>> result = eda_k.analyze("data.csv")
    >>> result.summary()
    >>> result.to_html("report.html")
    """
    # Function-scope import: only needed for the path-like check below.
    import os

    if isinstance(source, pd.DataFrame):
        df = source.copy()
        name = filename or "dataset"
    elif isinstance(source, (str, os.PathLike)):
        # os.PathLike covers pathlib.Path as well as any other object
        # implementing __fspath__.
        path = Path(source)
        name = filename or path.name
        with open(path, "rb") as f:
            df = load_file(f, name)
    else:
        # file-like object
        name = filename or getattr(source, "name", None)
        if not name:
            raise ValueError(
                "filename is required when passing a file-like object "
                "without a '.name' attribute"
            )
        df = load_file(source, name)

    if df.empty:
        raise ValueError(f"'{name}' loaded but contains no rows.")

    results = run_full_eda(
        df,
        outlier_method=outlier_method,
        correlation_method=correlation_method,
        max_sample_size=max_sample_size,
    )
    return EDAResult(df, results, name=name)
eda_k/charts.py ADDED
@@ -0,0 +1,428 @@
1
+ """
2
+ charts.py
3
+ All chart-building functions, using Plotly so charts are interactive
4
+ in Streamlit AND can be embedded as static images in the HTML report.
5
+ """
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ import plotly.express as px
10
+ import plotly.graph_objects as go
11
+ from plotly.subplots import make_subplots
12
+
13
+ # Try to import scipy for statistical functions
14
+ try:
15
+ import scipy.stats as stats
16
+ SCIPY_AVAILABLE = True
17
+ except ImportError:
18
+ SCIPY_AVAILABLE = False
19
+
20
+
21
def missing_values_bar(missing_df: pd.DataFrame):
    """Build a bar chart of missing-value percentages per column.

    Only rows of ``missing_df`` with a positive ``"Missing Count"`` are
    plotted. Returns ``None`` when no row qualifies, otherwise the figure.
    """
    has_missing = missing_df["Missing Count"] > 0
    affected = missing_df[has_missing]
    if affected.empty:
        return None

    accent = "#6366f1"
    chart = px.bar(
        affected,
        x="Column",
        y="Missing %",
        text="Missing Count",
        title="Missing Values by Column",
        labels={"Missing %": "Missing (%)"},
        color_discrete_sequence=[accent],
    )
    chart.update_traces(
        textposition="outside",
        marker_color=accent,
        hovertemplate="<b>%{x}</b><br>Missing: %{text:,}<br>Percentage: %{y:.1f}%<extra></extra>",
    )
    chart.update_layout(
        height=400,
        xaxis_tickangle=-45,
        yaxis_title="Missing (%)",
        xaxis_title="",
        margin={"l": 10, "r": 10, "t": 40, "b": 80},
        showlegend=False,
    )
    return chart
50
+
51
+
52
def missing_pattern_heatmap(missing_matrix: pd.DataFrame):
    """Render the missing-value pattern of ``missing_matrix`` as a heatmap.

    The matrix is transposed for display, so its columns become the rows
    of the image. Returns ``None`` when the matrix is empty or has fewer
    than two columns.
    """
    n_columns = missing_matrix.shape[1]
    if missing_matrix.empty or n_columns < 2:
        return None

    axis_labels = {"x": "Row Index", "y": "Column", "color": "Missing"}
    heatmap = px.imshow(
        missing_matrix.T,
        color_continuous_scale=["#e5e7eb", "#6366f1"],
        title="Missing Value Pattern Heatmap",
        labels=axis_labels,
        aspect="auto",
    )
    # 30 px per plotted column, but never shorter than 300 px.
    heatmap.update_layout(
        height=max(300, 30 * n_columns),
        margin={"l": 10, "r": 10, "t": 40, "b": 60},
    )
    return heatmap
69
+
70
+
71
def correlation_heatmap(corr_df: pd.DataFrame):
    """Render a correlation table as an annotated heatmap.

    The colour scale is fixed to [-1, 1]. Returns ``None`` when
    ``corr_df`` is empty or has fewer than two rows.
    """
    n_rows = len(corr_df)
    if corr_df.empty or n_rows < 2:
        return None

    heatmap = px.imshow(
        corr_df,
        text_auto=".2f",
        color_continuous_scale="RdBu_r",
        zmin=-1,
        zmax=1,
        title="Correlation Heatmap",
        aspect="auto",
    )
    # 40 px per row, but never shorter than 400 px.
    heatmap.update_layout(
        height=max(400, 40 * n_rows),
        margin={"l": 10, "r": 10, "t": 40, "b": 80},
        xaxis={"showgrid": False},
        yaxis={"showgrid": False},
    )
    heatmap.update_traces(
        hovertemplate="<b>%{x}</b> × <b>%{y}</b><br>Correlation: %{z:.3f}<extra></extra>"
    )
    return heatmap
95
+
96
+
97
def histogram(df: pd.DataFrame, col: str, bins: int = 40):
    """Build a histogram of ``df[col]`` with a marginal box plot.

    Missing values are dropped first; ``None`` is returned when nothing
    remains. Vertical marker lines are added at the mean and the median.
    """
    values = df[col].dropna()
    if values.empty:
        return None

    bar_colour = "#6366f1"
    chart = px.histogram(
        values,
        x=col,
        nbins=bins,
        marginal="box",
        title=f"Distribution of {col}",
        color_discrete_sequence=[bar_colour],
    )
    chart.update_traces(
        marker_color=bar_colour,
        marker_line_color="#4f46e5",
        marker_line_width=0.5,
        opacity=0.8,
        hovertemplate="<b>%{x:.3f}</b><br>Count: %{y:,}<extra></extra>",
    )
    chart.update_layout(
        height=400,
        xaxis_title=col,
        yaxis_title="Frequency",
        margin={"l": 10, "r": 10, "t": 40, "b": 40},
        showlegend=False,
    )

    # Marker lines: (label, position, dash style, colour, annotation side).
    markers = (
        ("Mean", values.mean(), "dash", "#ef4444", "top"),
        ("Median", values.median(), "dot", "#22c55e", "bottom"),
    )
    for label, position, dash, colour, side in markers:
        chart.add_vline(
            x=position,
            line_dash=dash,
            line_color=colour,
            annotation_text=f"{label}: {position:.3f}",
            annotation_position=side,
        )

    return chart
135
+
136
+
137
def boxplot(df: pd.DataFrame, col: str):
    """Build a box plot of ``df[col]`` with the 1.5 x IQR fences marked.

    Missing values are dropped first; ``None`` is returned when nothing
    remains. Only outlier points are drawn individually.
    """
    values = df[col].dropna()
    if values.empty:
        return None

    chart = px.box(
        values,
        y=col,
        title=f"Boxplot of {col}",
        points="outliers",
        color_discrete_sequence=["#6366f1"],
    )
    chart.update_traces(
        marker_color="#ef4444",
        marker_size=6,
        line_color="#4f46e5",
        hovertemplate="<b>%{y:.3f}</b><extra></extra>",
    )
    chart.update_layout(
        height=400,
        yaxis_title=col,
        xaxis_title="",
        margin={"l": 10, "r": 10, "t": 40, "b": 40},
        showlegend=False,
    )

    # Fences sit 1.5 interquartile ranges outside the quartiles.
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    fences = (
        ("Lower fence", first_quartile - 1.5 * spread),
        ("Upper fence", third_quartile + 1.5 * spread),
    )
    for label, level in fences:
        chart.add_hline(
            y=level,
            line_dash="dash",
            line_color="#f59e0b",
            annotation_text=f"{label}: {level:.3f}",
        )

    return chart
177
+
178
+
179
def qq_plot(df: pd.DataFrame, col: str):
    """Create a normal Q-Q plot for ``df[col]``.

    Sorted sample values are plotted against standard-normal quantiles at
    the plotting positions ``i / (n + 1)``. The reference line is
    ``y = mean + std * z``, i.e. where the points would fall if the data
    were normal with the sample's own mean and standard deviation.

    Returns ``None`` when scipy is unavailable or fewer than five
    non-missing values exist.
    """
    if not SCIPY_AVAILABLE:
        return None

    s = df[col].dropna()
    n = len(s)
    if n < 5:
        return None

    # Theoretical standard-normal quantiles and the ordered sample.
    theoretical = stats.norm.ppf(np.arange(1, n + 1) / (n + 1))
    sample = np.sort(s)

    fig = go.Figure()

    # Add scatter points
    fig.add_trace(go.Scatter(
        x=theoretical,
        y=sample,
        mode="markers",
        name="Data",
        marker=dict(color="#6366f1", size=4, opacity=0.7),
        hovertemplate="<b>Theoretical: %{x:.3f}</b><br>Sample: %{y:.3f}<extra></extra>"
    ))

    # Reference line scaled to the sample. A plain y = x diagonal is only
    # meaningful for data that is already standardised; for anything else
    # it sits far from the points and makes normal data look non-normal.
    numeric_sample = np.asarray(sample, dtype=float)
    loc = float(numeric_sample.mean())
    scale = float(numeric_sample.std(ddof=1))
    z_low = float(theoretical.min())
    z_high = float(theoretical.max())
    fig.add_trace(go.Scatter(
        x=[z_low, z_high],
        y=[loc + scale * z_low, loc + scale * z_high],
        mode="lines",
        name="Normal Reference",
        line=dict(color="#ef4444", dash="dash", width=2)
    ))

    fig.update_layout(
        title=f"Q-Q Plot for {col}",
        xaxis_title="Theoretical Quantiles",
        yaxis_title=f"Sample Quantiles ({col})",
        height=400,
        margin=dict(l=10, r=10, t=40, b=40),
        legend=dict(x=0.02, y=0.98)
    )

    return fig
226
+
227
+
228
def bar_categorical(df: pd.DataFrame, col: str, top_n: int = 15):
    """Build a horizontal bar chart of the most frequent values in ``df[col]``.

    Missing values are excluded from the counts. At most ``top_n`` values
    are shown; ``None`` is returned when there is nothing to count. The
    hover percentage is relative to the total number of rows in ``df``.
    """
    counts = df[col].value_counts(dropna=True).head(top_n)
    if counts.empty:
        return None

    frequencies = counts.values
    shown = min(top_n, len(counts))
    chart = px.bar(
        x=frequencies,
        y=counts.index.astype(str),
        orientation="h",
        title=f"Top {shown} values — {col}",
        labels={"x": "Count", "y": col},
        color=frequencies,
        color_continuous_scale="Blues",
        text=frequencies,
    )
    chart.update_traces(
        textposition="outside",
        marker_color="#10b981",
        hovertemplate="<b>%{y}</b><br>Count: %{x:,}<br>Percentage: %{customdata:.1f}%<extra></extra>",
        customdata=100 * frequencies / len(df),
    )
    # 28 px per bar, but never shorter than 350 px.
    chart.update_layout(
        height=max(350, 28 * len(counts)),
        yaxis={"categoryorder": "total ascending"},
        xaxis_title="Count",
        margin={"l": 10, "r": 40, "t": 40, "b": 20},
        showlegend=False,
    )
    return chart
258
+
259
+
260
def scatter_matrix(df: pd.DataFrame, numeric_cols: list, max_cols: int = 5):
    """Build a scatter matrix over the first ``max_cols`` numeric columns.

    Returns ``None`` when fewer than two columns are selected. The
    diagonal panels are hidden.
    """
    selected = numeric_cols[:max_cols]
    if len(selected) < 2:
        return None

    point_colour = "#6366f1"
    grid = px.scatter_matrix(
        df[selected],
        dimensions=selected,
        title="Scatter Matrix",
        color_discrete_sequence=[point_colour],
    )
    grid.update_traces(
        diagonal_visible=False,
        marker={"size": 3, "opacity": 0.6, "color": point_colour},
        hovertemplate="<b>%{x:.3f}</b> × <b>%{y:.3f}</b><extra></extra>",
    )
    grid.update_layout(height=700)
    return grid
279
+
280
+
281
def pairwise_scatter(df: pd.DataFrame, col_x: str, col_y: str):
    """Build a scatter plot of ``col_y`` against ``col_x``.

    No trendline is drawn, so statsmodels is not needed. When the
    correlation between the two columns is not NaN it is shown as a badge
    in the top-right corner of the plot.
    """
    point_colour = "#6366f1"
    chart = px.scatter(
        df,
        x=col_x,
        y=col_y,
        title=f"{col_y} vs {col_x}",
        opacity=0.6,
        color_discrete_sequence=[point_colour],
    )
    chart.update_traces(
        marker={"color": point_colour, "size": 5},
        hovertemplate="<b>%{x:.3f}</b> × <b>%{y:.3f}</b><extra></extra>",
    )

    # Correlation badge, positioned in paper (figure-relative) coordinates.
    coefficient = df[col_x].corr(df[col_y])
    if not np.isnan(coefficient):
        badge = {
            "x": 0.95,
            "y": 0.95,
            "xref": "paper",
            "yref": "paper",
            "text": f"Correlation: {coefficient:.3f}",
            "showarrow": False,
            "font": {"size": 12, "color": "#4f46e5"},
            "bgcolor": "rgba(255,255,255,0.8)",
            "bordercolor": "#e5e7eb",
            "borderwidth": 1,
        }
        chart.add_annotation(**badge)

    chart.update_layout(height=450)
    return chart
315
+
316
+
317
def pairwise_scatter_with_trendline(df: pd.DataFrame, col_x: str, col_y: str):
    """Build a scatter plot of ``col_y`` against ``col_x`` with an OLS trendline.

    This requires statsmodels to be installed. If an ``ImportError`` is
    raised while building the figure, the plain ``pairwise_scatter``
    (no trendline) is returned instead.
    """
    try:
        # Imported only to check that statsmodels is installed.
        import statsmodels.api  # noqa: F401

        point_colour = "#6366f1"
        chart = px.scatter(
            df,
            x=col_x,
            y=col_y,
            trendline="ols",
            title=f"{col_y} vs {col_x}",
            opacity=0.6,
            color_discrete_sequence=[point_colour],
        )
        chart.update_traces(
            marker={"color": point_colour, "size": 5},
            hovertemplate="<b>%{x:.3f}</b> × <b>%{y:.3f}</b><extra></extra>",
        )

        # Correlation badge, positioned in paper (figure-relative) coordinates.
        coefficient = df[col_x].corr(df[col_y])
        if not np.isnan(coefficient):
            badge = {
                "x": 0.95,
                "y": 0.95,
                "xref": "paper",
                "yref": "paper",
                "text": f"Correlation: {coefficient:.3f}",
                "showarrow": False,
                "font": {"size": 12, "color": "#4f46e5"},
                "bgcolor": "rgba(255,255,255,0.8)",
                "bordercolor": "#e5e7eb",
                "borderwidth": 1,
            }
            chart.add_annotation(**badge)

        chart.update_layout(height=450)
        return chart
    except ImportError:
        # Fallback to regular scatter without trendline
        return pairwise_scatter(df, col_x, col_y)
357
+
358
+
359
def time_series_plot(df: pd.DataFrame, date_col: str, value_col: str):
    """Create a line plot of ``value_col`` over ``date_col``.

    Works on a copy of ``df``, so the caller's frame is not modified.
    ``date_col`` is parsed with ``pd.to_datetime(errors="coerce")`` and
    rows whose date cannot be parsed are dropped. The remaining rows are
    sorted chronologically before plotting, because the line connects
    points in row order and unsorted input would make it double back.

    Returns ``None`` when no row has a valid date or ``value_col`` is not
    a column of ``df``.
    """
    # Ensure date column is datetime
    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    df = df.dropna(subset=[date_col])

    if df.empty or value_col not in df.columns:
        return None

    # Stable sort keeps the input order for rows sharing a timestamp.
    df = df.sort_values(date_col, kind="stable")

    fig = px.line(
        df,
        x=date_col,
        y=value_col,
        title=f"{value_col} over time",
        color_discrete_sequence=["#6366f1"]
    )
    fig.update_traces(
        line=dict(width=2),
        hovertemplate="<b>%{x|%Y-%m-%d}</b><br>%{y:.3f}<extra></extra>"
    )
    fig.update_layout(
        height=400,
        xaxis_title="Date",
        yaxis_title=value_col,
        margin=dict(l=10, r=10, t=40, b=40)
    )
    return fig
387
+
388
+
389
def multi_histogram(df: pd.DataFrame, numeric_cols: list, max_cols: int = 4):
    """Build a grid of histograms, one per selected numeric column.

    Only the first ``max_cols`` entries of ``numeric_cols`` are used and
    the grid is at most two panels wide. Columns with no non-missing
    values keep their titled panel but receive no trace. Returns ``None``
    when no column is selected.
    """
    selected = numeric_cols[:max_cols]
    if not selected:
        return None

    width = min(2, len(selected))
    # Ceiling division: enough rows to hold every selected column.
    depth = -(-len(selected) // width)

    grid = make_subplots(
        rows=depth,
        cols=width,
        subplot_titles=selected,
        shared_yaxes=False,
    )

    for position, column in enumerate(selected):
        values = df[column].dropna()
        if values.empty:
            continue
        # Fill the grid left-to-right, top-to-bottom (1-based indices).
        row_offset, col_offset = divmod(position, width)
        trace = go.Histogram(
            x=values,
            nbinsx=40,
            name=column,
            marker_color="#6366f1",
            opacity=0.8,
        )
        grid.add_trace(trace, row=row_offset + 1, col=col_offset + 1)

    grid.update_layout(
        height=300 * depth,
        showlegend=False,
        margin={"l": 10, "r": 10, "t": 40, "b": 40},
    )
    grid.update_xaxes(title_text="")
    return grid