PyPI - eda-k - Versions diffs - 0.1.0__py3-none-any.whl - Mend

eda-k 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

eda_k/__init__.py +218 -0
eda_k/charts.py +428 -0
eda_k/chat_assistant.py +252 -0
eda_k/eda_engine.py +364 -0
eda_k/report_builder.py +461 -0
eda_k-0.1.0.dist-info/METADATA +137 -0
eda_k-0.1.0.dist-info/RECORD +10 -0
eda_k-0.1.0.dist-info/WHEEL +5 -0
eda_k-0.1.0.dist-info/licenses/licence.txt +7 -0
eda_k-0.1.0.dist-info/top_level.txt +1 -0

eda_k/__init__.py ADDED Viewed

@@ -0,0 +1,218 @@
+"""
+eda_k
+=====
+A local, no-API-key automated EDA toolkit.
+Quick start
+-----------
+    import eda_k
+    result = eda_k.analyze("data.csv")
+    result.summary()                      # quick text summary
+    result.to_html("report.html")         # full self-contained HTML report
+    result.to_csv_zip("tables.zip")       # all summary tables as a ZIP of CSVs
+    result.ask("which columns have missing values?")
+Lower-level building blocks (`eda_engine`, `charts`, `chat_assistant`,
+`report_builder`) remain available as submodules for anyone who wants
+finer-grained control, e.g.:
+    from eda_k import eda_engine, charts
+    df = eda_engine.load_file(open("data.csv", "rb"), "data.csv")
+    results = eda_engine.run_full_eda(df)
+"""
+from __future__ import annotations
+import io
+import zipfile
+from pathlib import Path
+from typing import Optional, Union
+import pandas as pd
+from . import charts, chat_assistant, eda_engine, report_builder
+from .chat_assistant import SUGGESTED_QUESTIONS, answer_question
+from .eda_engine import load_file, run_full_eda
+# Public API of the package: the high-level entry point and its result
+# wrapper, the four submodules imported above, and the helpers re-exported
+# from `eda_engine` and `chat_assistant`.
+__all__ = [
+    "analyze",
+    "EDAResult",
+    "eda_engine",
+    "charts",
+    "chat_assistant",
+    "report_builder",
+    "load_file",
+    "run_full_eda",
+    "answer_question",
+    "SUGGESTED_QUESTIONS",
+]
+# Package version string.
+__version__ = "0.1.0"
+class EDAResult:
+    """
+    Convenience wrapper bundling a DataFrame with its EDA results.
+    Returned by `eda_k.analyze(...)`. Wraps the same `results` dict
+    produced by `eda_engine.run_full_eda`, plus helpers for reporting,
+    exporting, and Q&A — so you don't need Streamlit to get value out
+    of this library.
+    """
+    def __init__(self, df: pd.DataFrame, results: dict, name: str = "dataset"):
+        self.df = df
+        self.results = results
+        self.name = name
+    # -- introspection -----------------------------------------------------
+    def __repr__(self) -> str:
+        ov = self.results["overview"]
+        return (
+            f"<EDAResult '{self.name}' rows={ov['n_rows']:,} "
+            f"cols={ov['n_cols']:,} missing={ov['missing_pct']}% "
+            f"duplicates={ov['duplicate_rows']:,}>"
+        )
+    def summary(self) -> str:
+        """Return a short human-readable text summary."""
+        return answer_question("give me a summary of this dataset", self.df, self.results)
+    def ask(self, question: str) -> str:
+        """Ask a natural-language question about the dataset."""
+        return answer_question(question, self.df, self.results)
+    # -- figures -------------------------------------------------------------
+    def build_figures(self, top_n_cats: int = 10) -> dict:
+        """Build the full set of Plotly figures used in the HTML report."""
+        ov = self.results["overview"]
+        return {
+            "missing_bar": charts.missing_values_bar(self.results["missing_summary"]),
+            "histograms": {c: charts.histogram(self.df, c) for c in ov["numeric_cols"]},
+            "boxplots": {c: charts.boxplot(self.df, c) for c in ov["numeric_cols"]},
+            "categorical_bars": {
+                c: charts.bar_categorical(self.df, c, top_n=top_n_cats)
+                for c in ov["categorical_cols"]
+            },
+            "corr_heatmap": (
+                charts.correlation_heatmap(self.results["correlation"])
+                if not self.results["correlation"].empty
+                else None
+            ),
+        }
+    # -- export --------------------------------------------------------------
+    def to_html(
+        self,
+        path: Optional[Union[str, Path]] = None,
+        include_advanced_stats: bool = True,
+        top_n_cats: int = 10,
+    ) -> str:
+        """
+        Build the self-contained HTML report.
+        If `path` is given, also writes the report to disk.
+        Always returns the HTML string.
+        """
+        figures = self.build_figures(top_n_cats=top_n_cats)
+        html_str = report_builder.build_html_report(
+            self.df,
+            self.results,
+            figures,
+            filename=self.name,
+            include_advanced_stats=include_advanced_stats,
+        )
+        if path is not None:
+            Path(path).write_text(html_str, encoding="utf-8")
+        return html_str
+    def to_csv_zip(self, path: Optional[Union[str, Path]] = None) -> bytes:
+        """
+        Build a ZIP archive of every summary table as CSV.
+        If `path` is given, also writes the ZIP to disk.
+        Always returns the ZIP bytes.
+        """
+        results = self.results
+        buf = io.BytesIO()
+        with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
+            zf.writestr("dtype_summary.csv", results["dtype_table"].to_csv(index=False))
+            zf.writestr("missing_values.csv", results["missing_summary"].to_csv(index=False))
+            zf.writestr("numeric_summary.csv", results["numeric_summary"].to_csv(index=False))
+            zf.writestr("outliers.csv", results["outliers"].to_csv(index=False))
+            if not results["correlation"].empty:
+                zf.writestr("correlation_matrix.csv", results["correlation"].to_csv())
+                zf.writestr("top_correlations.csv", results["top_correlations"].to_csv(index=False))
+            for col, info in results["categorical_summary"].items():
+                safe = "".join(ch if ch.isalnum() else "_" for ch in col)
+                zf.writestr(f"categorical_{safe}.csv", info["top_values"].to_csv(index=False))
+        data = buf.getvalue()
+        if path is not None:
+            Path(path).write_bytes(data)
+        return data
+def analyze(
+    source: Union[str, Path, "io.IOBase", pd.DataFrame],
+    filename: Optional[str] = None,
+    outlier_method: str = "Both",
+    correlation_method: str = "pearson",
+    max_sample_size: int = 5000,
+) -> EDAResult:
+    """
+    Run a full automated EDA pipeline on a file path, file-like object,
+    or an existing DataFrame, and return an `EDAResult`.
+    Parameters
+    ----------
+    source : str | Path | file-like | pd.DataFrame
+        A path to a CSV/TSV/TXT/XLSX/XLS/JSON/Parquet file, an open
+        file-like object, or an already-loaded DataFrame.
+    filename : str, optional
+        Used for display/report naming and (when `source` is a path-like
+        string/Path) to infer the file type. Required when `source` is a
+        file-like object without a `.name` attribute and not a DataFrame.
+    outlier_method : {"IQR", "Z-score", "Both"}
+    correlation_method : {"pearson", "spearman", "kendall"}
+    max_sample_size : int
+        Max rows sampled for the Shapiro-Wilk normality test.
+    Examples
+    --------
+    >>> result = eda_k.analyze("data.csv")
+    >>> result.summary()
+    >>> result.to_html("report.html")
+    """
+    if isinstance(source, pd.DataFrame):
+        df = source.copy()
+        name = filename or "dataset"
+    else:
+        if isinstance(source, (str, Path)):
+            path = Path(source)
+            name = filename or path.name
+            with open(path, "rb") as f:
+                df = load_file(f, name)
+        else:
+            # file-like object
+            name = filename or getattr(source, "name", None)
+            if not name:
+                raise ValueError(
+                    "filename is required when passing a file-like object "
+                    "without a '.name' attribute"
+                )
+            df = load_file(source, name)
+    if df.empty:
+        raise ValueError(f"'{name}' loaded but contains no rows.")
+    results = run_full_eda(
+        df,
+        outlier_method=outlier_method,
+        correlation_method=correlation_method,
+        max_sample_size=max_sample_size,
+    )
+    return EDAResult(df, results, name=name)

eda_k/charts.py ADDED Viewed

@@ -0,0 +1,428 @@
+"""
+charts.py
+All chart-building functions, using Plotly so charts are interactive
+in Streamlit AND can be embedded as static images in the HTML report.
+"""
+import numpy as np
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+# scipy is treated as optional. SCIPY_AVAILABLE records whether the import
+# succeeded; when it did not, the name `stats` is undefined, so code that
+# uses `stats` must check the flag first.
+try:
+    import scipy.stats as stats
+    SCIPY_AVAILABLE = True
+except ImportError:
+    SCIPY_AVAILABLE = False
+def missing_values_bar(missing_df: pd.DataFrame):
+    """Create a bar chart of missing values by column."""
+    d = missing_df[missing_df["Missing Count"] > 0]
+    if d.empty:
+        return None
+    fig = px.bar(
+        d,
+        x="Column",
+        y="Missing %",
+        text="Missing Count",
+        title="Missing Values by Column",
+        labels={"Missing %": "Missing (%)"},
+        color_discrete_sequence=["#6366f1"]
+    )
+    fig.update_traces(
+        textposition="outside",
+        marker_color="#6366f1",
+        hovertemplate="<b>%{x}</b><br>Missing: %{text:,}<br>Percentage: %{y:.1f}%<extra></extra>"
+    )
+    fig.update_layout(
+        height=400,
+        xaxis_tickangle=-45,
+        yaxis_title="Missing (%)",
+        xaxis_title="",
+        margin=dict(l=10, r=10, t=40, b=80),
+        showlegend=False
+    )
+    return fig
+def missing_pattern_heatmap(missing_matrix: pd.DataFrame):
+    """Create a heatmap showing missing value patterns."""
+    if missing_matrix.empty or missing_matrix.shape[1] < 2:
+        return None
+    fig = px.imshow(
+        missing_matrix.T,
+        color_continuous_scale=["#e5e7eb", "#6366f1"],
+        title="Missing Value Pattern Heatmap",
+        labels={"x": "Row Index", "y": "Column", "color": "Missing"},
+        aspect="auto"
+    )
+    fig.update_layout(
+        height=max(300, 30 * missing_matrix.shape[1]),
+        margin=dict(l=10, r=10, t=40, b=60)
+    )
+    return fig
+def correlation_heatmap(corr_df: pd.DataFrame):
+    """Create a correlation heatmap with improved styling."""
+    if corr_df.empty or len(corr_df) < 2:
+        return None
+    fig = px.imshow(
+        corr_df,
+        text_auto=".2f",
+        color_continuous_scale="RdBu_r",
+        zmin=-1,
+        zmax=1,
+        title="Correlation Heatmap",
+        aspect="auto"
+    )
+    fig.update_layout(
+        height=max(400, 40 * len(corr_df)),
+        margin=dict(l=10, r=10, t=40, b=80),
+        xaxis=dict(showgrid=False),
+        yaxis=dict(showgrid=False)
+    )
+    fig.update_traces(
+        hovertemplate="<b>%{x}</b> × <b>%{y}</b><br>Correlation: %{z:.3f}<extra></extra>"
+    )
+    return fig
+def histogram(df: pd.DataFrame, col: str, bins: int = 40):
+    """Create an interactive histogram with marginal boxplot."""
+    s = df[col].dropna()
+    if s.empty:
+        return None
+    fig = px.histogram(
+        s,
+        x=col,
+        nbins=bins,
+        marginal="box",
+        title=f"Distribution of {col}",
+        color_discrete_sequence=["#6366f1"]
+    )
+    fig.update_traces(
+        marker_color="#6366f1",
+        marker_line_color="#4f46e5",
+        marker_line_width=0.5,
+        opacity=0.8,
+        hovertemplate="<b>%{x:.3f}</b><br>Count: %{y:,}<extra></extra>"
+    )
+    fig.update_layout(
+        height=400,
+        xaxis_title=col,
+        yaxis_title="Frequency",
+        margin=dict(l=10, r=10, t=40, b=40),
+        showlegend=False
+    )
+    # Add mean and median lines
+    mean_val = s.mean()
+    median_val = s.median()
+    fig.add_vline(x=mean_val, line_dash="dash", line_color="#ef4444",
+                  annotation_text=f"Mean: {mean_val:.3f}", annotation_position="top")
+    fig.add_vline(x=median_val, line_dash="dot", line_color="#22c55e",
+                  annotation_text=f"Median: {median_val:.3f}", annotation_position="bottom")
+    return fig
+def boxplot(df: pd.DataFrame, col: str):
+    """Create an enhanced boxplot for outlier detection."""
+    s = df[col].dropna()
+    if s.empty:
+        return None
+    fig = px.box(
+        s,
+        y=col,
+        title=f"Boxplot of {col}",
+        points="outliers",
+        color_discrete_sequence=["#6366f1"]
+    )
+    fig.update_traces(
+        marker_color="#ef4444",
+        marker_size=6,
+        line_color="#4f46e5",
+        hovertemplate="<b>%{y:.3f}</b><extra></extra>"
+    )
+    fig.update_layout(
+        height=400,
+        yaxis_title=col,
+        xaxis_title="",
+        margin=dict(l=10, r=10, t=40, b=40),
+        showlegend=False
+    )
+    # Add statistical annotations
+    q1 = s.quantile(0.25)
+    q3 = s.quantile(0.75)
+    iqr = q3 - q1
+    lower_fence = q1 - 1.5 * iqr
+    upper_fence = q3 + 1.5 * iqr
+    fig.add_hline(y=lower_fence, line_dash="dash", line_color="#f59e0b",
+                  annotation_text=f"Lower fence: {lower_fence:.3f}")
+    fig.add_hline(y=upper_fence, line_dash="dash", line_color="#f59e0b",
+                  annotation_text=f"Upper fence: {upper_fence:.3f}")
+    return fig
+def qq_plot(df: pd.DataFrame, col: str):
+    """Create a Q-Q plot for normality check."""
+    if not SCIPY_AVAILABLE:
+        return None
+    s = df[col].dropna()
+    if s.empty or len(s) < 5:
+        return None
+    # Calculate theoretical quantiles
+    n = len(s)
+    theoretical = stats.norm.ppf(np.arange(1, n + 1) / (n + 1))
+    sample = np.sort(s)
+    fig = go.Figure()
+    # Add scatter points
+    fig.add_trace(go.Scatter(
+        x=theoretical,
+        y=sample,
+        mode="markers",
+        name="Data",
+        marker=dict(color="#6366f1", size=4, opacity=0.7),
+        hovertemplate="<b>Theoretical: %{x:.3f}</b><br>Sample: %{y:.3f}<extra></extra>"
+    ))
+    # Add diagonal reference line
+    min_val = min(theoretical.min(), sample.min())
+    max_val = max(theoretical.max(), sample.max())
+    fig.add_trace(go.Scatter(
+        x=[min_val, max_val],
+        y=[min_val, max_val],
+        mode="lines",
+        name="Normal Reference",
+        line=dict(color="#ef4444", dash="dash", width=2)
+    ))
+    fig.update_layout(
+        title=f"Q-Q Plot for {col}",
+        xaxis_title="Theoretical Quantiles",
+        yaxis_title=f"Sample Quantiles ({col})",
+        height=400,
+        margin=dict(l=10, r=10, t=40, b=40),
+        legend=dict(x=0.02, y=0.98)
+    )
+    return fig
+def bar_categorical(df: pd.DataFrame, col: str, top_n: int = 15):
+    """Create an improved categorical bar chart."""
+    vc = df[col].value_counts(dropna=True).head(top_n)
+    if vc.empty:
+        return None
+    fig = px.bar(
+        x=vc.values,
+        y=vc.index.astype(str),
+        orientation="h",
+        title=f"Top {min(top_n, len(vc))} values — {col}",
+        labels={"x": "Count", "y": col},
+        color=vc.values,
+        color_continuous_scale="Blues",
+        text=vc.values
+    )
+    fig.update_traces(
+        textposition="outside",
+        marker_color="#10b981",
+        hovertemplate="<b>%{y}</b><br>Count: %{x:,}<br>Percentage: %{customdata:.1f}%<extra></extra>",
+        customdata=100 * vc.values / len(df)
+    )
+    fig.update_layout(
+        height=max(350, 28 * len(vc)),
+        yaxis={"categoryorder": "total ascending"},
+        xaxis_title="Count",
+        margin=dict(l=10, r=40, t=40, b=20),
+        showlegend=False
+    )
+    return fig
+def scatter_matrix(df: pd.DataFrame, numeric_cols: list, max_cols: int = 5):
+    """Create a scatter matrix for numeric columns."""
+    cols = numeric_cols[:max_cols]
+    if len(cols) < 2:
+        return None
+    fig = px.scatter_matrix(
+        df[cols],
+        dimensions=cols,
+        title="Scatter Matrix",
+        color_discrete_sequence=["#6366f1"]
+    )
+    fig.update_traces(
+        diagonal_visible=False,
+        marker=dict(size=3, opacity=0.6, color="#6366f1"),
+        hovertemplate="<b>%{x:.3f}</b> × <b>%{y:.3f}</b><extra></extra>"
+    )
+    fig.update_layout(height=700)
+    return fig
+def pairwise_scatter(df: pd.DataFrame, col_x: str, col_y: str):
+    """Create a scatter plot for a pair of variables."""
+    # Remove trendline to avoid statsmodels dependency
+    fig = px.scatter(
+        df,
+        x=col_x,
+        y=col_y,
+        title=f"{col_y} vs {col_x}",
+        opacity=0.6,
+        color_discrete_sequence=["#6366f1"]
+    )
+    fig.update_traces(
+        marker=dict(color="#6366f1", size=5),
+        hovertemplate="<b>%{x:.3f}</b> × <b>%{y:.3f}</b><extra></extra>"
+    )
+    # Add correlation annotation
+    corr = df[col_x].corr(df[col_y])
+    if not np.isnan(corr):
+        fig.add_annotation(
+            x=0.95,
+            y=0.95,
+            xref="paper",
+            yref="paper",
+            text=f"Correlation: {corr:.3f}",
+            showarrow=False,
+            font=dict(size=12, color="#4f46e5"),
+            bgcolor="rgba(255,255,255,0.8)",
+            bordercolor="#e5e7eb",
+            borderwidth=1
+        )
+    fig.update_layout(height=450)
+    return fig
+def pairwise_scatter_with_trendline(df: pd.DataFrame, col_x: str, col_y: str):
+    """Create a scatter plot with trendline for a pair of variables.
+    This requires statsmodels to be installed."""
+    try:
+        import statsmodels.api as sm
+        fig = px.scatter(
+            df,
+            x=col_x,
+            y=col_y,
+            trendline="ols",
+            title=f"{col_y} vs {col_x}",
+            opacity=0.6,
+            color_discrete_sequence=["#6366f1"]
+        )
+        fig.update_traces(
+            marker=dict(color="#6366f1", size=5),
+            hovertemplate="<b>%{x:.3f}</b> × <b>%{y:.3f}</b><extra></extra>"
+        )
+        # Add correlation annotation
+        corr = df[col_x].corr(df[col_y])
+        if not np.isnan(corr):
+            fig.add_annotation(
+                x=0.95,
+                y=0.95,
+                xref="paper",
+                yref="paper",
+                text=f"Correlation: {corr:.3f}",
+                showarrow=False,
+                font=dict(size=12, color="#4f46e5"),
+                bgcolor="rgba(255,255,255,0.8)",
+                bordercolor="#e5e7eb",
+                borderwidth=1
+            )
+        fig.update_layout(height=450)
+        return fig
+    except ImportError:
+        # Fallback to regular scatter without trendline
+        return pairwise_scatter(df, col_x, col_y)
+def time_series_plot(df: pd.DataFrame, date_col: str, value_col: str):
+    """Create a time series plot for datetime data."""
+    # Ensure date column is datetime
+    df = df.copy()
+    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
+    df = df.dropna(subset=[date_col])
+    if df.empty or value_col not in df.columns:
+        return None
+    fig = px.line(
+        df,
+        x=date_col,
+        y=value_col,
+        title=f"{value_col} over time",
+        color_discrete_sequence=["#6366f1"]
+    )
+    fig.update_traces(
+        line=dict(width=2),
+        hovertemplate="<b>%{x|%Y-%m-%d}</b><br>%{y:.3f}<extra></extra>"
+    )
+    fig.update_layout(
+        height=400,
+        xaxis_title="Date",
+        yaxis_title=value_col,
+        margin=dict(l=10, r=10, t=40, b=40)
+    )
+    return fig
+def multi_histogram(df: pd.DataFrame, numeric_cols: list, max_cols: int = 4):
+    """Create a grid of histograms for multiple numeric columns."""
+    cols = numeric_cols[:max_cols]
+    if len(cols) < 1:
+        return None
+    n_cols = min(2, len(cols))
+    n_rows = (len(cols) + n_cols - 1) // n_cols
+    fig = make_subplots(
+        rows=n_rows,
+        cols=n_cols,
+        subplot_titles=cols,
+        shared_yaxes=False
+    )
+    for i, col in enumerate(cols):
+        row = i // n_cols + 1
+        col_idx = i % n_cols + 1
+        s = df[col].dropna()
+        if not s.empty:
+            fig.add_trace(
+                go.Histogram(
+                    x=s,
+                    nbinsx=40,
+                    name=col,
+                    marker_color="#6366f1",
+                    opacity=0.8
+                ),
+                row=row,
+                col=col_idx
+            )
+    fig.update_layout(
+        height=300 * n_rows,
+        showlegend=False,
+        margin=dict(l=10, r=10, t=40, b=40)
+    )
+    fig.update_xaxes(title_text="")
+    return fig