syntaxmatrix-1.4.6-py3-none-any.whl → syntaxmatrix-2.5.5.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. syntaxmatrix/__init__.py +13 -8
  2. syntaxmatrix/agentic/__init__.py +0 -0
  3. syntaxmatrix/agentic/agent_tools.py +24 -0
  4. syntaxmatrix/agentic/agents.py +810 -0
  5. syntaxmatrix/agentic/code_tools_registry.py +37 -0
  6. syntaxmatrix/agentic/model_templates.py +1790 -0
  7. syntaxmatrix/auth.py +308 -14
  8. syntaxmatrix/commentary.py +328 -0
  9. syntaxmatrix/core.py +993 -375
  10. syntaxmatrix/dataset_preprocessing.py +218 -0
  11. syntaxmatrix/db.py +92 -95
  12. syntaxmatrix/display.py +95 -121
  13. syntaxmatrix/generate_page.py +634 -0
  14. syntaxmatrix/gpt_models_latest.py +46 -0
  15. syntaxmatrix/history_store.py +26 -29
  16. syntaxmatrix/kernel_manager.py +96 -17
  17. syntaxmatrix/llm_store.py +1 -1
  18. syntaxmatrix/plottings.py +6 -0
  19. syntaxmatrix/profiles.py +64 -8
  20. syntaxmatrix/project_root.py +55 -43
  21. syntaxmatrix/routes.py +5072 -1398
  22. syntaxmatrix/session.py +19 -0
  23. syntaxmatrix/settings/logging.py +40 -0
  24. syntaxmatrix/settings/model_map.py +300 -33
  25. syntaxmatrix/settings/prompts.py +273 -62
  26. syntaxmatrix/settings/string_navbar.py +3 -3
  27. syntaxmatrix/static/docs.md +272 -0
  28. syntaxmatrix/static/icons/favicon.png +0 -0
  29. syntaxmatrix/static/icons/hero_bg.jpg +0 -0
  30. syntaxmatrix/templates/dashboard.html +608 -147
  31. syntaxmatrix/templates/docs.html +71 -0
  32. syntaxmatrix/templates/error.html +2 -3
  33. syntaxmatrix/templates/login.html +1 -0
  34. syntaxmatrix/templates/register.html +1 -0
  35. syntaxmatrix/ui_modes.py +14 -0
  36. syntaxmatrix/utils.py +2482 -159
  37. syntaxmatrix/vectorizer.py +16 -12
  38. {syntaxmatrix-1.4.6.dist-info → syntaxmatrix-2.5.5.4.dist-info}/METADATA +20 -17
  39. syntaxmatrix-2.5.5.4.dist-info/RECORD +68 -0
  40. syntaxmatrix/model_templates.py +0 -30
  41. syntaxmatrix/static/icons/favicon.ico +0 -0
  42. syntaxmatrix-1.4.6.dist-info/RECORD +0 -54
  43. {syntaxmatrix-1.4.6.dist-info → syntaxmatrix-2.5.5.4.dist-info}/WHEEL +0 -0
  44. {syntaxmatrix-1.4.6.dist-info → syntaxmatrix-2.5.5.4.dist-info}/licenses/LICENSE.txt +0 -0
  45. {syntaxmatrix-1.4.6.dist-info → syntaxmatrix-2.5.5.4.dist-info}/top_level.txt +0 -0
syntaxmatrix/utils.py CHANGED
@@ -1,50 +1,1188 @@
1
- import openai
2
- from openai import OpenAI
3
- import re, os, textwrap
1
+ from __future__ import annotations
2
+ import re, textwrap
4
3
  import pandas as pd
5
- import matplotlib.pyplot as plt
4
+ import numpy as np
6
5
  import warnings
7
- from .model_templates import classification
8
- import syntaxmatrix as smx
9
6
 
7
+ from difflib import get_close_matches
8
+ from typing import Iterable, Tuple, Dict
9
+ import inspect
10
+ from sklearn.preprocessing import OneHotEncoder
11
+
12
+
13
+ from syntaxmatrix.agentic.model_templates import (
14
+ classification, regression, multilabel_classification,
15
+ eda_overview, eda_correlation,
16
+ anomaly_detection, ts_anomaly_detection,
17
+ dimensionality_reduction, feature_selection,
18
+ time_series_forecasting, time_series_classification,
19
+ unknown_group_proxy_pack, viz_line,
20
+ clustering, recommendation, topic_modelling,
21
+ viz_pie, viz_count_bar, viz_box, viz_scatter,
22
+ viz_stacked_bar, viz_distribution, viz_area, viz_kde,
23
+ )
24
+ import ast
10
25
 
11
26
  warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")
12
27
 
28
+ INJECTABLE_INTENTS = [
29
+ "classification",
30
+ "multilabel_classification",
31
+ "regression",
32
+ "anomaly_detection",
33
+ "time_series_forecasting",
34
+ "time_series_classification",
35
+ "ts_anomaly_detection",
36
+ "dimensionality_reduction",
37
+ "feature_selection",
38
+ "clustering",
39
+ "eda",
40
+ "correlation_analysis",
41
+ "visualisation",
42
+ "recommendation",
43
+ "topic_modelling",
44
+ ]
45
+
46
+ def classify_ml_job(prompt: str) -> str:
47
+ """
48
+ Very light intent classifier.
49
+ Returns one of:
50
+ 'greeting' | 'feature_selection' | 'dimensionality_reduction' | 'anomaly_detection'
51
+ 'stat_test' | 'time_series' | 'clustering' | 'classification' | 'regression' | 'eda'
52
+ """
53
+ p = prompt.lower()
54
+
55
+ greetings = {"hi", "hello", "hey", "good morning", "good afternoon", "good evening", "greetings"}
56
+ if any(p == g or p.startswith(g + " ") or p.startswith(g + ",") or p.startswith(g + "!") for g in greetings):
57
+ return "greeting"
13
58
 
14
- # def ai_generate_code(question, df):
15
- # provider = os.environ.get("provider", "openai")
16
- # model = os.environ.get("model", "gpt-4o-mini")
17
- # api_key = os.environ.get("OPENAI_API_KEY")
59
+ # Feature selection / importance intent
60
+ if any(k in p for k in (
61
+ "feature selection", "select k best", "selectkbest", "rfe",
62
+ "mutual information", "feature importance", "permutation importance",
63
+ "feature engineering suggestions"
64
+ )):
65
+ return "feature_selection"
66
+
67
+ # Dimensionality reduction intent
68
+ if any(k in p for k in (
69
+ "pca", "principal component", "dimensionality reduction",
70
+ "reduce dimension", "reduce dimensionality", "t-sne", "tsne", "umap"
71
+ )):
72
+ return "dimensionality_reduction"
73
+
74
+ # Anomaly / outlier intent
75
+ if any(k in p for k in (
76
+ "anomaly", "anomalies", "outlier", "outliers", "novelty",
77
+ "fraud", "deviation", "rare event", "rare events", "odd pattern",
78
+ "suspicious"
79
+ )):
80
+ return "anomaly_detection"
18
81
 
19
- # llm = OpenAI(api_key=api_key)
82
+ if any(k in p for k in ("t-test", "anova", "p-value")):
83
+ return "stat_test"
84
+ if "forecast" in p or "prophet" in p:
85
+ return "time_series"
86
+ if "cluster" in p or "kmeans" in p:
87
+ return "clustering"
88
+ if any(k in p for k in ("accuracy", "precision", "roc")):
89
+ return "classification"
90
+ if any(k in p for k in ("rmse", "r2", "mae")):
91
+ return "regression"
20
92
 
21
- # context = f"Columns: {list(df.columns)}\nDtypes: {df.dtypes.astype(str).to_dict()}\n"
22
- # prompt = (
23
- # f"You are an expert Python data analyst. Given the dataframe `df` with the following context:\n{context}\n"
24
- # f"Write clean, working Python code that answers the question below. "
25
- # f"DO NOT explain, just output the code only (NO comments or text):\n"
26
- # f"Question: {question}\n"
27
- # f"Output only the working code needed. Assume df is already defined."
28
- # f"Produce at least one visible result"
29
- # f"(syntaxmatrix.display.show(), display(), plt.show())."
30
- # )
31
-
32
- # if provider.lower() == "openai":
33
- # response = llm.chat.completions.create(
34
- # model=model,
35
- # messages=[{"role": "user", "content": prompt}],
36
- # temperature=0.0,
37
- # max_tokens=1024,
38
- # )
39
- # code = response.choices[0].message.content
40
- # if "```python" in code:
41
- # code = code.split("```python")[1].split("```")[0].strip()
42
- # elif "```" in code:
43
- # code = code.split("```")[1].split("```")[0].strip()
44
-
45
- # code = strip_describe_slice(code)
46
- # code = drop_bad_classification_metrics(code, df)
47
- # return code.strip()
93
+ return "eda"
94
+
95
+
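For illustration, a minimal sketch of the keyword routing above (hypothetical prompts; assumes the function is importable from syntaxmatrix.utils):

    from syntaxmatrix.utils import classify_ml_job

    print(classify_ml_job("Forecast next quarter's demand"))    # -> 'time_series'
    print(classify_ml_job("Flag outliers in the sensor data"))  # -> 'anomaly_detection'
    print(classify_ml_job("hello"))                             # -> 'greeting'
    print(classify_ml_job("Describe the dataset"))              # -> 'eda' (fallback)

Earlier checks win: a prompt that mentions both "pca" and "forecast" returns 'dimensionality_reduction' because that branch runs first.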
96
+ def harden_ai_code(code: str) -> str:
97
+ """
98
+ Make any AI-generated cell resilient:
99
+ - Safe seaborn wrappers + sentinel vars (boxplot/barplot/etc.)
100
+ - Remove 'numeric_only=' args
101
+ - Replace pd.concat(...) with _safe_concat(...)
102
+ - Relax 'required_cols' hard fails
103
+ - Make static numeric_vars dynamic
104
+ - Wrap the whole block in try/except so no exception bubbles up
105
+ """
106
+ # Remove any LLM-added try/except blocks (hardener adds its own)
107
+ import re
108
+
109
+ def strip_placeholders(code: str) -> str:
110
+ code = re.sub(r"\bshow\(\s*\.\.\.\s*\)",
111
+ "show('⚠ Block skipped due to an error.')",
112
+ code)
113
+ code = re.sub(r"\breturn\s+\.\.\.", "return None", code)
114
+ return code
115
+
116
+ def _indent(code: str, spaces: int = 4) -> str:
117
+ pad = " " * spaces
118
+ return "\n".join(pad + line for line in code.splitlines())
119
+
120
+ def _SMX_OHE(**k):
121
+ # normalise arg name across sklearn versions
122
+ if "sparse" in k and "sparse_output" not in k:
123
+ k["sparse_output"] = k.pop("sparse")
124
+ # default behaviour we want
125
+ k.setdefault("handle_unknown", "ignore")
126
+ k.setdefault("sparse_output", False)
127
+ try:
128
+ # if running on old sklearn without sparse_output, translate back
129
+ if "sparse_output" not in inspect.signature(OneHotEncoder).parameters:
130
+ if "sparse_output" in k:
131
+ k["sparse"] = k.pop("sparse_output")
132
+ return OneHotEncoder(**k)
133
+ except TypeError:
134
+ # final fallback: try legacy name
135
+ if "sparse_output" in k:
136
+ k["sparse"] = k.pop("sparse_output")
137
+ return OneHotEncoder(**k)
138
+
139
+ def _strip_stray_backrefs(code: str) -> str:
140
+ code = re.sub(r'(?m)^\s*\\\d+\s*', '', code)
141
+ code = re.sub(r'(?m)[;]\s*\\\d+\s*', '; ', code)
142
+ return code
143
+
144
+ def _wrap_metric_calls(code: str) -> str:
145
+ names = [
146
+ "r2_score","accuracy_score","precision_score","recall_score","f1_score",
147
+ "roc_auc_score","classification_report","confusion_matrix",
148
+ "mean_absolute_error","mean_absolute_percentage_error",
149
+ "explained_variance_score","log_loss","average_precision_score",
150
+ "precision_recall_fscore_support"
151
+ ]
152
+ pat = re.compile(r"\b(?:(sklearn\.metrics\.|metrics\.)?(" + "|".join(names) + r"))\s*\(")
153
+ def repl(m):
154
+ prefix = m.group(1) or "" # "", "metrics.", or "sklearn.metrics."
155
+ name = m.group(2)
156
+ return f"_SMX_call({prefix}{name}, "
157
+ return pat.sub(repl, code)
158
+
159
+ def _smx_patch_mean_squared_error_squared_kw():
160
+ """
161
+ sklearn<0.22 doesn't accept mean_squared_error(..., squared=False).
162
+ Patch the module attr so 'from sklearn.metrics import mean_squared_error'
163
+ receives a wrapper that drops 'squared' if the underlying call rejects it.
164
+ """
165
+ try:
166
+ import sklearn.metrics as _sm
167
+ _orig = getattr(_sm, "mean_squared_error", None)
168
+ if not callable(_orig):
169
+ return
170
+ def _mse_compat(y_true, y_pred, *a, **k):
171
+ if "squared" in k:
172
+ try:
173
+ return _orig(y_true, y_pred, *a, **k)
174
+ except TypeError:
175
+ k.pop("squared", None)
176
+ return _orig(y_true, y_pred, *a, **k)
177
+ return _orig(y_true, y_pred, *a, **k)
178
+ _sm.mean_squared_error = _mse_compat
179
+ except Exception:
180
+ pass
181
+
182
+ def _smx_patch_kmeans_n_init_auto():
183
+ """
184
+ sklearn>=1.4 accepts n_init='auto'; older versions want an int.
185
+ Patch sklearn.cluster.KMeans so 'auto' is converted to 10 if TypeError occurs.
186
+ """
187
+ try:
188
+ import sklearn.cluster as _sc
189
+ _Orig = getattr(_sc, "KMeans", None)
190
+ if _Orig is None:
191
+ return
192
+ class KMeansCompat(_Orig):
193
+ def __init__(self, *a, **k):
194
+ if isinstance(k.get("n_init", None), str):
195
+ try:
196
+ super().__init__(*a, **k)
197
+ return
198
+ except TypeError:
199
+ k["n_init"] = 10
200
+ super().__init__(*a, **k)
201
+ _sc.KMeans = KMeansCompat
202
+ except Exception:
203
+ pass
204
+
205
+ def _smx_patch_ohe_name_api():
206
+ """
207
+ Guard get_feature_names_out on older OneHotEncoder.
208
+ Your templates already use _SMX_OHE; this adds a soft fallback for feature names.
209
+ """
210
+ try:
211
+ from sklearn.preprocessing import OneHotEncoder as _OHE
212
+ _orig_get = getattr(_OHE, "get_feature_names_out", None)
213
+ if _orig_get is None:
214
+ # Monkey-patch instance method via mixin
215
+ def _fallback_get_feature_names_out(self, input_features=None):
216
+ cats = getattr(self, "categories_", None) or []
217
+ input_features = input_features or [f"x{i}" for i in range(len(cats))]
218
+ names = []
219
+ for base, cat_list in zip(input_features, cats):
220
+ for j, _ in enumerate(cat_list):
221
+ names.append(f"{base}__{j}")
222
+ return names
223
+ _OHE.get_feature_names_out = _fallback_get_feature_names_out
224
+ except Exception:
225
+ pass
226
+
227
+ # Register and run patches once per execution
228
+ for _patch in (
229
+ _smx_patch_mean_squared_error_squared_kw,
230
+ _smx_patch_kmeans_n_init_auto,
231
+ _smx_patch_ohe_name_api,
232
+ ):
233
+ try:
234
+ _patch()
235
+ except Exception:
236
+ pass
237
+
238
+ PREFACE = (
239
+ "# === SMX Auto-Hardening Preface (do not edit) ===\n"
240
+ "import warnings, numpy as np, pandas as pd, matplotlib.pyplot as plt\n"
241
+ "warnings.filterwarnings('ignore')\n"
242
+ "try:\n"
243
+ " import seaborn as sns\n"
244
+ "except Exception:\n"
245
+ " class _Dummy:\n"
246
+ " def __getattr__(self, name):\n"
247
+ " def _f(*a, **k):\n"
248
+ " from syntaxmatrix.display import show\n"
249
+ " show('⚠ seaborn not available; plot skipped.')\n"
250
+ " return _f\n"
251
+ " sns = _Dummy()\n"
252
+ "\n"
253
+ "from syntaxmatrix.display import show as _SMX_base_show\n"
254
+ "def _SMX_caption_from_ctx():\n"
255
+ " g = globals()\n"
256
+ " t = g.get('refined_question') or g.get('askai_question') or 'Table'\n"
257
+ " return str(t).strip().splitlines()[0][:120]\n"
258
+ "\n"
259
+ "def show(obj, title=None):\n"
260
+ " try:\n"
261
+ " import pandas as pd\n"
262
+ " if isinstance(obj, pd.DataFrame):\n"
263
+ " cap = (title or _SMX_caption_from_ctx())\n"
264
+ " try:\n"
265
+ " return _SMX_base_show(obj.style.set_caption(cap))\n"
266
+ " except Exception:\n"
267
+ " pass\n"
268
+ " except Exception:\n"
269
+ " pass\n"
270
+ " return _SMX_base_show(obj)\n"
271
+ "\n"
272
+ "def _SMX_axes_have_titles(fig=None):\n"
273
+ " import matplotlib.pyplot as _plt\n"
274
+ " fig = fig or _plt.gcf()\n"
275
+ " try:\n"
276
+ " for _ax in fig.get_axes():\n"
277
+ " if (_ax.get_title() or '').strip():\n"
278
+ " return True\n"
279
+ " except Exception:\n"
280
+ " pass\n"
281
+ " return False\n"
282
+ "\n"
283
+ "def _SMX_export_png():\n"
284
+ " import io, base64\n"
285
+ " fig = plt.gcf()\n"
286
+ " try:\n"
287
+ " if not _SMX_axes_have_titles(fig):\n"
288
+ " fig.suptitle(_SMX_caption_from_ctx(), fontsize=10)\n"
289
+ " except Exception:\n"
290
+ " pass\n"
291
+ " buf = io.BytesIO()\n"
292
+ " plt.savefig(buf, format='png', bbox_inches='tight')\n"
293
+ " buf.seek(0)\n"
294
+ " from IPython.display import display, HTML\n"
295
+ " _img = base64.b64encode(buf.read()).decode('ascii')\n"
296
+ " display(HTML(f\"<img src='data:image/png;base64,{_img}' style='max-width:100%;height:auto;border:1px solid #ccc;border-radius:4px;'/>\"))\n"
297
+ " plt.close()\n"
298
+ "\n"
299
+ "def _pick_df():\n"
300
+ " return globals().get('df', None)\n"
301
+ "\n"
302
+ "def _pick_ax_slot():\n"
303
+ " ax = None\n"
304
+ " try:\n"
305
+ " _axes = globals().get('axes', None)\n"
306
+ " import numpy as _np\n"
307
+ " if _axes is not None:\n"
308
+ " arr = _np.ravel(_axes)\n"
309
+ " for _a in arr:\n"
310
+ " try:\n"
311
+ " if hasattr(_a,'has_data') and not _a.has_data():\n"
312
+ " ax = _a; break\n"
313
+ " except Exception:\n"
314
+ " continue\n"
315
+ " except Exception:\n"
316
+ " ax = None\n"
317
+ " return ax\n"
318
+ "\n"
319
+ "def _first_numeric(_d):\n"
320
+ " import numpy as np, pandas as pd\n"
321
+ " try:\n"
322
+ " preferred = [\"median_house_value\", \"price\", \"value\", \"target\", \"label\", \"y\"]\n"
323
+ " for c in preferred:\n"
324
+ " if c in _d.columns and pd.api.types.is_numeric_dtype(_d[c]):\n"
325
+ " return c\n"
326
+ " cols = _d.select_dtypes(include=[np.number]).columns.tolist()\n"
327
+ " return cols[0] if cols else None\n"
328
+ " except Exception:\n"
329
+ " return None\n"
330
+ "\n"
331
+ "def _first_categorical(_d):\n"
332
+ " import pandas as pd, numpy as np\n"
333
+ " try:\n"
334
+ " num = set(_d.select_dtypes(include=[np.number]).columns.tolist())\n"
335
+ " cand = [c for c in _d.columns if c not in num and _d[c].nunique(dropna=True) <= 50]\n"
336
+ " return cand[0] if cand else None\n"
337
+ " except Exception:\n"
338
+ " return None\n"
339
+ "\n"
340
+ "boxplot = barplot = histplot = distplot = lineplot = countplot = heatmap = pairplot = None\n"
341
+ "\n"
342
+ "def _safe_plot(func, *args, **kwargs):\n"
343
+ " try:\n"
344
+ " ax = func(*args, **kwargs)\n"
345
+ " if ax is None:\n"
346
+ " ax = plt.gca()\n"
347
+ " try:\n"
348
+ " if hasattr(ax, 'has_data') and not ax.has_data():\n"
349
+ " from syntaxmatrix.display import show as _show\n"
350
+ " _show('⚠ Empty plot: no data drawn.')\n"
351
+ " except Exception:\n"
352
+ " pass\n"
353
+ " try: plt.tight_layout()\n"
354
+ " except Exception: pass\n"
355
+ " return ax\n"
356
+ " except Exception as e:\n"
357
+ " from syntaxmatrix.display import show as _show\n"
358
+ " _show(f'⚠ Plot skipped: {type(e).__name__}: {e}')\n"
359
+ " return None\n"
360
+ "\n"
361
+ "def SB_histplot(*a, **k):\n"
362
+ " _missing = (getattr(sns, '__class__', type(sns)).__name__ == '_Dummy')\n"
363
+ " _sentinel = (len(a) >= 1 and a[0] is None)\n"
364
+ " if (not a or _sentinel) and not k:\n"
365
+ " d = _pick_df()\n"
366
+ " if d is not None:\n"
367
+ " x = _first_numeric(d)\n"
368
+ " if x is not None:\n"
369
+ " def _draw():\n"
370
+ " plt.hist(d[x].dropna())\n"
371
+ " ax = plt.gca()\n"
372
+ " if not (ax.get_title() or '').strip():\n"
373
+ " ax.set_title(f'Distribution of {x}')\n"
374
+ " return ax\n"
375
+ " return _safe_plot(lambda **kw: _draw())\n"
376
+ " if _missing:\n"
377
+ " return _safe_plot(lambda **kw: plt.hist([]))\n"
378
+ " if _sentinel:\n"
379
+ " a = a[1:]\n"
380
+ " return _safe_plot(getattr(sns,'histplot', plt.hist), *a, **k)\n"
381
+ "\n"
382
+ "def SB_barplot(*a, **k):\n"
383
+ " _missing = (getattr(sns, '__class__', type(sns)).__name__ == '_Dummy')\n"
384
+ " _sentinel = (len(a) >= 1 and a[0] is None)\n"
385
+ " _ax = k.get('ax') or _pick_ax_slot()\n"
386
+ " if _ax is not None:\n"
387
+ " try: plt.sca(_ax)\n"
388
+ " except Exception: pass\n"
389
+ " k.setdefault('ax', _ax)\n"
390
+ " if (not a or _sentinel) and not k:\n"
391
+ " d = _pick_df()\n"
392
+ " if d is not None:\n"
393
+ " x = _first_categorical(d)\n"
394
+ " y = _first_numeric(d)\n"
395
+ " if x and y:\n"
396
+ " import pandas as _pd\n"
397
+ " g = d.groupby(x)[y].mean().reset_index()\n"
398
+ " def _draw():\n"
399
+ " if _missing:\n"
400
+ " plt.bar(g[x], g[y])\n"
401
+ " else:\n"
402
+ " sns.barplot(data=g, x=x, y=y, ax=k.get('ax'))\n"
403
+ " ax = plt.gca()\n"
404
+ " if not (ax.get_title() or '').strip():\n"
405
+ " ax.set_title(f'Mean {y} by {x}')\n"
406
+ " return ax\n"
407
+ " return _safe_plot(lambda **kw: _draw())\n"
408
+ " if _missing:\n"
409
+ " return _safe_plot(lambda **kw: plt.bar([], []))\n"
410
+ " if _sentinel:\n"
411
+ " a = a[1:]\n"
412
+ " return _safe_plot(sns.barplot, *a, **k)\n"
413
+ "\n"
414
+ "def SB_boxplot(*a, **k):\n"
415
+ " _missing = (getattr(sns, '__class__', type(sns)).__name__ == '_Dummy')\n"
416
+ " _sentinel = (len(a) >= 1 and a[0] is None)\n"
417
+ " _ax = k.get('ax') or _pick_ax_slot()\n"
418
+ " if _ax is not None:\n"
419
+ " try: plt.sca(_ax)\n"
420
+ " except Exception: pass\n"
421
+ " k.setdefault('ax', _ax)\n"
422
+ " if (not a or _sentinel) and not k:\n"
423
+ " d = _pick_df()\n"
424
+ " if d is not None:\n"
425
+ " x = _first_categorical(d)\n"
426
+ " y = _first_numeric(d)\n"
427
+ " if x and y:\n"
428
+ " def _draw():\n"
429
+ " if _missing:\n"
430
+ " plt.boxplot(d[y].dropna())\n"
431
+ " else:\n"
432
+ " sns.boxplot(data=d, x=x, y=y, ax=k.get('ax'))\n"
433
+ " ax = plt.gca()\n"
434
+ " if not (ax.get_title() or '').strip():\n"
435
+ " ax.set_title(f'Distribution of {y} by {x}')\n"
436
+ " return ax\n"
437
+ " return _safe_plot(lambda **kw: _draw())\n"
438
+ " if _missing:\n"
439
+ " return _safe_plot(lambda **kw: plt.boxplot([]))\n"
440
+ " if _sentinel:\n"
441
+ " a = a[1:]\n"
442
+ " return _safe_plot(sns.boxplot, *a, **k)\n"
443
+ "\n"
444
+ "def SB_scatterplot(*a, **k):\n"
445
+ " _missing = (getattr(sns, '__class__', type(sns)).__name__ == '_Dummy')\n"
446
+ " fn = getattr(sns,'scatterplot', None)\n"
447
+ " # If seaborn is unavailable OR the caller passed (data=..., x='col', y='col'),\n"
448
+ " # use a robust matplotlib path that looks up data and coerces to numeric.\n"
449
+ " if _missing or fn is None:\n"
450
+ " data = k.get('data'); x = k.get('x'); y = k.get('y')\n"
451
+ " if data is not None and isinstance(x, str) and isinstance(y, str) and x in data.columns and y in data.columns:\n"
452
+ " xs = pd.to_numeric(data[x], errors='coerce')\n"
453
+ " ys = pd.to_numeric(data[y], errors='coerce')\n"
454
+ " m = xs.notna() & ys.notna()\n"
455
+ " def _draw():\n"
456
+ " plt.scatter(xs[m], ys[m])\n"
457
+ " ax = plt.gca()\n"
458
+ " if not (ax.get_title() or '').strip():\n"
459
+ " ax.set_title(f'{y} vs {x}')\n"
460
+ " return ax\n"
461
+ " return _safe_plot(lambda **kw: _draw())\n"
462
+ " # else: fall back to auto-pick two numeric columns\n"
463
+ " d = _pick_df()\n"
464
+ " if d is not None:\n"
465
+ " num = d.select_dtypes(include=[np.number]).columns.tolist()\n"
466
+ " if len(num) >= 2:\n"
467
+ " def _draw2():\n"
468
+ " plt.scatter(d[num[0]], d[num[1]])\n"
469
+ " ax = plt.gca()\n"
470
+ " if not (ax.get_title() or '').strip():\n"
471
+ " ax.set_title(f'{num[1]} vs {num[0]}')\n"
472
+ " return ax\n"
473
+ " return _safe_plot(lambda **kw: _draw2())\n"
474
+ " return _safe_plot(lambda **kw: plt.scatter([], []))\n"
475
+ " # seaborn path\n"
476
+ " return _safe_plot(fn, *a, **k)\n"
477
+ "\n"
478
+ "def SB_heatmap(*a, **k):\n"
479
+ " _missing = (getattr(sns, '__class__', type(sns)).__name__ == '_Dummy')\n"
480
+ " data = None\n"
481
+ " if a:\n"
482
+ " data = a[0]\n"
483
+ " elif 'data' in k:\n"
484
+ " data = k['data']\n"
485
+ " if data is None:\n"
486
+ " d = _pick_df()\n"
487
+ " try:\n"
488
+ " if d is not None:\n"
489
+ " import numpy as _np\n"
490
+ " data = d.select_dtypes(include=[_np.number]).corr()\n"
491
+ " except Exception:\n"
492
+ " data = None\n"
493
+ " if data is None:\n"
494
+ " from syntaxmatrix.display import show as _show\n"
495
+ " _show('⚠ Heatmap skipped: no data.')\n"
496
+ " return None\n"
497
+ " if not _missing and hasattr(sns, 'heatmap'):\n"
498
+ " _k = {kk: vv for kk, vv in k.items() if kk != 'data'}\n"
499
+ " def _draw():\n"
500
+ " ax = sns.heatmap(data, **_k)\n"
501
+ " try:\n"
502
+ " ax = ax or plt.gca()\n"
503
+ " if not (ax.get_title() or '').strip():\n"
504
+ " ax.set_title('Correlation Heatmap')\n"
505
+ " except Exception:\n"
506
+ " pass\n"
507
+ " return ax\n"
508
+ " return _safe_plot(lambda **kw: _draw())\n"
509
+ " def _mat_heat():\n"
510
+ " im = plt.imshow(data, aspect='auto')\n"
511
+ " try: plt.colorbar()\n"
512
+ " except Exception: pass\n"
513
+ " try:\n"
514
+ " cols = list(getattr(data, 'columns', []))\n"
515
+ " rows = list(getattr(data, 'index', []))\n"
516
+ " if cols: plt.xticks(range(len(cols)), cols, rotation=90)\n"
517
+ " if rows: plt.yticks(range(len(rows)), rows)\n"
518
+ " except Exception:\n"
519
+ " pass\n"
520
+ " ax = plt.gca()\n"
521
+ " try:\n"
522
+ " if not (ax.get_title() or '').strip():\n"
523
+ " ax.set_title('Correlation Heatmap')\n"
524
+ " except Exception:\n"
525
+ " pass\n"
526
+ " return ax\n"
527
+ " return _safe_plot(lambda **kw: _mat_heat())\n"
528
+ "\n"
529
+ "def _safe_concat(objs, **kwargs):\n"
530
+ " import pandas as _pd\n"
531
+ " if objs is None: return _pd.DataFrame()\n"
532
+ " if isinstance(objs,(list,tuple)) and len(objs)==0: return _pd.DataFrame()\n"
533
+ " try: return _pd.concat(objs, **kwargs)\n"
534
+ " except Exception as e:\n"
535
+ " show(f'⚠ concat skipped: {e}')\n"
536
+ " return _pd.DataFrame()\n"
537
+ "\n"
538
+ "from sklearn.preprocessing import OneHotEncoder\n"
539
+ "import inspect\n"
540
+ "def _SMX_OHE(**k):\n"
541
+ " # normalise arg name across sklearn versions\n"
542
+ " if 'sparse' in k and 'sparse_output' not in k:\n"
543
+ " k['sparse_output'] = k.pop('sparse')\n"
544
+ " k.setdefault('handle_unknown','ignore')\n"
545
+ " k.setdefault('sparse_output', False)\n"
546
+ " try:\n"
547
+ " sig = inspect.signature(OneHotEncoder)\n"
548
+ " if 'sparse_output' not in sig.parameters and 'sparse_output' in k:\n"
549
+ " k['sparse'] = k.pop('sparse_output')\n"
550
+ " except Exception:\n"
551
+ " if 'sparse_output' in k:\n"
552
+ " k['sparse'] = k.pop('sparse_output')\n"
553
+ " return OneHotEncoder(**k)\n"
554
+ "\n"
555
+ "import numpy as _np\n"
556
+ "def _SMX_mm(a, b):\n"
557
+ " try:\n"
558
+ " return a @ b # normal path\n"
559
+ " except Exception:\n"
560
+ " try:\n"
561
+ " A = _np.asarray(a); B = _np.asarray(b)\n"
562
+ " # If same 2D shape (e.g. (n,k) & (n,k)), treat as row-wise dot\n"
563
+ " if A.ndim==2 and B.ndim==2 and A.shape==B.shape:\n"
564
+ " return (A * B).sum(axis=1)\n"
565
+ " # Otherwise try element-wise product (broadcast if possible)\n"
566
+ " return A * B\n"
567
+ " except Exception as e:\n"
568
+ " from syntaxmatrix.display import show\n"
569
+ " show(f'⚠ Matmul relaxed: {type(e).__name__}: {e}'); return _np.nan\n"
570
+ "\n"
571
+ "def _SMX_call(fn, *a, **k):\n"
572
+ " try:\n"
573
+ " return fn(*a, **k)\n"
574
+ " except TypeError as e:\n"
575
+ " msg = str(e)\n"
576
+ " if \"unexpected keyword argument 'squared'\" in msg:\n"
577
+ " k.pop('squared', None)\n"
578
+ " return fn(*a, **k)\n"
579
+ " raise\n"
580
+ "\n"
581
+ "def _SMX_rmse(y_true, y_pred):\n"
582
+ " try:\n"
583
+ " from sklearn.metrics import mean_squared_error as _mse\n"
584
+ " try:\n"
585
+ " return _mse(y_true, y_pred, squared=False)\n"
586
+ " except TypeError:\n"
587
+ " return (_mse(y_true, y_pred)) ** 0.5\n"
588
+ " except Exception:\n"
589
+ " import numpy as _np\n"
590
+ " yt = _np.asarray(y_true, dtype=float)\n"
591
+ " yp = _np.asarray(y_pred, dtype=float)\n"
592
+ " diff = yt - yp\n"
593
+ " return float((_np.mean(diff * diff)) ** 0.5)\n"
594
+ "\n"
595
+ "import pandas as _pd\n"
596
+ "import numpy as _np\n"
597
+ "def _SMX_autocoerce_dates(_df):\n"
598
+ " if _df is None or not hasattr(_df, 'columns'): return\n"
599
+ " for c in list(_df.columns):\n"
600
+ " s = _df[c]\n"
601
+ " n = str(c).lower()\n"
602
+ " if _pd.api.types.is_datetime64_any_dtype(s):\n"
603
+ " continue\n"
604
+ " if _pd.api.types.is_object_dtype(s) or ('date' in n or 'time' in n or 'timestamp' in n or n.endswith('_dt')):\n"
605
+ " try:\n"
606
+ " conv = _pd.to_datetime(s, errors='coerce', utc=True).dt.tz_localize(None)\n"
607
+ " # accept only if at least 10% (min 3) parse as dates\n"
608
+ " if getattr(conv, 'notna', lambda: _pd.Series([]))().sum() >= max(3, int(0.1*len(_df))):\n"
609
+ " _df[c] = conv\n"
610
+ " except Exception:\n"
611
+ " pass\n"
612
+ "\n"
613
+ "def _SMX_autocoerce_numeric(_df, cols):\n"
614
+ " if _df is None: return\n"
615
+ " for c in cols:\n"
616
+ " if c in getattr(_df, 'columns', []):\n"
617
+ " try:\n"
618
+ " _df[c] = _pd.to_numeric(_df[c], errors='coerce')\n"
619
+ " except Exception:\n"
620
+ " pass\n"
621
+ "\n"
622
+ "def show(obj, title=None):\n"
623
+ " try:\n"
624
+ " import pandas as pd, numbers\n"
625
+ " cap = (title or _SMX_caption_from_ctx())\n"
626
+ " # 1) DataFrame → Styler with caption\n"
627
+ " if isinstance(obj, pd.DataFrame):\n"
628
+ " try: return _SMX_base_show(obj.style.set_caption(cap))\n"
629
+ " except Exception: pass\n"
630
+ " # 2) dict of scalars → DataFrame with caption\n"
631
+ " if isinstance(obj, dict) and all(isinstance(v, numbers.Number) for v in obj.values()):\n"
632
+ " df_ = pd.DataFrame({'metric': list(obj.keys()), 'value': list(obj.values())})\n"
633
+ " try: return _SMX_base_show(df_.style.set_caption(cap))\n"
634
+ " except Exception: return _SMX_base_show(df_)\n"
635
+ " except Exception:\n"
636
+ " pass\n"
637
+ " return _SMX_base_show(obj)\n"
638
+ )
639
+
640
+ PREFACE_IMPORT = "from syntaxmatrix.smx_preface import *\n"
641
+ # if PREFACE not in code:
642
+ # code = PREFACE_IMPORT + code
643
+
644
+ fixed = strip_placeholders(code)
645
+
646
+ fixed = re.sub(
647
+ r"(?s)^\s*try:\s*(.*?)\s*except\s+Exception\s+as\s+\w+:\s*\n\s*show\([^\n]*\)\s*$",
648
+ r"\1",
649
+ fixed.strip()
650
+ )
651
+
652
+ # 1) Strip numeric_only=... (version-agnostic)
653
+ fixed = re.sub(r",\s*numeric_only\s*=\s*(True|False|None)", "", fixed, flags=re.I)
654
+ fixed = re.sub(r"\bnumeric_only\s*=\s*(True|False|None)\s*,\s*", "", fixed, flags=re.I)
655
+ fixed = re.sub(r"\bnumeric_only\s*=\s*(True|False|None)\b", "", fixed, flags=re.I)
656
+
657
+ # 2) Use safe seaborn wrappers
658
+ fixed = re.sub(r"\bsns\.boxplot\s*\(", "SB_boxplot(", fixed)
659
+ fixed = re.sub(r"\bsns\.barplot\s*\(", "SB_barplot(", fixed)
660
+ fixed = re.sub(r"\bsns\.histplot\s*\(", "SB_histplot(", fixed)
661
+ fixed = re.sub(r"\bsns\.scatterplot\s*\(", "SB_scatterplot(", fixed)
662
+
663
+ # 3) Guard concat calls
664
+ fixed = re.sub(r"\bpd\.concat\s*\(", "_safe_concat(", fixed)
665
+ fixed = re.sub(r"\bOneHotEncoder\s*\(", "_SMX_OHE(", fixed)
666
+ # Route np.dot to tolerant matmul
667
+ fixed = re.sub(r"\bnp\.dot\s*\(", "_SMX_mm(", fixed)
668
+ fixed = re.sub(r"(df\s*\[[^\]]+\])\s*\.dt", r"SMX_dt(\1).dt", fixed)
669
+
670
+
671
+ # 4) Relax any 'required_cols' hard failure blocks
672
+ fixed = re.sub(
673
+ r"required_cols\s*=\s*\[.*?\]\s*?\n\s*missing\s*=\s*\[.*?\]\s*?\n\s*if\s+missing:\s*raise[^\n]+",
674
+ "required_cols = [c for c in df.columns]\n# (relaxed by SMX hardener)",
675
+ fixed,
676
+ flags=re.S,
677
+ )
678
+
679
+ # 5) Make static numeric_vars lists dynamic
680
+ fixed = re.sub(
681
+ r"\bnumeric_vars\s*=\s*\[.*?\]",
682
+ "numeric_vars = df.select_dtypes(include=['number','bool']).columns.tolist()",
683
+ fixed,
684
+ flags=re.S,
685
+ )
686
+ # normalise all .dt usages on df[...] / df.attr / df.loc[...] to SMX_dt(...)
687
+ fixed = re.sub(
688
+ r"((?:df\s*(?:\.\s*(?:loc|iloc)\s*)?\[[^\]]+\]|df\s*\.\s*[A-Za-z_]\w*))\s*\.dt\b",
689
+ lambda m: f"SMX_dt({m.group(1)}).dt",
690
+ fixed
691
+ )
692
+
693
+ try:
694
+ class _SMXMatmulRewriter(ast.NodeTransformer):
695
+ def visit_BinOp(self, node):
696
+ self.generic_visit(node)
697
+ if isinstance(node.op, ast.MatMult):
698
+ return ast.Call(func=ast.Name(id="_SMX_mm", ctx=ast.Load()),
699
+ args=[node.left, node.right], keywords=[])
700
+ return node
701
+ _tree = ast.parse(fixed)
702
+ _tree = _SMXMatmulRewriter().visit(_tree)
703
+ fixed = ast.unparse(_tree)
704
+ except Exception:
705
+ # If AST rewrite fails, keep original; _SMX_mm will still handle np.dot(...)
706
+ pass
707
+
708
+ # 6) Final safety wrapper
709
+ fixed = fixed.replace("\t", " ")
710
+ fixed = textwrap.dedent(fixed).strip("\n")
711
+
712
+ fixed = _strip_stray_backrefs(fixed)
713
+ fixed = _wrap_metric_calls(fixed)
714
+
715
+ # If the transformed code is still not syntactically valid, fall back to a
716
+ # very defensive generic snippet that depends only on `df`. This guarantees the cell still produces visible output.
717
+ try:
718
+ ast.parse(fixed)
719
+ except (SyntaxError, IndentationError):
720
+ fixed = (
721
+ "import pandas as pd\n"
722
+ "df = df.copy()\n"
723
+ "_info = {\n"
724
+ " 'rows': len(df),\n"
725
+ " 'cols': len(df.columns),\n"
726
+ " 'numeric_cols': len(df.select_dtypes(include=['number','bool']).columns),\n"
727
+ " 'categorical_cols': len(df.select_dtypes(exclude=['number','bool']).columns),\n"
728
+ "}\n"
729
+ "show(df.head(), title='Sample of data')\n"
730
+ "show(_info, title='Dataset summary')\n"
731
+ "try:\n"
732
+ " _num = df.select_dtypes(include=['number','bool']).columns.tolist()\n"
733
+ " if _num:\n"
734
+ " SB_histplot()\n"
735
+ " _SMX_export_png()\n"
736
+ "except Exception as e:\n"
737
+ " show(f\"⚠ Fallback visualisation failed: {type(e).__name__}: {e}\")\n"
738
+ )
739
+
740
+ # Fix placeholder Ellipsis handlers from LLM
741
+ fixed = re.sub(
742
+ r"except\s+Exception\s+as\s+e:\s*\n\s*show\(\.\.\.\)",
743
+ "except Exception as e:\n show(f\"⚠ Block skipped due to: {type(e).__name__}: {e}\")",
744
+ fixed,
745
+ )
746
+
747
+ wrapped = PREFACE + "try:\n" + _indent(fixed) + "\nexcept Exception as e:\n    show(f'⚠ Block skipped due to: {type(e).__name__}: {e}')\n"
748
+ wrapped = wrapped.lstrip()
749
+ return wrapped
750
+
751
+
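A rough before/after sketch (hypothetical cell; output abridged — the full preface shown above is prepended first):

    from syntaxmatrix.utils import harden_ai_code

    cell = "sns.boxplot(data=df, x='group', y='value')\nres = pd.concat(frames)"
    print(harden_ai_code(cell))
    # (abridged) ...preface...
    # try:
    #     SB_boxplot(data=df, x='group', y='value')
    #     res = _safe_concat(frames)
    # except Exception as e:
    #     show(f'⚠ Block skipped due to: {type(e).__name__}: {e}')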
752
+ def indent_code(code: str, spaces: int = 4) -> str:
753
+ pad = " " * spaces
754
+ return "\n".join(pad + line for line in code.splitlines())
755
+
756
+
757
+ def wrap_llm_code_safe(code: str) -> str:
758
+ # Swallow any runtime error from the LLM block instead of crashing the run
759
+ return (
760
+ "# __SAFE_WRAPPED__\n"
761
+ "try:\n" + indent_code(code) + "\n"
762
+ "except Exception as e:\n"
763
+ " from syntaxmatrix.display import show\n"
764
+ " show(f\"⚠️ Skipped LLM block due to: {type(e).__name__}: {e}\")\n"
765
+ )
766
+
767
+
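For example, a failing one-liner comes back wrapped so execution reports the error instead of raising it:

    from syntaxmatrix.utils import wrap_llm_code_safe

    print(wrap_llm_code_safe("1/0"))
    # __SAFE_WRAPPED__
    # try:
    #     1/0
    # except Exception as e:
    #     from syntaxmatrix.display import show
    #     show(f"⚠️ Skipped LLM block due to: {type(e).__name__}: {e}")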
768
+ def fix_boxplot_placeholder(code: str) -> str:
769
+ # Replace invalid 'sns.boxplot(boxplot)' with a safe call using df/group_label/m
770
+ return re.sub(
771
+ r"sns\.boxplot\(\s*boxplot\s*\)",
772
+ "sns.boxplot(x=group_label, y=m, data=df.loc[df[m].notnull()], showfliers=False)",
773
+ code
774
+ )
775
+
776
+
777
+ def relax_required_columns(code: str) -> str:
778
+ # Remove hard failure on required_cols; keep a soft filter instead
779
+ return re.sub(
780
+ r"required_cols\s*=\s*\[.*?\]\s*?\n\s*missing\s*=\s*\[.*?\]\s*?\n\s*if\s+missing:\s*raise[^\n]+",
781
+ "required_cols = [c for c in df.columns]\n",
782
+ code,
783
+ flags=re.S
784
+ )
785
+
786
+
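Sketch of the relaxation on a hypothetical generated guard:

    from syntaxmatrix.utils import relax_required_columns

    guard = (
        "required_cols = ['age', 'income']\n"
        "missing = [c for c in required_cols if c not in df.columns]\n"
        "if missing: raise KeyError(missing)"
    )
    print(relax_required_columns(guard))
    # -> required_cols = [c for c in df.columns]

A renamed column then degrades to a best-effort run instead of a hard failure.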
787
+ def make_numeric_vars_dynamic(code: str) -> str:
788
+ # Replace any static numeric_vars list with a dynamic selection
789
+ return re.sub(
790
+ r"numeric_vars\s*=\s*\[.*?\]",
791
+ "numeric_vars = df.select_dtypes(include=['number','bool']).columns.tolist()",
792
+ code,
793
+ flags=re.S
794
+ )
795
+
796
+
797
+ def auto_inject_template(code: str, intents, df) -> str:
798
+ """If the LLM forgot the core logic, prepend a skeleton block."""
799
+
800
+ has_fit = ".fit(" in code
801
+ has_plot = any(k in code for k in ("plt.", "sns.", ".plot(", ".hist("))
802
+
803
+ UNKNOWN_TOKENS = {
804
+ "unknown","not reported","not_reported","not known","n/a","na",
805
+ "none","nan","missing","unreported","unspecified","null","-",""
806
+ }
807
+
808
+ # --- Safe template caller: passes only supported kwargs, falls back cleanly ---
809
+ def _call_template(func, df, **hints):
810
+ import inspect
811
+ try:
812
+ params = inspect.signature(func).parameters
813
+ kw = {k: v for k, v in hints.items() if k in params}
814
+ try:
815
+ return func(df, **kw)
816
+ except TypeError:
817
+ # In case the template changed its signature at runtime
818
+ return func(df)
819
+ except Exception:
820
+ # Absolute safety net
821
+ try:
822
+ return func(df)
823
+ except Exception:
824
+ # As a last resort, return empty code so we don't 500
825
+ return ""
826
+
827
+ def _guess_classification_target(df: pd.DataFrame) -> str | None:
828
+ cols = list(df.columns)
829
+
830
+ # Helper: does this column look like a sensible label?
831
+ def _is_reasonable_class_col(s: pd.Series, col_name: str) -> bool:
832
+ try:
833
+ nunq = s.dropna().nunique()
834
+ except Exception:
835
+ return False
836
+ # need at least 2 classes, but not hundreds
837
+ if nunq < 2 or nunq > 20:
838
+ return False
839
+ bad_name_keys = ("id", "identifier", "index", "uuid", "key")
840
+ name = str(col_name).lower()
841
+ if any(k in name for k in bad_name_keys):
842
+ return False
843
+ return True
844
+
845
+ # 1) columns whose names look like labels
846
+ label_keys = ("target", "label", "outcome", "class", "y", "status")
847
+ name_candidates: list[str] = []
848
+ for key in label_keys:
849
+ for c in cols:
850
+ if key in str(c).lower():
851
+ name_candidates.append(c)
852
+ if name_candidates:
853
+ break # keep the earliest matching key-group
854
+
855
+ # prioritise name-based candidates that also look like proper label columns
856
+ for c in name_candidates:
857
+ if _is_reasonable_class_col(df[c], c):
858
+ return c
859
+ if name_candidates:
860
+ # fall back to the first name-based candidate if none passed the shape test
861
+ return name_candidates[0]
862
+
863
+ # 2) any column with a small number of distinct values (likely a class label)
864
+ for c in cols:
865
+ s = df[c]
866
+ if _is_reasonable_class_col(s, c):
867
+ return c
868
+
869
+ # Nothing suitable found
870
+ return None
871
+
872
+ def _guess_regression_target(df: pd.DataFrame) -> str | None:
873
+ num_cols = df.select_dtypes(include=[np.number, "bool"]).columns.tolist()
874
+ if not num_cols:
875
+ return None
876
+ # Avoid obvious ID-like columns
877
+ bad_keys = ("id", "identifier", "index")
878
+ candidates = [c for c in num_cols if not any(k in str(c).lower() for k in bad_keys)]
879
+ return (candidates or num_cols)[-1]
880
+
881
+ def _guess_time_col(df: pd.DataFrame) -> str | None:
882
+ # Prefer actual datetime dtype
883
+ dt_cols = [c for c in df.columns if np.issubdtype(df[c].dtype, np.datetime64)]
884
+ if dt_cols:
885
+ return dt_cols[0]
886
+
887
+ # Fallback: name-based hints
888
+ name_keys = ["date", "time", "timestamp", "datetime", "ds", "period"]
889
+ for c in df.columns:
890
+ name = str(c).lower()
891
+ if any(k in name for k in name_keys):
892
+ return c
893
+ return None
894
+
895
+ def _guess_entity_col(df: pd.DataFrame) -> str | None:
896
+ # Typical sequence IDs: id, patient, subject, device, series, entity
897
+ keys = ["id", "patient", "subject", "device", "series", "entity"]
898
+ candidates = []
899
+ for c in df.columns:
900
+ name = str(c).lower()
901
+ if any(k in name for k in keys):
902
+ candidates.append(c)
903
+ return candidates[0] if candidates else None
904
+
905
+ def _guess_ts_class_target(df: pd.DataFrame) -> str | None:
906
+ # Try label-like names first
907
+ keys = ["target", "label", "class", "outcome", "y"]
908
+ for key in keys:
909
+ for c in df.columns:
910
+ if key in str(c).lower():
911
+ return c
912
+
913
+ # Fallback: any column with few distinct values (e.g. <= 10)
914
+ for c in df.columns:
915
+ s = df[c]
916
+ # avoid obvious IDs
917
+ if any(k in str(c).lower() for k in ["id", "index"]):
918
+ continue
919
+ try:
920
+ nunq = s.dropna().nunique()
921
+ except Exception:
922
+ continue
923
+ if 1 < nunq <= 10:
924
+ return c
925
+
926
+ return None
927
+
928
+ def _guess_multilabel_cols(df: pd.DataFrame) -> list[str]:
929
+ cols = list(df.columns)
930
+ lbl_like = [c for c in cols if str(c).startswith(("LBL_", "lbl_"))]
931
+ # also include boolean/binary columns with suitable names
932
+ for c in cols:
933
+ s = df[c]
934
+ try:
935
+ nunq = s.dropna().nunique()
936
+ except Exception:
937
+ continue
938
+ if nunq in (2,) and c not in lbl_like:
939
+ # avoid obvious IDs
940
+ if not any(k in str(c).lower() for k in ("id","index","uuid","identifier")):
941
+ lbl_like.append(c)
942
+ # keep at most, say, 12 to avoid accidental flood
943
+ return lbl_like[:12]
944
+
945
+ def _find_unknownish_column(df: pd.DataFrame) -> str | None:
946
+ # Search categorical-like columns for any 'unknown-like' values or high missingness
947
+ candidates = []
948
+ for c in df.columns:
949
+ s = df[c]
950
+ # focus on object/category/boolean-ish or low-card columns
951
+ if not (pd.api.types.is_object_dtype(s) or pd.api.types.is_categorical_dtype(s) or s.dropna().nunique() <= 20):
952
+ continue
953
+ try:
954
+ vals = s.astype(str).str.strip().str.lower()
955
+ except Exception:
956
+ continue
957
+ # score: presence of unknown tokens + missing rate
958
+ token_hit = int(vals.isin(UNKNOWN_TOKENS).any())
959
+ miss_rate = s.isna().mean()
960
+ name_bonus = int(any(k in str(c).lower() for k in ("status","history","report","known","flag")))
961
+ score = 3*token_hit + 2*name_bonus + miss_rate
962
+ if token_hit or miss_rate > 0.05 or name_bonus:
963
+ candidates.append((score, c))
964
+ if not candidates:
965
+ return None
966
+ candidates.sort(reverse=True)
967
+ return candidates[0][1]
968
+
969
+ def _guess_numeric_cols(df: pd.DataFrame, max_n: int = 6) -> list[str]:
970
+ cols = [c for c in df.select_dtypes(include=[np.number, "bool"]).columns if not any(k in str(c).lower() for k in ("id","identifier","index","uuid"))]
971
+ # prefer non-constant columns
972
+ scored = []
973
+ for c in cols:
974
+ try:
975
+ v = df[c].dropna()
976
+ var = float(v.var()) if len(v) else 0.0
977
+ scored.append((var, c))
978
+ except Exception:
979
+ continue
980
+ scored.sort(reverse=True)
981
+ return [c for _, c in scored[:max_n]]
982
+
983
+ def _guess_categorical_cols(df: pd.DataFrame, exclude: set[str] | None = None, max_card: int = 12, max_n: int = 5) -> list[str]:
984
+ exclude = exclude or set()
985
+ picks = []
986
+ for c in df.columns:
987
+ if c in exclude:
988
+ continue
989
+ s = df[c]
990
+ if pd.api.types.is_object_dtype(s) or pd.api.types.is_categorical_dtype(s) or s.dropna().nunique() <= max_card:
991
+ nunq = s.dropna().nunique()
992
+ if 2 <= nunq <= max_card and not any(k in str(c).lower() for k in ("id","identifier","index","uuid")):
993
+ picks.append((nunq, c))
994
+ picks.sort(reverse=True)
995
+ return [c for _, c in picks[:max_n]]
996
+
997
+ def _guess_outcome_col(df: pd.DataFrame, exclude: set[str] | None = None) -> str | None:
998
+ exclude = exclude or set()
999
+ # name hints first
1000
+ name_keys = ("outcome","target","label","risk","score","result","prevalence","positivity")
1001
+ for c in df.columns:
1002
+ if c in exclude:
1003
+ continue
1004
+ name = str(c).lower()
1005
+ if any(k in name for k in name_keys) and pd.api.types.is_numeric_dtype(df[c]):
1006
+ return c
1007
+ # fallback: any binary numeric
1008
+ for c in df.select_dtypes(include=[np.number, "bool"]).columns:
1009
+ if c in exclude:
1010
+ continue
1011
+ try:
1012
+ if df[c].dropna().nunique() == 2:
1013
+ return c
1014
+ except Exception:
1015
+ continue
1016
+ return None
1017
+
1018
+
1019
+ def _pick_viz_template(signal: str):
1020
+ s = signal.lower()
1021
+
1022
+ # explicit chart requests
1023
+ if any(k in s for k in ("pie", "donut")):
1024
+ return viz_pie
1025
+
1026
+ if any(k in s for k in ("stacked", "100% stacked", "composition", "proportion", "share by")):
1027
+ return viz_stacked_bar
1028
+
1029
+ if any(k in s for k in ("distribution", "hist", "histogram", "bins")):
1030
+ return viz_distribution
1031
+
1032
+ if any(k in s for k in ("kde", "density")):
1033
+ return viz_kde
1034
+
1035
+ # these three you asked about
1036
+ if any(k in s for k in ("box", "boxplot", "violin", "spread", "outlier")):
1037
+ return viz_box
1038
+
1039
+ if any(k in s for k in ("scatter", "relationship", "vs ", "correlate")):
1040
+ return viz_scatter
1041
+
1042
+ if any(k in s for k in ("count", "counts", "frequency", "bar chart", "barplot")):
1043
+ return viz_count_bar
1044
+
1045
+ if any(k in s for k in ("area", "trend", "over time", "time series")):
1046
+ return viz_area
1047
+
1048
+ # fallback
1049
+ return viz_line
1050
+
1051
+ for intent in intents:
1052
+
1053
+ if intent not in INJECTABLE_INTENTS:
1054
+ continue  # non-injectable intent: check the next one instead of aborting
1055
+
1056
+ # Correlation analysis
1057
+ if intent == "correlation_analysis" and not has_fit:
1058
+ return eda_correlation(df) + "\n\n" + code
1059
+
1060
+ # Generic visualisation (keyword-based)
1061
+ if intent == "visualisation" and not has_fit and not has_plot:
1062
+ rq = str(globals().get("refined_question", ""))
1063
+ # aq = str(globals().get("askai_question", ""))
1064
+ signal = rq + "\n" + str(intents) + "\n" + code
1065
+ tpl = _pick_viz_template(signal)
1066
+ return tpl(df) + "\n\n" + code
1067
+
1068
+ if intent == "clustering" and not has_fit:
1069
+ return clustering(df) + "\n\n" + code
1070
+
1071
+ if intent == "recommendation" and not has_fit:
1072
+ return recommendation(df) + "\n\n" + code
1073
+
1074
+ if intent == "topic_modelling" and not has_fit:
1075
+ return topic_modelling(df) + "\n\n" + code
1076
+
1077
+ if intent == "eda" and not has_fit:
1078
+ return code + "\n\nSB_heatmap(df.select_dtypes(include='number').corr())"  # inject heatmap if 'eda' intent
1079
+
1080
+ # --- Classification ------------------------------------------------
1081
+ if intent == "classification" and not has_fit:
1082
+ target = _guess_classification_target(df)
1083
+ if target:
1084
+ return classification(df) + "\n\n" + code
1085
+ # return _call_template(classification, df, target) + "\n\n" + code
1086
+
1087
+ # --- Regression ----------------------------------------------------
1088
+ if intent == "regression" and not has_fit:
1089
+ target = _guess_regression_target(df)
1090
+ if target:
1091
+ return regression(df) + "\n\n" + code
1092
+ # return _call_template(regression, df, target) + "\n\n" + code
1093
+
1094
+ # --- Anomaly detection --------------------------------------------
1095
+ if intent == "anomaly_detection":
1096
+ uses_anomaly = any(k in code for k in ("IsolationForest", "LocalOutlierFactor", "OneClassSVM"))
1097
+ if not uses_anomaly:
1098
+ return anomaly_detection(df) + "\n\n" + code
1099
+
1100
+ # --- Time-series anomaly detection --------------------------------
1101
+ if intent == "ts_anomaly_detection":
1102
+ uses_ts = "STL(" in code or "seasonal_decompose(" in code
1103
+ if not uses_ts:
1104
+ return ts_anomaly_detection(df) + "\n\n" + code
1105
+
1106
+ # --- Time-series classification --------------------------------
1107
+ if intent == "time_series_classification" and not has_fit:
1108
+ time_col = _guess_time_col(df)
1109
+ entity_col = _guess_entity_col(df)
1110
+ target_col = _guess_ts_class_target(df)
1111
+
1112
+ # If we can't confidently identify these, do NOT inject anything
1113
+ if time_col and entity_col and target_col:
1114
+ return time_series_classification(df, entity_col, time_col, target_col) + "\n\n" + code
1115
+
1116
+ # --- Dimensionality reduction --------------------------------------
1117
+ if intent == "dimensionality_reduction":
1118
+ uses_dr = any(k in code for k in ("PCA(", "TSNE("))
1119
+ if not uses_dr:
1120
+ return dimensionality_reduction(df) + "\n\n" + code
1121
+
1122
+ # --- Feature selection ---------------------------------------------
1123
+ if intent == "feature_selection":
1124
+ uses_fs = any(k in code for k in (
1125
+ "mutual_info_", "permutation_importance(", "SelectKBest(", "RFE("
1126
+ ))
1127
+ if not uses_fs:
1128
+ return feature_selection(df) + "\n\n" + code
1129
+
1130
+ # --- EDA / correlation / visualisation -----------------------------
1131
+ if intent in ("eda", "correlation_analysis", "visualisation") and not has_plot:
1132
+ if intent == "correlation_analysis":
1133
+ return eda_correlation(df) + "\n\n" + code
1134
+ else:
1135
+ return eda_overview(df) + "\n\n" + code
1136
+
1137
+ # --- Time-series forecasting ---------------------------------------
1138
+ if intent == "time_series_forecasting" and not has_fit:
1139
+ uses_ts_forecast = any(k in code for k in (
1140
+ "ARIMA", "ExponentialSmoothing", "forecast", "predict("
1141
+ ))
1142
+ if not uses_ts_forecast:
1143
+ return time_series_forecasting(df) + "\n\n" + code
1144
+
1145
+ # --- Multi-label classification -----------------------------------
1146
+ if intent in ("multilabel_classification",) and not has_fit:
1147
+ label_cols = _guess_multilabel_cols(df)
1148
+ if len(label_cols) >= 2:
1149
+ return multilabel_classification(df, label_cols) + "\n\n" + code
1150
+
1151
+ group_col = _find_unknownish_column(df)
1152
+ if group_col:
1153
+ num_cols = _guess_numeric_cols(df)
1154
+ cat_cols = _guess_categorical_cols(df, exclude={group_col})
1155
+ outcome_col = None # generic; let template skip if not present
1156
+ tpl = unknown_group_proxy_pack(df, group_col, UNKNOWN_TOKENS, num_cols, cat_cols, outcome_col)
1157
+
1158
+ # Return template + guarded (repaired) LLM code, so it never crashes
1159
+ repaired = make_numeric_vars_dynamic(relax_required_columns(fix_boxplot_placeholder(code)))
1160
+ return tpl + "\n\n" + wrap_llm_code_safe(repaired)
1161
+
1162
+ return code
1163
+
1164
+
1165
+ def fix_values_sum_numeric_only_bug(code: str) -> str:
1166
+ """
1167
+ If a previous pass injected numeric_only=True into a NumPy-style sum,
1168
+ e.g. .values.sum(numeric_only=True), strip it and canonicalize to .to_numpy().sum().
1169
+ """
1170
+ # .values.sum(numeric_only=True, ...)
1171
+ code = re.sub(
1172
+ r"\.values\s*\.sum\s*\(\s*[^)]*numeric_only\s*=\s*True[^)]*\)",
1173
+ ".to_numpy().sum()",
1174
+ code,
1175
+ flags=re.IGNORECASE,
1176
+ )
1177
+ # .to_numpy().sum(numeric_only=True, ...)
1178
+ code = re.sub(
1179
+ r"\.to_numpy\(\)\s*\.sum\s*\(\s*[^)]*numeric_only\s*=\s*True[^)]*\)",
1180
+ ".to_numpy().sum()",
1181
+ code,
1182
+ flags=re.IGNORECASE,
1183
+ )
1184
+ return code
1185
+
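Illustration on a hypothetical generated line:

    from syntaxmatrix.utils import fix_values_sum_numeric_only_bug

    print(fix_values_sum_numeric_only_bug("total = df[cols].values.sum(numeric_only=True)"))
    # -> total = df[cols].to_numpy().sum()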
48
1186
 
49
1187
  def strip_describe_slice(code: str) -> str:
50
1188
  """
@@ -59,10 +1197,12 @@ def strip_describe_slice(code: str) -> str:
59
1197
  )
60
1198
  return pat.sub(r"\1)", code)
61
1199
 
1200
+
62
1201
  def remove_plt_show(code: str) -> str:
63
1202
  """Removes all plt.show() calls from the generated code string."""
64
1203
  return "\n".join(line for line in code.splitlines() if "plt.show()" not in line)
65
1204
 
1205
+
66
1206
  def patch_plot_with_table(code: str) -> str:
67
1207
  """
68
1208
  ▸ strips every `plt.show()` (avoids warnings)
@@ -149,7 +1289,7 @@ def patch_plot_with_table(code: str) -> str:
149
1289
  ")\n"
150
1290
  )
151
1291
 
152
- tbl_block += "from syntaxmatrix.display import show\nshow(summary_table)"
1292
+ tbl_block += "show(summary_table, title='Summary Statistics')"
153
1293
 
154
1294
  # 5. inject image-export block, then table block, after the plot
155
1295
  patched = (
@@ -289,10 +1429,10 @@ def refine_eda_question(raw_question, df=None, max_points=1000):
289
1429
  "Use: tbl = df.describe().loc[['mean', '50%', 'std']].rename(index={'50%': 'median'}); display(tbl)"
290
1430
  )
291
1431
 
292
-
293
1432
  # 9. Fallback: return the raw question
294
1433
  return q
295
1434
 
1435
+
296
1436
  def patch_plot_code(code, df, user_question=None):
297
1437
 
298
1438
  # ── Early guard: abort nicely if the generated code references columns that
@@ -313,10 +1453,13 @@ def patch_plot_code(code, df, user_question=None):
313
1453
 
314
1454
  if missing_cols:
315
1455
  cols_list = ", ".join(missing_cols)
316
- return (
317
- f"print('⚠️ Column(s) \"{cols_list}\" not found in the dataset. "
318
- f"Please check the column names and try again.')"
1456
+ warning = (
1457
+ f"show('⚠️ Warning: code references missing column(s): \"{cols_list}\". "
1458
+ "These must either exist in df or be created earlier in the code; "
1459
+ "otherwise you may see a KeyError.')\n"
319
1460
  )
1461
+ # Prepend the warning but keep the original code so it can still run
1462
+ code = warning + code
320
1463
 
321
1464
  # 1. For line plots (auto-aggregate)
322
1465
  m_l = re.search(r"plt\.plot\(\s*df\[['\"](\w+)['\"]\]\s*,\s*df\[['\"](\w+)['\"]\]", code)
@@ -425,6 +1568,16 @@ def patch_plot_code(code, df, user_question=None):
425
1568
  # Fallback: Return original code
426
1569
  return code
427
1570
 
1571
+
1572
+ def ensure_matplotlib_title(code, title_var="refined_question"):
1573
+ import re
1574
+ makes_plot = re.search(r"\b(plt\.(plot|scatter|bar|hist)|ax\.(plot|scatter|bar|hist))\b", code)
1575
+ has_title = re.search(r"\b(plt\.title|ax\.set_title)\s*\(", code)
1576
+ if makes_plot and not has_title:
1577
+ code += f"\ntry:\n plt.title(str({title_var})[:120])\nexcept Exception: pass\n"
1578
+ return code
1579
+
1580
+
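Sketch of the title injection (the appended try/except uses a one-space indent, which is still valid Python):

    from syntaxmatrix.utils import ensure_matplotlib_title

    print(ensure_matplotlib_title("plt.hist(df['age'])"))
    # plt.hist(df['age'])
    # try:
    #  plt.title(str(refined_question)[:120])
    # except Exception: pass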
428
1581
  def ensure_output(code: str) -> str:
429
1582
  """
430
1583
  Guarantees that AI-generated code actually surfaces results in the UI
@@ -441,7 +1594,6 @@ def ensure_output(code: str) -> str:
441
1594
  # not a comment / print / assignment / pyplot call
442
1595
  if (last and not last.startswith(("print(", "plt.", "#")) and "=" not in last):
443
1596
  lines[-1] = f"_out = {last}"
444
- lines.append("from syntaxmatrix.display import show")
445
1597
  lines.append("show(_out)")
446
1598
 
447
1599
  # ── 3· auto-surface common stats tuples (stat, p) ───────────────────
@@ -449,14 +1601,12 @@ def ensure_output(code: str) -> str:
449
1601
  if re.search(r"\bchi2\s*,\s*p\s*,", code) and "show((" in code:
450
1602
  pass # AI already shows the tuple
451
1603
  elif re.search(r"\bchi2\s*,\s*p\s*,", code):
452
- lines.append("from syntaxmatrix.display import show")
453
1604
  lines.append("show((chi2, p))")
454
1605
 
455
1606
  # ── 4· classification report (string) ───────────────────────────────
456
1607
  cr_match = re.search(r"^\s*(\w+)\s*=\s*classification_report\(", code, re.M)
457
1608
  if cr_match and f"show({cr_match.group(1)})" not in "\n".join(lines):
458
1609
  var = cr_match.group(1)
459
- lines.append("from syntaxmatrix.display import show")
460
1610
  lines.append(f"show({var})")
461
1611
 
462
1612
  # 5-bis · pivot tables (DataFrame)
@@ -493,18 +1643,17 @@ def ensure_output(code: str) -> str:
493
1643
  assign_scalar = re.match(r"\s*(\w+)\s*=\s*.+\.shape\[\s*0\s*\]\s*$", lines[-1])
494
1644
  if assign_scalar:
495
1645
  var = assign_scalar.group(1)
496
- lines.append("from syntaxmatrix.display import show")
497
1646
  lines.append(f"show({var})")
498
1647
 
499
1648
  # ── 8. utils.ensure_output()
500
1649
  assign_df = re.match(r"\s*(\w+)\s*=\s*df\[", lines[-1])
501
1650
  if assign_df:
502
1651
  var = assign_df.group(1)
503
- lines.append("from syntaxmatrix.display import show")
504
1652
  lines.append(f"show({var})")
505
1653
 
506
1654
  return "\n".join(lines)
507
1655
 
1656
+
508
1657
  def get_plotting_imports(code):
509
1658
  imports = []
510
1659
  if "plt." in code and "import matplotlib.pyplot as plt" not in code:
@@ -524,6 +1673,7 @@ def get_plotting_imports(code):
524
1673
  code = "\n".join(imports) + "\n\n" + code
525
1674
  return code
526
1675
 
1676
+
527
1677
  def patch_pairplot(code, df):
528
1678
  if "sns.pairplot" in code:
529
1679
  # Always assign and print pairgrid
@@ -534,29 +1684,82 @@ def patch_pairplot(code, df):
534
1684
  code += "\nprint(pairgrid)"
535
1685
  return code
536
1686
 
1687
+
537
1688
  def ensure_image_output(code: str) -> str:
538
1689
  """
539
- Injects a PNG exporter in front of every plt.show() so dashboards
540
- get real <img> HTML instead of a blank cell.
1690
+ Replace each plt.show() with an indented _SMX_export_png() call.
1691
+ This keeps block indentation valid and still renders images in the dashboard.
541
1692
  """
542
1693
  if "plt.show()" not in code:
543
1694
  return code
544
1695
 
545
- exporter = (
546
- # -- NEW: use display(), not print() --------------------------
547
- "import io, base64\n"
548
- "buf = io.BytesIO()\n"
549
- "plt.savefig(buf, format='png', bbox_inches='tight')\n"
550
- "buf.seek(0)\n"
551
- "img_b64 = base64.b64encode(buf.read()).decode('utf-8')\n"
552
- "from IPython.display import display, HTML\n"
553
- "display(HTML(f'<img src=\"data:image/png;base64,{img_b64}\" "
554
- "style=\"max-width:100%;\">'))\n"
555
- "plt.close()\n"
1696
+ import re
1697
+ out_lines = []
1698
+ for ln in code.splitlines():
1699
+ if "plt.show()" not in ln:
1700
+ out_lines.append(ln)
1701
+ continue
1702
+
1703
+ # works for:
1704
+ # plt.show()
1705
+ # plt.tight_layout(); plt.show()
1706
+ # ... ; plt.show(); ... (multiple on one line)
1707
+ indent = re.match(r"^(\s*)", ln).group(1)
1708
+ parts = ln.split("plt.show()")
1709
+
1710
+ # keep whatever is before the first plt.show()
1711
+ if parts[0].strip():
1712
+ out_lines.append(parts[0].rstrip())
1713
+
1714
+ # for every plt.show() we removed, insert exporter at same indent
1715
+ for _ in range(len(parts) - 1):
1716
+ out_lines.append(indent + "_SMX_export_png()")
1717
+
1718
+ # keep whatever comes after the last plt.show()
1719
+ if parts[-1].strip():
1720
+ out_lines.append(indent + parts[-1].lstrip())
1721
+
1722
+ return "\n".join(out_lines)
1723
+
1724
+
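So a line that mixes layout and display splits cleanly at the same indentation:

    from syntaxmatrix.utils import ensure_image_output

    print(ensure_image_output("    plt.tight_layout(); plt.show()"))
    #     plt.tight_layout();
    #     _SMX_export_png()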
1725
+ def clean_llm_code(code: str) -> str:
1726
+ """
1727
+ Make LLM output safe to exec:
1728
+ - If fenced blocks exist, keep the largest one (usually the real code).
1729
+ - Otherwise strip any stray ``` / ```python lines.
1730
+ - Remove common markdown/preamble junk.
1731
+ """
1732
+ code = str(code or "")
1733
+
1734
+ # Extract fenced blocks (```python ... ``` or ``` ... ```)
1735
+ blocks = re.findall(r"```(?:python)?\s*(.*?)```", code, flags=re.I | re.S)
1736
+
1737
+ if blocks:
1738
+ # pick the largest block; small trailing blocks are usually garbage
1739
+ largest = max(blocks, key=lambda b: len(b.strip()))
1740
+ if len(largest.strip().splitlines()) >= 10:
1741
+ code = largest
1742
+ else:
1743
+ # if no meaningful block, just remove fence markers
1744
+ code = re.sub(r"^```.*?$", "", code, flags=re.M)
1745
+ else:
1746
+ # no complete blocks — still remove any stray fence lines
1747
+ code = re.sub(r"^```.*?$", "", code, flags=re.M)
1748
+
1749
+ # Strip common markdown/preamble lines
1750
+ drop_prefixes = (
1751
+ "here is", "here's", "below is", "sure,", "certainly",
1752
+ "explanation", "note:", "```"
556
1753
  )
1754
+ cleaned_lines = []
1755
+ for ln in code.splitlines():
1756
+ s = ln.strip().lower()
1757
+ if any(s.startswith(p) for p in drop_prefixes):
1758
+ continue
1759
+ cleaned_lines.append(ln)
1760
+
1761
+ return "\n".join(cleaned_lines).strip()
557
1762
 
558
- # exporter BEFORE the original plt.show()
559
- return code.replace("plt.show()", exporter + "plt.show()")
560
1763
 
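A minimal illustration of clean_llm_code on typical LLM output (hypothetical input; a fenced block shorter than 10 lines would instead only have its fence markers stripped in place):

    raw = "Sure, here is the code:\n```python\nimport pandas as pd\n# ...10+ more lines...\n```"
    cleaned = clean_llm_code(raw)
    # keeps only the body of the largest fenced block;
    # "Sure," / "Note:" style preamble lines are dropped as well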
561
1764
  def fix_groupby_describe_slice(code: str) -> str:
562
1765
  """
@@ -579,6 +1782,7 @@ def fix_groupby_describe_slice(code: str) -> str:
579
1782
  )
580
1783
  return pat.sub(repl, code)
581
1784
 
1785
+
582
1786
  def fix_importance_groupby(code: str) -> str:
583
1787
  pattern = re.compile(r"df\.groupby\(['\"]Importance['\"]\)\['\"?Importance['\"]?\]")
584
1788
  if "importance_df" in code:
@@ -625,10 +1829,12 @@ def inject_auto_preprocessing(code: str) -> str:
625
1829
  # simply prepend; model code that follows can wrap estimator in a Pipeline
626
1830
  return prep_snippet + code
627
1831
 
1832
+
628
1833
  def fix_to_datetime_errors(code: str) -> str:
629
1834
  """
630
1835
  Force every pd.to_datetime(…) call to ignore bad dates so that
631
- ‘year 16500 is out of range’ and similar issues don’t crash runs.
1836
+
1837
+ 'year 16500 is out of range' and similar issues don’t crash runs.
632
1838
  """
633
1839
  import re
634
1840
  # look for any pd.to_datetime( … )
@@ -641,25 +1847,67 @@ def fix_to_datetime_errors(code: str) -> str:
641
1847
  return f"pd.to_datetime({inside}, errors='coerce')"
642
1848
  return pat.sub(repl, code)
643
1849
 
1850
+
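The effect of fix_to_datetime_errors, on one hypothetical call:

    # before
    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
    # after -- unparseable rows become NaT instead of raising
    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d', errors='coerce')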
644
1851
  def fix_numeric_sum(code: str) -> str:
645
1852
  """
646
- Rewrites every `.sum(` call so it becomes
647
- `.sum(numeric_only=True, …)` unless that keyword is already present.
1853
+ Make .sum(...) code safe across pandas versions by removing any
1854
+ numeric_only=... argument (True/False/None) from function calls.
1855
+
1856
+ This avoids errors on pandas versions where numeric_only is not
1857
+ supported for Series/grouped sums, and we rely instead on explicit
1858
+ numeric column selection (e.g. select_dtypes) in the generated code.
648
1859
  """
649
- pattern = re.compile(r"\.sum\(\s*([^\)]*)\)")
1860
+ # Case 1: ..., numeric_only=True/False/None
1861
+ code = re.sub(
1862
+ r",\s*numeric_only\s*=\s*(True|False|None)",
1863
+ "",
1864
+ code,
1865
+ flags=re.IGNORECASE,
1866
+ )
650
1867
 
651
- def _repl(match):
652
- args = match.group(1)
653
- if "numeric_only" in args: # already safe
654
- return match.group(0)
1868
+ # Case 2: numeric_only=True/False/None, ... (as first argument)
1869
+ code = re.sub(
1870
+ r"numeric_only\s*=\s*(True|False|None)\s*,\s*",
1871
+ "",
1872
+ code,
1873
+ flags=re.IGNORECASE,
1874
+ )
655
1875
 
656
- args = args.strip()
657
- if args: # keep existing positional / kw args
658
- args += ", "
659
- return f".sum({args}numeric_only=True)"
1876
+ # Case 3: numeric_only=True/False/None (only argument)
1877
+ code = re.sub(
1878
+ r"numeric_only\s*=\s*(True|False|None)",
1879
+ "",
1880
+ code,
1881
+ flags=re.IGNORECASE,
1882
+ )
1883
+
1884
+ return code
1885
+
1886
+
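Concretely, fix_numeric_sum drops the keyword wherever it appears in the argument list (hypothetical calls):

    df.groupby('Region').sum(numeric_only=True)     # -> df.groupby('Region').sum()
    df.sum(numeric_only=False, axis=0)              # -> df.sum(axis=0)
    df[['A', 'B']].mean(axis=1, numeric_only=None)  # -> df[['A', 'B']].mean(axis=1)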
1887
+ def fix_concat_empty_list(code: str) -> str:
1888
+ """
1889
+ Make pd.concat calls resilient to empty lists of objects.
1890
+
1891
+ Transforms patterns like:
1892
+ pd.concat(frames, ignore_index=True)
1893
+ pd.concat(frames)
1894
+
1895
+ into:
1896
+ pd.concat(frames or [pd.DataFrame()], ignore_index=True)
1897
+ pd.concat(frames or [pd.DataFrame()])
1898
+
1899
+ Only triggers when the first argument is a simple variable name.
1900
+ """
1901
+ pattern = re.compile(r"pd\.concat\(\s*([A-Za-z_][A-Za-z0-9_]*)\s*(,|\))")
1902
+
1903
+ def _repl(m):
1904
+ name = m.group(1)
1905
+ sep = m.group(2) # ',' or ')'
1906
+ return f"pd.concat({name} or [pd.DataFrame()]{sep}"
660
1907
 
661
1908
  return pattern.sub(_repl, code)
662
1909
 
1910
+
663
1911
  def fix_numeric_aggs(code: str) -> str:
664
1912
  _AGG_FUNCS = ("sum", "mean")
665
1913
  pat = re.compile(rf"\.({'|'.join(_AGG_FUNCS)})\(\s*([^)]+)?\)")
@@ -673,69 +1921,66 @@ def fix_numeric_aggs(code: str) -> str:
673
1921
  return f".{func}({args}numeric_only=True)"
674
1922
  return pat.sub(_repl, code)
675
1923
 
1924
+
676
1925
  def ensure_accuracy_block(code: str) -> str:
677
1926
  """
678
- If the code fits an estimator but never prints accuracy,
679
- inject an evaluation block that re-uses *whatever variable name*
680
- appears immediately before `.fit(`.
1927
+ Inject a sensible evaluation block right after the last `<est>.fit(...)`
1928
+ Classification → accuracy + weighted F1
1929
+ Regression → R², RMSE, MAE
1930
+ Heuristic: infer task from estimator names present in the code.
681
1931
  """
682
- # Already prints accuracy? – bail out early
683
- if re.search(r"accuracy_score\s*\(", code):
1932
+ import re, textwrap
1933
+
1934
+ # If any proper metric already exists, do nothing
1935
+ if re.search(r"\b(accuracy_score|f1_score|r2_score|mean_squared_error|mean_absolute_error)\b", code):
684
1936
  return code
685
1937
 
686
- # Find the last `<var>.fit(` call
1938
+ # Find the last "<var>.fit(" occurrence to reuse the estimator variable name
687
1939
  m = list(re.finditer(r"(\w+)\.fit\s*\(", code))
688
1940
  if not m:
689
- return code # no model at all
1941
+ return code # no estimator
690
1942
 
691
- var = m[-1].group(1) # estimator variable name
1943
+ var = m[-1].group(1)
1944
+ # indent with same leading whitespace used on that line
692
1945
  indent = re.match(r"\s*", code[m[-1].start():]).group(0)
693
1946
 
694
- eval_block = textwrap.dedent(f"""
695
- {indent}# ── automatic accuracy evaluation ─────────
696
- {indent}from sklearn.metrics import accuracy_score
697
- {indent}y_pred = {var}.predict(X_test)
698
- {indent}acc = accuracy_score(y_test, y_pred)
699
- {indent}print(f"Model accuracy on hold-out set: {{acc:.2%}}")
700
- """)
1947
+ # Detect regression by estimator names / hints in code
1948
+ is_regression = bool(
1949
+ re.search(
1950
+ r"\b(LinearRegression|Ridge|Lasso|ElasticNet|ElasticNetCV|HuberRegressor|TheilSenRegressor|RANSACRegressor|"
1951
+ r"RandomForestRegressor|GradientBoostingRegressor|DecisionTreeRegressor|KNeighborsRegressor|SVR|"
1952
+ r"XGBRegressor|LGBMRegressor|CatBoostRegressor)\b", code
1953
+ )
1954
+ or re.search(r"\bOLS\s*\(", code)
1955
+ or re.search(r"\bRegressor\b", code)
1956
+ )
1957
+
1958
+ if is_regression:
1959
+ # inject numpy import if needed for RMSE
1960
+ if "import numpy as np" not in code and "np." not in code:
1961
+ code = "import numpy as np\n" + code
1962
+ eval_block = textwrap.dedent(f"""
1963
+ {indent}# ── automatic regression evaluation ─────────
1964
+ {indent}from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
1965
+ {indent}y_pred = {var}.predict(X_test)
1966
+ {indent}r2 = r2_score(y_test, y_pred)
1967
+ {indent}rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
1968
+ {indent}mae = float(mean_absolute_error(y_test, y_pred))
1969
+ {indent}print(f"R²: {{r2:.4f}} | RMSE: {{rmse:.4f}} | MAE: {{mae:.4f}}")
1970
+ """)
1971
+ else:
1972
+ eval_block = textwrap.dedent(f"""
1973
+ {indent}# ── automatic classification evaluation ─────────
1974
+ {indent}from sklearn.metrics import accuracy_score, f1_score
1975
+ {indent}y_pred = {var}.predict(X_test)
1976
+ {indent}acc = accuracy_score(y_test, y_pred)
1977
+ {indent}f1 = f1_score(y_test, y_pred, average='weighted')
1978
+ {indent}print(f"Accuracy: {{acc:.2%}} | F1 (weighted): {{f1:.3f}}")
1979
+ """)
701
1980
 
702
1981
  insert_at = code.find("\n", m[-1].end()) + 1
703
1982
  return code[:insert_at] + eval_block + code[insert_at:]
704
1983
 
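For a classification cell, the injected block amounts to the following (a sketch assuming the generated cell already defines model, X_test and y_test):

    model.fit(X_train, y_train)
    # ── automatic classification evaluation ─────────
    from sklearn.metrics import accuracy_score, f1_score
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f"Accuracy: {acc:.2%} | F1 (weighted): {f1:.3f}")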
705
- def classify(prompt: str) -> str:
706
- """
707
- Very-light intent classifier.
708
- Returns one of:
709
- 'stat_test' | 'time_series' | 'clustering'
710
- 'classification' | 'regression' | 'eda'
711
- """
712
- p = prompt.lower().strip()
713
- greetings = {"hi", "hello", "hey", "good morning", "good afternoon", "good evening", "greetings"}
714
- if any(p.startswith(g) or p == g for g in greetings):
715
- return "greeting"
716
-
717
- if any(k in p for k in ("t-test", "anova", "p-value")):
718
- return "stat_test"
719
- if "forecast" in p or "prophet" in p:
720
- return "time_series"
721
- if "cluster" in p or "kmeans" in p:
722
- return "clustering"
723
- if any(k in p for k in ("accuracy", "precision", "roc")):
724
- return "classification"
725
- if any(k in p for k in ("rmse", "r2", "mae")):
726
- return "regression"
727
- return "eda"
728
-
729
- def auto_inject_template(code: str, intent: str, df) -> str:
730
- """If the LLM forgot the core logic, prepend a skeleton block."""
731
- has_fit = ".fit(" in code
732
-
733
- if intent == "classification" and not has_fit:
734
- # guess a y column that contains 'diabetes' as in your dataset
735
- target = next((c for c in df.columns if "diabetes" in c.lower()), None)
736
- if target:
737
- return classification(df, target) + "\n\n" + code
738
- return code
739
1984
 
740
1985
  def fix_scatter_and_summary(code: str) -> str:
741
1986
  """
@@ -763,6 +2008,7 @@ def fix_scatter_and_summary(code: str) -> str:
763
2008
 
764
2009
  return code
765
2010
 
2011
+
766
2012
  def auto_format_with_black(code: str) -> str:
767
2013
  """
768
2014
  Format the generated code with Black. Falls back silently if Black
@@ -777,6 +2023,7 @@ def auto_format_with_black(code: str) -> str:
777
2023
  except Exception:
778
2024
  return code
779
2025
 
2026
+
780
2027
  def ensure_preproc_in_pipeline(code: str) -> str:
781
2028
  """
782
2029
  If code defines `preproc = ColumnTransformer(...)` but then builds
@@ -789,52 +2036,1128 @@ def ensure_preproc_in_pipeline(code: str) -> str:
789
2036
  code
790
2037
  )
791
2038
 
2039
+
792
2040
  def fix_plain_prints(code: str) -> str:
793
2041
  """
794
- Rewrite print(<var>) → show(<var>) when <var> looks like
795
- a pandas / numpy / sklearn object (heuristic: not a string literal).
2042
+ Rewrite bare `print(var)` where var looks like a dataframe/series/ndarray/etc
2043
+ to go through SyntaxMatrix's smart display (so it renders in the dashboard).
2044
+ Keeps string prints alone.
796
2045
  """
797
- import re
798
- return re.sub(
799
- r"print\((\w+)\)",
2046
+
2047
+ # Skip obvious string-literal prints
2048
+ new = re.sub(
2049
+ r"(?m)^\s*print\(\s*([A-Za-z_]\w*)\s*\)\s*$",
800
2050
  r"from syntaxmatrix.display import show\nshow(\1)",
801
2051
  code,
802
2052
  )
2053
+ return new
2054
+
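So a bare, top-level print of a name like summary_df becomes:

    from syntaxmatrix.display import show
    show(summary_df)

while print("done") is left untouched, since a string literal never matches the identifier-only pattern.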
803
2055
 
804
- # --------------------------------------------------------------------------
805
- # ✂
806
- # --------------------------------------------------------------------------
807
- def drop_bad_classification_metrics(code: str, y) -> str:
2056
+ def fix_print_html(code: str) -> str:
808
2057
  """
809
- If the prediction target is continuous (i.e. a regression task) and the
810
- generated code mistakenly calls classification metrics such as
811
- `accuracy_score`, `classification_report`, or `confusion_matrix`,
812
- comment those lines out so the cell can still run.
2058
+ Ensure that HTML / DataFrame HTML are *displayed* (and captured by the kernel),
2059
+ not printed as `<IPython.core.display.HTML object>` to the server console.
2060
+ - Rewrites: print(HTML(...)) → display(HTML(...))
2061
+ print(display(...)) → display(...)
2062
+ print(df.to_html(...)) → display(HTML(df.to_html(...)))
2063
+ Also prepends `from IPython.display import display, HTML` if required.
2064
+ """
2065
+ import re
2066
+
2067
+ new = code
2068
+
2069
+ # 1) print(HTML(...)) -> display(HTML(...))
2070
+ new = re.sub(r"(?m)^\s*print\s*\(\s*HTML\s*\(", "display(HTML(", new)
2071
+
2072
+ # 2) print(display(...)) -> display(...)
2073
+ new = re.sub(r"(?m)^\s*print\s*\(\s*display\s*\(", "display(", new)
2074
+
2075
+ # 3) print(<expr>.to_html(...)) -> display(HTML(<expr>.to_html(...)))
2076
+ new = re.sub(
2077
+ r"(?m)^\s*print\s*\(\s*([A-Za-z_]\w*(?:\.[A-Za-z_]\w*)*)\s*\.to_html\s*\(",
2078
+ r"display(HTML(\1.to_html(", new
2079
+ )
2080
+
2081
+ # If code references HTML() or display() make sure the import exists
2082
+ if ("HTML(" in new or re.search(r"\bdisplay\s*\(", new)) and \
2083
+ "from IPython.display import display, HTML" not in new:
2084
+ new = "from IPython.display import display, HTML\n" + new
2085
+
2086
+ return new
2087
+
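Taken together, a hypothetical line such as

    print(report_df.to_html(index=False))

is rewritten to

    display(HTML(report_df.to_html(index=False)))

with the display/HTML import prepended when missing.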
813
2088
 
814
- Works whether `y` is:
815
- • a pandas Series -> y.dtype.kind is available
816
- • a pandas DataFrame (multi-column) -> we infer by looking at *all*
2089
+ def ensure_ipy_display(code: str) -> str:
817
2090
  """
818
- # ── decide whether y looks continuous ────────────────────────────────
819
- try:
820
- kind = y.dtype.kind # Series path
821
- except AttributeError:
822
- # DataFrame path: regression if *every* column’s dtype is numeric/datetime
823
- numeric_kinds = set("fiuM") # float, int, unsigned, datetime
824
- col_kinds = {dt.kind for dt in getattr(y, "dtypes", [])}
825
- kind = "f" if col_kinds and col_kinds.issubset(numeric_kinds) else "O"
826
-
827
- # ── if regression, strip classification lines ───────────────────────
828
- if kind in "fM": # float or datetime
829
- patterns = [
830
- r"\n.*accuracy_score[^\n]*",
831
- r"\n.*classification_report[^\n]*",
832
- r"\n.*confusion_matrix[^\n]*",
833
- ]
834
- for pat in patterns:
2091
+ Guarantee that the cell has proper IPython display imports so that
2092
+ display(HTML(...)) produces 'display_data' events the kernel captures.
2093
+ """
2094
+ if "display(" in code and "from IPython.display import display, HTML" not in code:
2095
+ return "from IPython.display import display, HTML\n" + code
2096
+ return code
2097
+
2098
+
2099
+ def drop_bad_classification_metrics(code: str, y_or_df) -> str:
2100
+ """
2101
+ Remove classification metrics (accuracy_score, classification_report, confusion_matrix)
2102
+ if the generated cell is *regression*. We infer this from:
2103
+ 1) The estimator names in the code (LinearRegression, OLS, Regressor*, etc.), OR
2104
+ 2) The target dtype if we can parse y = df['...'] and have the DataFrame.
2105
+ Safe across datasets and queries.
2106
+ """
2107
+ import re
2108
+ import pandas as pd
2109
+
2110
+ # 1) Heuristic by estimator names in the *code* (fast path)
2111
+ regression_by_model = bool(re.search(
2112
+ r"\b(LinearRegression|Ridge|Lasso|ElasticNet|ElasticNetCV|HuberRegressor|TheilSenRegressor|RANSACRegressor|"
2113
+ r"RandomForestRegressor|GradientBoostingRegressor|DecisionTreeRegressor|KNeighborsRegressor|SVR|"
2114
+ r"XGBRegressor|LGBMRegressor|CatBoostRegressor)\b", code
2115
+ ) or re.search(r"\bOLS\s*\(", code))
2116
+
2117
+ is_regression = regression_by_model
2118
+
2119
+ # 2) If not obvious from the model, try to infer from y dtype (if we can)
2120
+ if not is_regression:
2121
+ try:
2122
+ # Try to parse: y = df['target']
2123
+ m = re.search(r"y\s*=\s*df\[['\"]([^'\"]+)['\"]\]", code)
2124
+ if m and hasattr(y_or_df, "columns") and m.group(1) in getattr(y_or_df, "columns", []):
2125
+ y = y_or_df[m.group(1)]
2126
+ if pd.api.types.is_numeric_dtype(y) and y.nunique(dropna=True) > 10:
2127
+ is_regression = True
2128
+ else:
2129
+ # If a Series was passed
2130
+ y = y_or_df
2131
+ if hasattr(y, "dtype") and pd.api.types.is_numeric_dtype(y) and y.nunique(dropna=True) > 10:
2132
+ is_regression = True
2133
+ except Exception:
2134
+ pass
2135
+
2136
+ if is_regression:
2137
+ # Strip classification-only lines
2138
+ for pat in (r"\n.*accuracy_score[^\n]*", r"\n.*classification_report[^\n]*", r"\n.*confusion_matrix[^\n]*"):
835
2139
  code = re.sub(pat, "", code, flags=re.I)
836
2140
 
837
2141
  return code
838
2142
 
839
- # from syntaxmatrix.core import SyntaxMUI
840
- # ai_generate_code = SyntaxMUI.ai_generate_code
2143
+
2144
+ def force_capture_display(code: str) -> str:
2145
+ """
2146
+ Ensure our executor captures HTML output:
2147
+ - Remove any import that would override our 'display' hook.
2148
+ - Keep/allow importing HTML only.
2149
+ - Handle alias cases like 'display as d'.
2150
+ """
2151
+ import re
2152
+ new = code
2153
+
2154
+ # 'from IPython.display import display, HTML' -> keep HTML only
2155
+ new = re.sub(
2156
+ r"(?m)^\s*from\s+IPython\.display\s+import\s+display\s*,\s*HTML\s*(?:as\s+([A-Za-z_]\w*))?\s*$",
2157
+ r"from IPython.display import HTML\1", new
2158
+ )
2159
+
2160
+ # 'from IPython.display import display as d' -> 'd = display'
2161
+ new = re.sub(
2162
+ r"(?m)^\s*from\s+IPython\.display\s+import\s+display\s+as\s+([A-Za-z_]\w+)\s*$",
2163
+ r"\1 = display", new
2164
+ )
2165
+
2166
+ # 'from IPython.display import display' -> remove (use our injected display)
2167
+ new = re.sub(
2168
+ r"(?m)^\s*from\s+IPython\.display\s+import\s+display\s*$",
2169
+ r"# display import removed (SMX capture active)", new
2170
+ )
2171
+
2172
+ # If someone does 'import IPython.display as disp' and calls disp.display(...), rewrite to display(...)
2173
+ new = re.sub(
2174
+ r"(?m)\bIPython\.display\.display\s*\(",
2175
+ "display(", new
2176
+ )
2177
+ # resolve 'import IPython.display as <alias>' aliases, then rewrite
2178
+ # '<alias>.display(' calls so our capture hook sees them
2179
+ for _alias in re.findall(r"(?m)^\s*import\s+IPython\.display\s+as\s+([A-Za-z_]\w*)\s*$", new):
2180
+ new = re.sub(rf"(?m)\b{_alias}\.display\s*\(", "display(", new)
2181
2182
+ return new
2183
+
2184
+
2185
+ def strip_matplotlib_show(code: str) -> str:
2186
+ """Remove blocking plt.show() calls (we export base64 instead)."""
2187
+ import re
2188
+ return re.sub(r"(?m)^\s*plt\.show\(\)\s*$", "", code)
2189
+
2190
+
2191
+ def inject_display_shim(code: str) -> str:
2192
+ """
2193
+ Provide display()/HTML() if missing, forwarding to our executor hook.
2194
+ Harmless if the names already exist.
2195
+ """
2196
+ shim = (
2197
+ "try:\n"
2198
+ " display\n"
2199
+ "except NameError:\n"
2200
+ " def display(obj=None, **kwargs):\n"
2201
+ " __builtins__.get('_smx_display', print)(obj)\n"
2202
+ "try:\n"
2203
+ " HTML\n"
2204
+ "except NameError:\n"
2205
+ " class HTML:\n"
2206
+ " def __init__(self, data): self.data = str(data)\n"
2207
+ " def _repr_html_(self): return self.data\n"
2208
+ "\n"
2209
+ )
2210
+ return shim + code
2211
+
2212
+
2213
+ def strip_spurious_column_tokens(code: str) -> str:
2214
+ """
2215
+ Remove common stop-words ('the','whether', ...) when they appear
2216
+ inside column lists, e.g.:
2217
+ predictors = ['BMI','the','HbA1c']
2218
+ df[['GGT','whether','BMI']]
2219
+ Leaves other strings intact.
2220
+ """
2221
+ STOP = {
2222
+ "the","whether","a","an","and","or","of","to","in","on","for","by",
2223
+ "with","as","at","from","that","this","these","those","is","are","was","were",
2224
+ "coef", "Coef", "coefficient", "Coefficient"
2225
+ }
2226
+
2227
+ def _norm(s: str) -> str:
2228
+ return re.sub(r"[^a-z0-9]+", "", s.lower())
2229
+
2230
+ def _clean_list(content: str) -> str:
2231
+ # Rebuild a string list, keeping only non-stopword items
2232
+ items = re.findall(r"(['\"])(.*?)\1", content)
2233
+ if not items:
2234
+ return "[" + content + "]"
2235
+ keep = [f"{q}{s}{q}" for (q, s) in items if _norm(s) not in STOP]
2236
+ return "[" + ", ".join(keep) + "]"
2237
+
2238
+ # Variable assignments: predictors/features/columns/cols = [...]
2239
+ code = re.sub(
2240
+ r"(?m)\b(predictors|features|columns|cols)\s*=\s*\[([^\]]+)\]",
2241
+ lambda m: f"{m.group(1)} = " + _clean_list(m.group(2)),
2242
+ code
2243
+ )
2244
+
2245
+ # df[[ ... ]] selections
2246
+ code = re.sub(
2247
+ r"df\s*\[\s*\[([^\]]+)\]\s*\]", lambda m: "df[" + _clean_list(m.group(1)) + "]", code)
2248
+
2249
+ return code
2250
+
2251
+
2252
+ def patch_prefix_seaborn_calls(code: str) -> str:
2253
+ """
2254
+ Ensure bare seaborn calls are prefixed with `sns.`.
2255
+ E.g., `barplot(...)` → `sns.barplot(...)`, `heatmap(...)` → `sns.heatmap(...)`, etc.
2256
+ """
2257
+ if "sns." in code:
2258
+ # still fix any leftover bare calls alongside prefixed ones
2259
+ pass
2260
+
2261
+ # functions commonly used from seaborn
2262
+ funcs = [
2263
+ "barplot","countplot","boxplot","violinplot","stripplot","swarmplot",
2264
+ "histplot","kdeplot","jointplot","pairplot","heatmap","clustermap",
2265
+ "scatterplot","lineplot","catplot","displot","lmplot"
2266
+ ]
2267
+ # Replace bare function calls not already qualified by a dot (e.g., obj.barplot)
2268
+ # (?<![\w.]) ensures no preceding word char or dot; avoids touching obj.barplot or mybarplot
2269
+ pattern = re.compile(r"(?<![\w\.])(" + "|".join(funcs) + r")\s*\(", flags=re.MULTILINE)
2270
+
2271
+ def _add_prefix(m):
2272
+ fn = m.group(1)
2273
+ return f"sns.{fn}("
2274
+
2275
+ return pattern.sub(_add_prefix, code)
2276
+
2277
+
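e.g. a bare heatmap(df.corr(), annot=True) becomes sns.heatmap(df.corr(), annot=True), while grid.barplot(...) and mybarplot(...) are left alone thanks to the (?<![\w.]) guard:

    patched = patch_prefix_seaborn_calls("countplot(x='Sex', data=df)")
    # -> "sns.countplot(x='Sex', data=df)"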
2278
+ def patch_ensure_seaborn_import(code: str) -> str:
2279
+ """
2280
+ If seaborn is used (sns.) ensure `import seaborn as sns` exists once.
2281
+ Also set a quiet theme for consistent visuals.
2282
+ """
2283
+ needs_sns = "sns." in code
2284
+ has_import = bool(re.search(r"^\s*import\s+seaborn\s+as\s+sns\s*$", code, flags=re.MULTILINE))
2285
+ if needs_sns and not has_import:
2286
+ # Insert after the first block of imports if possible, else at top
2287
+ import_block = re.search(r"^(?:\s*(?:from\s+\S+\s+import\s+.+|import\s+\S+)\s*\n)+", code, flags=re.MULTILINE)
2288
+ inject = "import seaborn as sns\ntry:\n sns.set_theme()\nexcept Exception:\n pass\n"
2289
+ if import_block:
2290
+ start = import_block.end()
2291
+ code = code[:start] + inject + code[start:]
2292
+ else:
2293
+ code = inject + code
2294
+ return code
2295
+
2296
+
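The injected prelude is equivalent to:

    import seaborn as sns
    try:
        sns.set_theme()
    except Exception:
        pass

placed just after the first import block when one exists, else at the top of the cell.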
2297
+ def patch_pie_chart(code, df, user_question=None, top_n: int = 12):
2298
+ """
2299
+ Normalise pie-chart requests.
2300
+
2301
+ Supports three patterns:
2302
+ A) Threshold split cohorts, e.g. "HbA1c ≥ 6.5 vs < 6.5" → two pies per categorical + grouped bar.
2303
+ B) Facet-by categories, e.g. "Ethnicity across BMI categories" or "bin BMI into Normal/Overweight/Obese"
2304
+ → one pie per facet level (grid) + counts bar of facet sizes.
2305
+ C) Single pie when no split/facet is requested.
2306
+
2307
+ Notes:
2308
+ - Pie variables must be categorical (or numeric binned).
2309
+ - Facet variables can be categorical or numeric (we bin numeric; BMI gets WHO bins).
2310
+ """
2311
+
2312
+ q = (user_question or "")
2313
+ q_low = q.lower()
2314
+
2315
+ # Prefer explicit: df['col'].value_counts()
2316
+ m = re.search(r"df\[['\"](\w+)['\"]\]\.value_counts\(", code)
2317
+ col = m.group(1) if m else None
2318
+
2319
+ # ---------- helpers ----------
2320
+ def _is_cat(col):
2321
+ return (str(df[col].dtype).startswith("category")
2322
+ or df[col].dtype == "object"
2323
+ or (pd.api.types.is_numeric_dtype(df[col]) and df[col].nunique() <= 20))
2324
+
2325
+ def _cats_from_question(question: str):
2326
+ found = []
2327
+ for c in df.columns:
2328
+ if c.lower() in question.lower() and _is_cat(c):
2329
+ found.append(c)
2330
+ # dedupe preserve order
2331
+ seen, out = set(), []
2332
+ for c in found:
2333
+ if c not in seen:
2334
+ out.append(c); seen.add(c)
2335
+ return out
2336
+
2337
+ def _fallback_cat():
2338
+ cats = [(c, df[c].nunique()) for c in df.columns if _is_cat(c) and df[c].nunique() > 1]
2339
+ if not cats: return None
2340
+ cats.sort(key=lambda t: t[1])
2341
+ return cats[0][0]
2342
+
2343
+ def _infer_comp_pref(question: str) -> str:
2344
+ ql = (question or "").lower()
2345
+ if "heatmap" in ql or "matrix" in ql:
2346
+ return "heatmap"
2347
+ if "100%" in ql or "100 percent" in ql or "proportion" in ql or "share" in ql or "composition" in ql:
2348
+ return "stacked_bar_pct"
2349
+ if "stacked" in ql:
2350
+ return "stacked_bar"
2351
+ if "grouped" in ql or "clustered" in ql or "side-by-side" in ql:
2352
+ return "grouped_bar"
2353
+ return "counts_bar"
2354
+
2355
+ # parse threshold split like "HbA1c ≥ 6.5"
2356
+ def _parse_split(question: str):
2357
+ ops_map = {"≥": ">=", "≤": "<=", ">=": ">=", "<=": "<=", ">": ">", "<": "<", "==": "==", "=": "=="}
2358
+ m = re.search(r"([A-Za-z_][A-Za-z0-9_ ]*)\s*(≥|<=|≤|>=|>|<|==|=)\s*([0-9]+(?:\.[0-9]+)?)", question)
2359
+ if not m: return None
2360
+ col_raw, op_raw, val_raw = m.group(1).strip(), m.group(2), m.group(3)
2361
+ op = ops_map.get(op_raw)
2362
+ if not op: return None
2363
+ # case-insensitive column match
2364
+ candidates = {c.lower(): c for c in df.columns}
2365
+ col = candidates.get(col_raw.lower())
2366
+ if not col: return None
2367
+ try: val = float(val_raw)
2368
+ except Exception: return None
2369
+ return (col, op, val)
2370
+
2371
+ # facet extractor: "by/ across / within each / per <col>", or "bin <col>", or named category list
2372
+ def _extract_facet(question: str):
2373
+ # 1) explicit "by/ across / within / per <col>"
2374
+ for kw in [" by ", " across ", " within ", " within each ", " per "]:
2375
+ m = re.search(kw + r"([A-Za-z_][A-Za-z0-9_ ]*)", " " + question + " ", flags=re.IGNORECASE)
2376
+ if m:
2377
+ col_raw = m.group(1).strip()
2378
+ candidates = {c.lower(): c for c in df.columns}
2379
+ if col_raw.lower() in candidates:
2380
+ return (candidates[col_raw.lower()], "auto")
2381
+ # 2) "bin <col>"
2382
+ m2 = re.search(r"bin\s+([A-Za-z_][A-Za-z0-9_ ]*)", question, flags=re.IGNORECASE)
2383
+ if m2:
2384
+ col_raw = m2.group(1).strip()
2385
+ candidates = {c.lower(): c for c in df.columns}
2386
+ if col_raw.lower() in candidates:
2387
+ return (candidates[col_raw.lower()], "bin")
2388
+ # 3) BMI special: mentions of normal/overweight/obese imply BMI categories
2389
+ if any(kw in question.lower() for kw in ["normal", "overweight", "obese", "obesity"]) and \
2390
+ any(c.lower() == "bmi" for c in df.columns.str.lower()):
2391
+ bmi_col = [c for c in df.columns if c.lower() == "bmi"][0]
2392
+ return (bmi_col, "bmi")
2393
+ return None
2394
+
2395
+ def _bmi_bins(series: pd.Series):
2396
+ # WHO cutoffs
2397
+ bins = [-np.inf, 18.5, 25, 30, np.inf]
2398
+ labels = ["Underweight (<18.5)", "Normal (18.5–24.9)", "Overweight (25–29.9)", "Obese (≥30)"]
2399
+ return pd.cut(series.astype(float), bins=bins, labels=labels, right=False)
2400
+
2401
+ wants_pie = ("pie" in q_low) or ("plt.pie(" in code) or ("kind='pie'" in code) or ('kind="pie"' in code)
2402
+ if not wants_pie:
2403
+ return code
2404
+
2405
+ split = _parse_split(q)
2406
+ facet = _extract_facet(q)
2407
+ cats = _cats_from_question(q)
2408
+ _comp_pref = _infer_comp_pref(q)
2409
+
2410
+ # Prefer explicitly referenced categorical like Ethnicity, Smoking_Status, Physical_Activity_Level
2411
+ for hard in ["Ethnicity", "Smoking_Status", "Physical_Activity_Level"]:
2412
+ if hard in df.columns and hard not in cats and hard.lower() in q_low:
2413
+ cats.append(hard)
2414
+
2415
+ # --------------- CASE A: threshold split (cohorts) ---------------
2416
+ if split:
2417
+ if not (cats or any(_is_cat(c) for c in df.columns)):
2418
+ return code
2419
+ if not cats:
2420
+ pool = [(c, df[c].nunique()) for c in df.columns if _is_cat(c) and df[c].nunique() > 1]
2421
+ pool.sort(key=lambda t: t[1])
2422
+ cats = [t[0] for t in pool[:3]] if pool else []
2423
+ if not cats:
2424
+ return code
2425
+
2426
+ split_col, op, val = split
2427
+ cond_str = f"(df['{split_col}'] {op} {val})"
2428
+ snippet = f"""
2429
+ import numpy as np
2430
+ import pandas as pd
2431
+ import matplotlib.pyplot as plt
2432
+
2433
+ _mask_a = ({cond_str}) & df['{split_col}'].notna()
2434
+ _mask_b = (~({cond_str})) & df['{split_col}'].notna()
2435
+
2436
+ _cohort_a_name = "{split_col} {op} {val}"
2437
+ _cohort_b_name = "NOT ({split_col} {op} {val})"
2438
+
2439
+ _cat_cols = {cats!r}
2440
+ n = len(_cat_cols)
2441
+ fig, axes = plt.subplots(nrows=n, ncols=2, figsize=(12, 5*n))
2442
+ if n == 1:
2443
+ axes = np.array([axes])
2444
+
2445
+ for i, col in enumerate(_cat_cols):
2446
+ s_a = df.loc[_mask_a, col].astype(str).value_counts().nlargest({top_n})
2447
+ s_b = df.loc[_mask_b, col].astype(str).value_counts().nlargest({top_n})
2448
+
2449
+ ax_a = axes[i, 0]; ax_b = axes[i, 1]
2450
+ if len(s_a) > 0:
2451
+ ax_a.pie(s_a.values, labels=[str(x) for x in s_a.index],
2452
+ autopct='%1.1f%%', startangle=90, counterclock=False)
2453
+ ax_a.set_title(f"{{col}} — {{_cohort_a_name}}"); ax_a.axis('equal')
2454
+
2455
+ if len(s_b) > 0:
2456
+ ax_b.pie(s_b.values, labels=[str(x) for x in s_b.index],
2457
+ autopct='%1.1f%%', startangle=90, counterclock=False)
2458
+ ax_b.set_title(f"{{col}} — {{_cohort_b_name}}"); ax_b.axis('equal')
2459
+
2460
+ plt.tight_layout(); plt.show()
2461
+
2462
+ # grouped bar complement
2463
+ for col in _cat_cols:
2464
+ _tmp = (df.loc[df['{split_col}'].notna(), [col, '{split_col}']]
2465
+ .assign(__cohort=np.where({cond_str}, _cohort_a_name, _cohort_b_name)))
2466
+ _tab = _tmp.groupby([col, "__cohort"]).size().unstack("__cohort").fillna(0)
2467
+ _tab = _tab.loc[_tab.sum(axis=1).sort_values(ascending=False).index[:{top_n}]]
2468
+
2469
+ if _comp_pref == "grouped_bar":
2470
+ ax = _tab.plot(kind='bar', rot=0, figsize=(10, 4))
2471
+ ax.set_title(f"{col} by cohort (grouped)")
2472
+ ax.set_xlabel(col); ax.set_ylabel("Count")
2473
+ plt.tight_layout(); plt.show()
2474
+
2475
+ elif _comp_pref == "stacked_bar":
2476
+ ax = _tab.plot(kind='bar', stacked=True, rot=0, figsize=(10, 4))
2477
+ ax.set_title(f"{col} by cohort (stacked)")
2478
+ ax.set_xlabel(col); ax.set_ylabel("Count")
2479
+ plt.tight_layout(); plt.show()
2480
+
2481
+ elif _comp_pref == "stacked_bar_pct":
2482
+ _perc = _tab.div(_tab.sum(axis=1), axis=0) * 100
2483
+ ax = _perc.plot(kind='bar', stacked=True, rot=0, figsize=(10, 4))
2484
+ ax.set_title(f"{col} by cohort (100% stacked)")
2485
+ ax.set_xlabel(col); ax.set_ylabel("Percent")
2486
+ plt.tight_layout(); plt.show()
2487
+
2488
+ elif _comp_pref == "heatmap":
2489
+ _perc = _tab.div(_tab.sum(axis=1), axis=0) * 100
2490
+ import numpy as np
2491
+ fig, ax = plt.subplots(figsize=(8, max(3, 0.35*len(_perc))))
2492
+ im = ax.imshow(_perc.values, aspect='auto')
2493
+ ax.set_xticks(range(_perc.shape[1])); ax.set_xticklabels(_perc.columns, rotation=0)
2494
+ ax.set_yticks(range(_perc.shape[0])); ax.set_yticklabels(_perc.index)
2495
+ ax.set_title(f"{col} by cohort — % heatmap")
2496
+ for i in range(_perc.shape[0]):
2497
+ for j in range(_perc.shape[1]):
2498
+ ax.text(j, i, f"{{_perc.values[i, j]:.1f}}%", ha="center", va="center")
2499
+ fig.colorbar(im, ax=ax, label="%")
2500
+ plt.tight_layout(); plt.show()
2501
+
2502
+ else: # counts_bar (default)
2503
+ ax = _tab.sum(axis=1).plot(kind='bar', rot=0, figsize=(10, 3))
2504
+ ax.set_title(f"{col}: total counts (both cohorts)")
2505
+ ax.set_xlabel(col); ax.set_ylabel("Count")
2506
+ plt.tight_layout(); plt.show()
2507
+ """.lstrip()
2508
+ return snippet
2509
+
2510
+ # --------------- CASE B: facet-by (categories/bins) ---------------
2511
+ if facet:
2512
+ facet_col, how = facet
2513
+ # Build facet series
2514
+ if pd.api.types.is_numeric_dtype(df[facet_col]):
2515
+ if how == "bmi":
2516
+ facet_series = _bmi_bins(df[facet_col])
2517
+ else:
2518
+ # generic numeric bins: 3 equal-width bins by default
2519
+ facet_series = pd.cut(df[facet_col].astype(float), bins=3)
2520
+ else:
2521
+ facet_series = df[facet_col].astype(str)
2522
+
2523
+ # Choose pie dimension (categorical to count inside each facet)
2524
+ pie_dim = None
2525
+ for c in cats:
2526
+ if c in df.columns and _is_cat(c):
2527
+ pie_dim = c; break
2528
+ if pie_dim is None:
2529
+ pie_dim = _fallback_cat()
2530
+ if pie_dim is None:
2531
+ return code
2532
+
2533
+ snippet = f"""
2534
+ import math
2535
+ import pandas as pd
2536
+ import numpy as np
+ import matplotlib.pyplot as plt
2537
+
2538
+ df = df.copy()
2539
+ _preferred = "{facet_col}" if "{facet_col}" in df.columns else None
2540
+
2541
+ def _select_facet_col(df, preferred=None):
2542
+ if preferred is not None:
2543
+ return preferred
2544
+ # Prefer low-cardinality categoricals (readable pies/grids)
2545
+ cat_cols = [
2546
+ c for c in df.columns
2547
+ if (df[c].dtype == 'object' or str(df[c].dtype).startswith('category'))
2548
+ and df[c].nunique() > 1 and df[c].nunique() <= 20
2549
+ ]
2550
+ if cat_cols:
2551
+ cat_cols.sort(key=lambda c: df[c].nunique())
2552
+ return cat_cols[0]
2553
+ # Else fall back to first usable numeric
2554
+ num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and df[c].nunique() > 1]
2555
+ return num_cols[0] if num_cols else None
2556
+
2557
+ _facet_col = _select_facet_col(df, _preferred)
2558
+
2559
+ if _facet_col is None:
2560
+ # Nothing suitable → single facet keeps pipeline alive
2561
+ df["__facet__"] = "All"
2562
+ else:
2563
+ s = df[_facet_col]
2564
+ if pd.api.types.is_numeric_dtype(s):
2565
+ # Robust numeric binning: quantiles first, fallback to equal-width
2566
+ uniq = pd.Series(s).dropna().nunique()
2567
+ q = 3 if uniq < 10 else 4 if uniq < 30 else 5
2568
+ try:
2569
+ df["__facet__"] = pd.qcut(s.astype(float), q=q, duplicates="drop")
2570
+ except Exception:
2571
+ df["__facet__"] = pd.cut(s.astype(float), bins=q)
2572
+ else:
2573
+ # Cap long tails; keep top categories
2574
+ vc = s.astype(str).value_counts()
2575
+ keep = vc.index[:{top_n}]
2576
+ df["__facet__"] = s.astype(str).where(s.astype(str).isin(keep), other="Other")
2577
+
2578
+ levels = [str(x) for x in df["__facet__"].dropna().unique().tolist()]
2579
+ levels = [x for x in levels if x != "nan"]
2580
+ levels.sort()
2581
+
2582
+ m = len(levels)
2583
+ cols = 3 if m >= 3 else m or 1
2584
+ rows = int(math.ceil(m / cols))
2585
+
2586
+ fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(4*cols, 4*rows))
2587
+ if not isinstance(axes, (list, np.ndarray)):
2588
+ axes = np.array([[axes]])
2589
+ axes = axes.reshape(rows, cols)
2590
+
2591
+ for i, lvl in enumerate(levels):
2592
+ r, c = divmod(i, cols)
2593
+ ax = axes[r, c]
2594
+ s = (df.loc[df["__facet"].astype(str) == str(lvl), "{pie_dim}"]
2595
+ .astype(str).value_counts().nlargest({top_n}))
2596
+ if len(s) > 0:
2597
+ ax.pie(s.values, labels=[str(x) for x in s.index],
2598
+ autopct='%1.1f%%', startangle=90, counterclock=False)
2599
+ ax.set_title(f"{pie_dim} — {{lvl}}"); ax.axis('equal')
2600
+
2601
+ # hide any empty subplots
2602
+ for j in range(m, rows*cols):
2603
+ r, c = divmod(j, cols)
2604
+ axes[r, c].axis("off")
2605
+
2606
+ plt.tight_layout(); plt.show()
2607
+
2608
+ # --- companion visual (adaptive) ---
2609
+ _comp_pref = "{_comp_pref}"
2610
+ # build contingency table: pie_dim x facet
2611
+ _tab = (df[["__facet__", "{pie_dim}"]]
2612
+ .dropna()
2613
+ .astype({{"__facet__": str, "{pie_dim}": str}})
2614
+ .value_counts()
2615
+ .unstack(level="__facet__")
2616
+ .fillna(0))
2617
+
2618
+ # keep top categories by overall size
2619
+ _tab = _tab.loc[_tab.sum(axis=1).sort_values(ascending=False).index[:{top_n}]]
2620
+
2621
+ if _comp_pref == "grouped_bar":
2622
+ ax = _tab.T.plot(kind="bar", rot=0, figsize=(max(8, 1.2*len(_tab.columns)), 4))
2623
+ ax.set_title("{pie_dim} by {facet_col} (grouped)")
2624
+ ax.set_xlabel("{facet_col}"); ax.set_ylabel("Count")
2625
+ plt.tight_layout(); plt.show()
2626
+
2627
+ elif _comp_pref == "stacked_bar":
2628
+ ax = _tab.T.plot(kind="bar", stacked=True, rot=0, figsize=(max(8, 1.2*len(_tab.columns)), 4))
2629
+ ax.set_title("{pie_dim} by {facet_col} (stacked)")
2630
+ ax.set_xlabel("{facet_col}"); ax.set_ylabel("Count")
2631
+ plt.tight_layout(); plt.show()
2632
+
2633
+ elif _comp_pref == "stacked_bar_pct":
2634
+ _perc = _tab.div(_tab.sum(axis=0), axis=1) * 100 # column-normalised to 100%
2635
+ ax = _perc.T.plot(kind="bar", stacked=True, rot=0, figsize=(max(8, 1.2*len(_perc.columns)), 4))
2636
+ ax.set_title("{pie_dim} by {facet_col} (100% stacked)")
2637
+ ax.set_xlabel("{facet_col}"); ax.set_ylabel("Percent")
2638
+ plt.tight_layout(); plt.show()
2639
+
2640
+ elif _comp_pref == "heatmap":
2641
+ _perc = _tab.div(_tab.sum(axis=0), axis=1) * 100
2642
+ import numpy as np
2643
+ fig, ax = plt.subplots(figsize=(max(6, 0.9*len(_perc.columns)), max(4, 0.35*len(_perc))))
2644
+ im = ax.imshow(_perc.values, aspect='auto')
2645
+ ax.set_xticks(range(_perc.shape[1])); ax.set_xticklabels(_perc.columns, rotation=0)
2646
+ ax.set_yticks(range(_perc.shape[0])); ax.set_yticklabels(_perc.index)
2647
+ ax.set_title("{pie_dim} by {facet_col} — % heatmap")
2648
+ for i in range(_perc.shape[0]):
2649
+ for j in range(_perc.shape[1]):
2650
+ ax.text(j, i, f"{{_perc.values[i, j]:.1f}}%", ha="center", va="center")
2651
+ fig.colorbar(im, ax=ax, label="%")
2652
+ plt.tight_layout(); plt.show()
2653
+
2654
+ else: # counts_bar (default denominators)
2655
+ _counts = df["__facet"].value_counts()
2656
+ ax = _counts.plot(kind="bar", rot=0, figsize=(6, 3))
2657
+ ax.set_title("Counts by {facet_col}")
2658
+ ax.set_xlabel("{facet_col}"); ax.set_ylabel("Count")
2659
+ plt.tight_layout(); plt.show()
2660
+
2661
+ """.lstrip()
2662
+ return snippet
2663
+
2664
+ # --------------- CASE C: single pie ---------------
2665
+ chosen = None
2666
+ for c in cats:
2667
+ if c in df.columns and _is_cat(c):
2668
+ chosen = c; break
2669
+ if chosen is None:
2670
+ chosen = _fallback_cat()
2671
+
2672
+ if chosen:
2673
+ snippet = f"""
2674
+ import matplotlib.pyplot as plt
2675
+ counts = df['{chosen}'].astype(str).value_counts().nlargest({top_n})
2676
+ fig, ax = plt.subplots()
2677
+ if len(counts) > 0:
2678
+ ax.pie(counts.values, labels=[str(i) for i in counts.index],
2679
+ autopct='%1.1f%%', startangle=90, counterclock=False)
2680
+ ax.set_title('Distribution of {chosen} (top {top_n})')
2681
+ ax.axis('equal')
2682
+ plt.show()
2683
+ """.lstrip()
2684
+ return snippet
2685
+
2686
+ # numeric last resort
2687
+ num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
2688
+ if num_cols:
2689
+ col = num_cols[0]
2690
+ snippet = f"""
2691
+ import pandas as pd
2692
+ import matplotlib.pyplot as plt
2693
+ bins = pd.qcut(df['{col}'], q=5, duplicates='drop')
2694
+ counts = bins.value_counts().sort_index()
2695
+ fig, ax = plt.subplots()
2696
+ if len(counts) > 0:
2697
+ ax.pie(counts.values, labels=[str(i) for i in counts.index],
2698
+ autopct='%1.1f%%', startangle=90, counterclock=False)
2699
+ ax.set_title('Distribution of {col} (binned)')
2700
+ ax.axis('equal')
2701
+ plt.show()
2702
+ """.lstrip()
2703
+ return snippet
2704
+
2705
+ return code
2706
+
2707
+
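A hypothetical call down the facet path (CASE B), assuming a df that carries an Ethnicity column and a numeric BMI column:

    new_code = patch_pie_chart(
        code="df['Ethnicity'].value_counts().plot(kind='pie')",
        df=df,
        user_question="Pie charts of Ethnicity across BMI categories (Normal/Overweight/Obese)",
    )
    # returns a self-contained snippet: one pie per BMI bin plus a companion bar/heatmap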
2708
+ def patch_fix_seaborn_palette_calls(code: str) -> str:
2709
+ """
2710
+ Removes seaborn `palette=` when no `hue=` is present in the same call.
2711
+ Fixes FutureWarning: 'Passing `palette` without assigning `hue` ...'.
2712
+ """
2713
+ if "sns." not in code:
2714
+ return code
2715
+
2716
+ # Targets common seaborn plotters
2717
+ funcs = r"(boxplot|barplot|countplot|violinplot|stripplot|swarmplot|histplot|kdeplot)"
2718
+ pattern = re.compile(rf"(sns\.{funcs}\s*\()([^)]*)\)", re.DOTALL)
2719
+
2720
+ def _fix_call(m):
2721
+ head, inner = m.group(1), m.group(2)
2722
+ # If there's already hue=, keep as is
2723
+ if re.search(r"(?<!\w)hue\s*=", inner):
2724
+ return f"{head}{inner})"
2725
+ # Otherwise remove palette=... safely (and any adjacent comma spacing)
2726
+ inner2 = re.sub(r",\s*palette\s*=\s*[^,)\n]+", "", inner)
2727
+ inner2 = re.sub(r"\bpalette\s*=\s*[^,)\n]+\s*,\s*", "", inner2)
2728
+ inner2 = re.sub(r"\s*,\s*\)", ")", f"{inner2})")[:-1] # clean trailing comma before ')'
2729
+ return f"{head}{inner2})"
2730
+
2731
+ return pattern.sub(_fix_call, code)
2732
+
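e.g.:

    sns.boxplot(data=df, x='Sex', y='BMI', palette='Set2')
    # -> sns.boxplot(data=df, x='Sex', y='BMI')
    sns.boxplot(data=df, x='Sex', y='BMI', hue='Sex', palette='Set2')  # unchanged: hue is present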
2733
+
2734
+ def patch_quiet_specific_warnings(code: str) -> str:
2735
+ """
2736
+ Inserts targeted warning filters (not blanket ignores).
2737
+ - seaborn palette/hue deprecation
2738
+ - python-dotenv parse chatter
2739
+ """
2740
+ prelude = (
2741
+ "import warnings\n"
2742
+ "warnings.filterwarnings(\n"
2743
+ " 'ignore', message=r'.*Passing `palette` without assigning `hue`.*', category=FutureWarning)\n"
2744
+ "warnings.filterwarnings(\n"
2745
+ " 'ignore', message=r'python-dotenv could not parse statement.*')\n"
2746
+ )
2747
+ # If warnings already imported once, just add filters; else insert full prelude.
2748
+ if "import warnings" in code:
2749
+ code = re.sub(
2750
+ r"(import warnings[^\n]*\n)",
2751
+ lambda m: m.group(1) + prelude.replace("import warnings\n", ""),
2752
+ code,
2753
+ count=1
2754
+ )
2755
+
2756
+ else:
2757
+ # place after first import block if possible
2758
+ m = re.search(r"^(?:from\s+\S+\s+import\s+.+|import\s+\S+).*\n+", code, flags=re.MULTILINE)
2759
+ if m:
2760
+ idx = m.end()
2761
+ code = code[:idx] + prelude + code[idx:]
2762
+ else:
2763
+ code = prelude + code
2764
+ return code
2765
+
2766
+
2767
+ def _norm_col_name(s: str) -> str:
2768
+ """normalise a column name: lowercase + strip non-alphanumerics."""
2769
+ return re.sub(r"[^a-z0-9]+", "", str(s).lower())
2770
+
2771
+
2772
+ def _first_present(df: pd.DataFrame, candidates: list[str]) -> str | None:
2773
+ """return the actual df column that matches any candidate (after normalisation)."""
2774
+ norm_map = {_norm_col_name(c): c for c in df.columns}
2775
+ for cand in candidates:
2776
+ hit = norm_map.get(_norm_col_name(cand))
2777
+ if hit is not None:
2778
+ return hit
2779
+ return None
2780
+
2781
+
2782
+ def _ensure_canonical_alias(df: pd.DataFrame, target: str, aliases: list[str]) -> tuple[pd.DataFrame, bool]:
2783
+ """
2784
+ If any alias exists, materialise a canonical copy at `target` (don’t drop the original).
2785
+ Returns (df, found_bool).
2786
+ """
2787
+ if target in df.columns:
2788
+ return df, True
2789
+ col = _first_present(df, [target, *aliases])
2790
+ if col is None:
2791
+ return df, False
2792
+ df[target] = df[col]
2793
+ return df, True
2794
+
2795
+
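e.g., with hypothetical alias spellings:

    df, ok = _ensure_canonical_alias(df, "HbA1c", ["hba1c_percent", "HBA1C (%)"])
    # ok is True if any spelling matched after normalisation ('hba1cpercent', 'hba1c'),
    # and df then also carries a canonical 'HbA1c' column copied from the match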
2796
+ def strip_python_dotenv(code: str) -> str:
2797
+ """
2798
+ Remove any use of python-dotenv from generated code, including:
2799
+ - single and multi-line 'from dotenv import ...'
2800
+ - 'import dotenv' (with or without alias) and calls via any alias
2801
+ - load_dotenv/find_dotenv/dotenv_values calls (bare or prefixed)
2802
+ - IPython magics (%load_ext dotenv, %dotenv, %env …)
2803
+ - shell installs like '!pip install python-dotenv'
2804
+ """
2805
+ original = code
2806
+
2807
+ # 0) Kill IPython magics & shell installs referencing dotenv
2808
+ code = re.sub(r"^\s*%load_ext\s+dotenv\s*$", "", code, flags=re.MULTILINE)
2809
+ code = re.sub(r"^\s*%dotenv\b.*$", "", code, flags=re.MULTILINE)
2810
+ code = re.sub(r"^\s*%env\b.*$", "", code, flags=re.MULTILINE)
2811
+ code = re.sub(r"^\s*!\s*pip\s+install\b.*dotenv.*$", "", code, flags=re.IGNORECASE | re.MULTILINE)
2812
+
2813
+ # 1) Remove single-line 'from dotenv import ...'
2814
+ code = re.sub(r"^\s*from\s+dotenv\s+import\s+.*$", "", code, flags=re.MULTILINE)
2815
+
2816
+ # 2) Remove multi-line 'from dotenv import ( ... )' blocks
2817
+ code = re.sub(
2818
+ r"^\s*from\s+dotenv\s+import\s*\([\s\S]*?\)\s*$",
2819
+ "",
2820
+ code,
2821
+ flags=re.MULTILINE,
2822
+ )
2823
+
2824
+ # 3) Remove 'import dotenv' (with optional alias). Capture alias names.
2825
+ aliases = re.findall(r"^\s*import\s+dotenv\s+as\s+([A-Za-z_][A-Za-z0-9_]*)\s*$",
2826
+ code, flags=re.MULTILINE)
2827
+ code = re.sub(r"^\s*import\s+dotenv\s*(?:as\s+[A-Za-z_][A-Za-z0-9_]*)?\s*$",
2828
+ "", code, flags=re.MULTILINE)
2829
+
2830
+ # 4) Remove calls to load_dotenv / find_dotenv / dotenv_values with any prefix
2831
+ # e.g., load_dotenv(...), dotenv.load_dotenv(...), dtenv.load_dotenv(...)
2832
+ fn_names = r"(?:load_dotenv|find_dotenv|dotenv_values)"
2833
+ # bare calls
2834
+ code = re.sub(rf"^\s*{fn_names}\s*\([^)]*\)\s*$", "", code, flags=re.MULTILINE)
2835
+ # dotted calls with any identifier prefix (alias or module)
2836
+ code = re.sub(rf"^\s*[A-Za-z_][A-Za-z0-9_]*\s*\.\s*{fn_names}\s*\([^)]*\)\s*$",
2837
+ "", code, flags=re.MULTILINE)
2838
+
2839
+ # 5) If any alias imported earlier slipped through (method chains etc.), remove lines using that alias.
2840
+ for al in aliases:
2841
+ code = re.sub(rf"^\s*{al}\s*\.\s*\w+\s*\([^)]*\)\s*$", "", code, flags=re.MULTILINE)
2842
+
2843
+ # 6) Tidy excess blank lines
2844
+ code = re.sub(r"\n{3,}", "\n\n", code).strip("\n") + "\n"
2845
+ return code
2846
+
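e.g. all of these hypothetical lines are removed outright:

    from dotenv import load_dotenv
    import dotenv as denv
    load_dotenv()
    denv.load_dotenv('.env')
    %load_ext dotenv
    !pip install python-dotenv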
2847
+
2848
+ def fix_predict_calls_records_arg(code: str) -> str:
2849
+ """
2850
+ If generated code calls predict_* with a list-of-dicts via .to_dict('records')
2851
+ (or orient='records'), strip the .to_dict(...) so a DataFrame is passed instead.
2852
+ Works line-by-line to avoid over-rewrites elsewhere.
2853
+ Examples fixed:
2854
+ predict_patient(X_test.iloc[:5].to_dict('records'))
2855
+ predict_risk(df.head(3).to_dict(orient="records"))
2856
+ → predict_patient(X_test.iloc[:5])
2857
+ """
2858
+ fixed_lines = []
2859
+ for line in code.splitlines():
2860
+ if "predict_" in line and "to_dict" in line and "records" in line:
2861
+ line = re.sub(
2862
+ r"\.to_dict\s*\(\s*(?:orient\s*=\s*)?['\"]records['\"]\s*\)",
2863
+ "",
2864
+ line
2865
+ )
2866
+ fixed_lines.append(line)
2867
+ return "\n".join(fixed_lines)
2868
+
2869
+
2870
+ def fix_fstring_backslash_paths(code: str) -> str:
2871
+ """
2872
+ Fix bad f-strings like: f"...{out_dir\\plots\\img.png}..."
2873
+ → f"...{os.path.join(out_dir, r'plots\\img.png')}"
2874
+ Only touches f-strings that contain a backslash path inside {...}.
2875
+ """
2876
+ def _fix_line(line: str) -> str:
2877
+ # quick check: only f-strings need scanning
2878
+ if not (("f\"" in line) or ("f'" in line) or ("f\"\"\"" in line) or ("f'''" in line)):
2879
+ return line
2880
+ # {var\rest-of-path} where var can be dotted (e.g., cfg.out)
2881
+ pattern = re.compile(r"\{([A-Za-z_][A-Za-z0-9_\.]*)\\([^}]+)\}")
2882
+ def repl(m):
2883
+ left = m.group(1)
2884
+ right = m.group(2).strip().replace('"', '\\"')
2885
+ return "{os.path.join(" + left + ', r"' + right + '")}'
2886
+ return pattern.sub(repl, line)
2887
+
2888
+ return "\n".join(_fix_line(ln) for ln in code.splitlines())
2889
+
2890
+
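e.g. (ensure_os_import, just below, supplies the import os this rewrite relies on):

    f"saved to {out_dir\plots\img.png}"
    # -> f"saved to {os.path.join(out_dir, r"plots\img.png")}"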
2891
+ def ensure_os_import(code: str) -> str:
2892
+ """
2893
+ If os.path.join is used but 'import os' is missing, inject it at the top.
2894
+ """
2895
+ needs = "os.path.join(" in code
2896
+ has_import_os = re.search(r"^\s*import\s+os\b", code, flags=re.MULTILINE) is not None
2897
+ has_from_os = re.search(r"^\s*from\s+os\s+import\b", code, flags=re.MULTILINE) is not None
2898
+ if needs and not (has_import_os or has_from_os):
2899
+ return "import os\n" + code
2900
+ return code
2901
+
2902
+
2903
+ def fix_seaborn_boxplot_nameerror(code: str) -> str:
2904
+ """
2905
+ Fix bad calls like: sns.boxplot(boxplot)
2906
+ Heuristic:
2907
+ - If plot_df + FH_status + var exist → sns.boxplot(data=plot_df, x='FH_status', y=var, ax=ax)
2908
+ - Else if plot_df + var → sns.boxplot(data=plot_df, y=var, ax=ax)
2909
+ - Else if plot_df only → sns.boxplot(data=plot_df, ax=ax)
2910
+ - Else → sns.boxplot(ax=ax)
2911
+ Ensures a matplotlib Axes 'ax' exists.
2912
+ """
2913
+ pattern = re.compile(r"^\s*sns\.boxplot\s*\(\s*boxplot\s*\)\s*$", re.MULTILINE)
2914
+ if not pattern.search(code):
2915
+ return code
2916
+
2917
+ has_plot_df = re.search(r"\bplot_df\b", code) is not None
2918
+ has_var = re.search(r"\bvar\b", code) is not None
2919
+ has_fh = bool(re.search(r"['\"]FH_status['\"]", code) or re.search(r"\bFH_status\b", code))
2920
+
2921
+ if has_plot_df and has_var and has_fh:
2922
+ replacement = "sns.boxplot(data=plot_df, x='FH_status', y=var, ax=ax)"
2923
+ elif has_plot_df and has_var:
2924
+ replacement = "sns.boxplot(data=plot_df, y=var, ax=ax)"
2925
+ elif has_plot_df:
2926
+ replacement = "sns.boxplot(data=plot_df, ax=ax)"
2927
+ else:
2928
+ replacement = "sns.boxplot(ax=ax)"
2929
+
2930
+ fixed = pattern.sub(replacement, code)
2931
+
2932
+ # Ensure 'fig, ax = plt.subplots(...)' exists
2933
+ if "ax=" in replacement and not re.search(r"\bfig\s*,\s*ax\s*=\s*plt\.subplots\s*\(", fixed):
2934
+ # Insert right before the first seaborn call
2935
+ m = re.search(r"^\s*sns\.", fixed, flags=re.MULTILINE)
2936
+ insert_at = m.start() if m else 0
2937
+ fixed = fixed[:insert_at] + "fig, ax = plt.subplots(figsize=(8,4))\n" + fixed[insert_at:]
2938
+
2939
+ return fixed
2940
+
2941
+
2942
+ def fix_seaborn_barplot_nameerror(code: str) -> str:
2943
+ """
2944
+ Fix bad calls like: sns.barplot(barplot)
2945
+ Strategy mirrors boxplot fixer: prefer data=plot_df with x/y if available,
2946
+ otherwise degrade safely to an empty call on an existing Axes.
2947
+ """
2948
+ import re
2949
+ pattern = re.compile(r"^\s*sns\.barplot\s*\(\s*barplot\s*\)\s*$", re.MULTILINE)
2950
+ if not pattern.search(code):
2951
+ return code
2952
+
2953
+ has_plot_df = re.search(r"\bplot_df\b", code) is not None
2954
+ has_var = re.search(r"\bvar\b", code) is not None
2955
+ has_fh = bool(re.search(r"['\"]FH_status['\"]", code) or re.search(r"\bFH_status\b", code))
2956
+
2957
+ if has_plot_df and has_var and has_fh:
2958
+ replacement = "sns.barplot(data=plot_df, x='FH_status', y=var, ax=ax)"
2959
+ elif has_plot_df and has_var:
2960
+ replacement = "sns.barplot(data=plot_df, y=var, ax=ax)"
2961
+ elif has_plot_df:
2962
+ replacement = "sns.barplot(data=plot_df, ax=ax)"
2963
+ else:
2964
+ replacement = "sns.barplot(ax=ax)"
2965
+
2966
+ # ensure an Axes 'ax' exists (no-op if already present)
2967
+ if "ax =" not in code:
2968
+ code = "import matplotlib.pyplot as plt\nfig, ax = plt.subplots(figsize=(6,4))\n" + code
2969
+
2970
+ return pattern.sub(replacement, code)
2971
+
2972
+
2973
+ def parse_and_format_ml_pipeline(raw_text: str) -> tuple[str, str, str]:
2974
+ """
2975
+ Parses the raw text to extract and format the 'refined question',
2976
+ 'intents (tasks)', and 'chronology of tasks' sections.
2977
+ Args:
2978
+ raw_text: The complete input string containing the ML pipeline structure.
2979
+ Returns:
2980
+ A tuple containing:
2981
+ (formatted_question_str, formatted_intents_str, formatted_chronology_str)
2982
+ """
2983
+ # --- 1. Regex Pattern to Extract Sections ---
2984
+ # The pattern uses capturing groups (?) to look for the section headers
2985
+ # (e.g., 'refined question:') and captures all the content until the next
2986
+ # section header or the end of the string. re.DOTALL is crucial for '.' to match newlines.
2987
+
2988
+ pattern = re.compile(
2989
+ r"refined question:(?P<question>.*?)"
2990
+ r"intents \(tasks\):(?P<intents>.*?)"
2991
+ r"Chronology of tasks:(?P<chronology>.*)",
2992
+ re.IGNORECASE | re.DOTALL
2993
+ )
2994
+
2995
+ match = pattern.search(raw_text)
2996
+
2997
+ if not match:
2998
+ raise ValueError("Input text structure does not match the expected pattern.")
2999
+
3000
+ # --- 2. Extract Content ---
3001
+ question_content = match.group('question').strip()
3002
+ intents_content = match.group('intents').strip()
3003
+ chronology_content = match.group('chronology').strip()
3004
+
3005
+ # --- 3. Formatting Functions ---
3006
+
3007
+ def format_question(content):
3008
+ """Formats the Refined Question section."""
3009
+ # Clean up leading/trailing whitespace and ensure clean paragraphs
3010
+ content = content.strip().replace('\n', ' ').replace('  ', ' ')
3011
+
3012
+ # Simple formatting using Markdown headers and bolding
3013
+ formatted = (
3014
+ # "## 1. Project Goal and Objectives\n\n"
3015
+ "<b> Refined Question:</b>\n"
3016
+ f"{content}\n"
3017
+ )
3018
+ return formatted
3019
+
3020
+ def format_intents(content):
3021
+ """Formats the Intents (Tasks) section as a structured list."""
3022
+ # Use regex to find and format each numbered task
3023
+ # It finds 'N. **Text** - ...' and breaks it down.
3024
+
3025
+ tasks = []
3026
+ # Pattern: N. **Text** - Content (including newlines, non-greedy)
3027
+ # We need to explicitly handle the list items starting with '-' within the content
3028
+ task_pattern = re.compile(r'(\d+\. \*\*.*?\*\*.*?)(?=\n\d+\. \*\*|\Z)', re.DOTALL)
3029
+
3030
+ # Split the content by lines and join tasks back into clean strings
3031
+ raw_tasks = [m.group(1).strip() for m in task_pattern.finditer(content)]
3032
+
3033
+ for task in raw_tasks:
3034
+ # Replace the initial task number and **Heading** with a Heading 3
3035
+ task = re.sub(r'^\d+\. (\*\*.*?\*\*)', r'### \1', task, count=1, flags=re.MULTILINE)
3036
+
3037
+ # Replace list markers (' - ') with Markdown bullets ('* ') for clarity
3038
+ task = task.replace('\n - ', '\n* ').replace('- ', '* ', 1)
3039
+ tasks.append(task)
3040
+
3041
+ formatted_tasks = "\n\n".join(tasks)
3042
+
3043
+ return (
3044
+ "\n---\n"
3045
+ "## 2. Methodology and Tasks\n\n"
3046
+ f"{formatted_tasks}\n"
3047
+ )
3048
+
3049
+ def format_chronology(content):
3050
+ """Formats the Chronology section."""
3051
+ # Uses the given LaTeX format
3052
+ content = content.strip().replace(' → ', ' \\rightarrow ')
3053
+ formatted = (
3054
+ "\n---\n"
3055
+ "## 3. Chronology of Tasks\n"
3056
+ f"$$\\text{{{content}}}$$"
3057
+ )
3058
+ return formatted
3059
+
3060
+ # --- 4. Format and Return ---
3061
+ formatted_question = format_question(question_content)
3062
+ formatted_intents = format_intents(intents_content)
3063
+ formatted_chronology = format_chronology(chronology_content)
3064
+
3065
+ return formatted_question, formatted_intents, formatted_chronology
3066
+
3067
+
3068
+ def generate_full_report(formatted_question: str, formatted_intents: str, formatted_chronology: str) -> str:
3069
+ """Combines all formatted parts into a final report string."""
3070
+ return (
3071
+ "# 🔬 Machine Learning Pipeline for Predicting Family History of Diabetes\n\n"
3072
+ f"{formatted_question}\n"
3073
+ f"{formatted_intents}\n"
3074
+ f"{formatted_chronology}\n"
3075
+ )
3076
+
3077
+
3078
+ def fix_confusion_matrix_for_multilabel(code: str) -> str:
3079
+ """
3080
+ Replace ConfusionMatrixDisplay.from_estimator(...) usages with
3081
+ from_predictions(...) which works for multi-label loops without requiring
3082
+ the estimator to expose _estimator_type.
3083
+ """
3084
+ return re.sub(
3085
+ r"ConfusionMatrixDisplay\.from_estimator\(([^,]+),\s*([^,]+),\s*([^)]+)\)",
3086
+ r"ConfusionMatrixDisplay.from_predictions(\3, \1.predict(\2))",
3087
+ code
3088
+ )
3089
+
3090
+
3091
+ def smx_auto_title_plots(ctx=None, fallback="Analysis"):
3092
+ """
3093
+ Ensure every Matplotlib/Seaborn Axes has a title.
3094
+ Uses refined_question -> askai_question -> fallback.
3095
+ Only sets a title if it's currently empty.
3096
+ """
3097
+ import matplotlib.pyplot as plt
3098
+
3099
+ def _all_figures():
3100
+ try:
3101
+ from matplotlib._pylab_helpers import Gcf
3102
+ return [fm.canvas.figure for fm in Gcf.get_all_fig_managers()]
3103
+ except Exception:
3104
+ # Best effort fallback
3105
+ nums = plt.get_fignums()
3106
+ return [plt.figure(n) for n in nums] if nums else []
3107
+
3108
+ # Choose a concise title
3109
+ title = None
3110
+ if isinstance(ctx, dict):
3111
+ title = ctx.get("refined_question") or ctx.get("askai_question")
3112
+ title = (str(title).strip().splitlines()[0][:120]) if title else fallback
3113
+
3114
+ for fig in _all_figures():
3115
+ for ax in getattr(fig, "axes", []):
3116
+ try:
3117
+ if not (ax.get_title() or "").strip():
3118
+ ax.set_title(title)
3119
+ except Exception:
3120
+ pass
3121
+ try:
3122
+ fig.tight_layout()
3123
+ except Exception:
3124
+ pass
3125
+
3126
+
3127
+ def patch_fix_sentinel_plot_calls(code: str) -> str:
3128
+ """
3129
+ Normalise 'sentinel first-arg' calls so wrappers can pick sane defaults.
3130
+ SB_barplot(barplot) -> SB_barplot()
3131
+ SB_barplot(barplot, ...) -> SB_barplot(...)
3132
+ sns.barplot(barplot) -> SB_barplot()
3133
+ sns.barplot(barplot, ...) -> SB_barplot(...)
3134
+ Same for: histplot, boxplot, lineplot, countplot, heatmap, pairplot, scatterplot.
3135
+ """
3136
+ names = ['histplot','boxplot','barplot','lineplot','countplot','heatmap','pairplot','scatterplot']
3137
+ for n in names:
3138
+ # SB_* with sentinel as the first arg (with or without trailing args)
3139
+ code = re.sub(rf"\bSB_{n}\s*\(\s*{n}\s*\)", f"SB_{n}()", code)
3140
+ code = re.sub(rf"\bSB_{n}\s*\(\s*{n}\s*,", f"SB_{n}(", code)
3141
+ # sns.* with sentinel as the first arg → route to SB_* (so our wrappers handle it)
3142
+ code = re.sub(rf"\bsns\.{n}\s*\(\s*{n}\s*\)", f"SB_{n}()", code)
3143
+ code = re.sub(rf"\bsns\.{n}\s*\(\s*{n}\s*,", f"SB_{n}(", code)
3144
+ return code
3145
+
3146
+
3147
+ def patch_rmse_calls(code: str) -> str:
3148
+ """
3149
+ Make RMSE robust across sklearn versions.
3150
+ - Replace mean_squared_error(..., squared=False) -> _SMX_rmse(...)
3151
+ - Wrap any remaining mean_squared_error(...) calls with _SMX_call for safety.
3152
+ """
3153
+ import re
3154
+ # (a) Specific RMSE pattern
3155
+ code = re.sub(
3156
+ r"\bmean_squared_error\s*\(\s*(.+?)\s*,\s*squared\s*=\s*False\s*\)",
3157
+ r"_SMX_rmse(\1)",
3158
+ code,
3159
+ flags=re.DOTALL
3160
+ )
3161
+ # (b) Guard any other MSE calls
3162
+ code = re.sub(r"\bmean_squared_error\s*\(", r"_SMX_call(mean_squared_error, ", code)
3163
+ return code
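e.g., assuming _SMX_rmse and _SMX_call are provided by the executor prelude:

    rmse = mean_squared_error(y_test, y_pred, squared=False)
    # -> rmse = _SMX_rmse(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    # -> mse = _SMX_call(mean_squared_error, y_test, y_pred)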