syntaxmatrix-2.3.5-py3-none-any.whl → syntaxmatrix-2.5.5.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- syntaxmatrix/agentic/__init__.py +0 -0
- syntaxmatrix/agentic/agent_tools.py +24 -0
- syntaxmatrix/agentic/agents.py +810 -0
- syntaxmatrix/agentic/code_tools_registry.py +37 -0
- syntaxmatrix/agentic/model_templates.py +1790 -0
- syntaxmatrix/commentary.py +134 -112
- syntaxmatrix/core.py +385 -245
- syntaxmatrix/dataset_preprocessing.py +218 -0
- syntaxmatrix/display.py +89 -37
- syntaxmatrix/gpt_models_latest.py +5 -4
- syntaxmatrix/profiles.py +19 -4
- syntaxmatrix/routes.py +947 -141
- syntaxmatrix/settings/model_map.py +38 -30
- syntaxmatrix/static/icons/hero_bg.jpg +0 -0
- syntaxmatrix/templates/dashboard.html +248 -54
- syntaxmatrix/utils.py +2254 -84
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.4.dist-info}/METADATA +16 -17
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.4.dist-info}/RECORD +21 -15
- syntaxmatrix/model_templates.py +0 -29
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.4.dist-info}/WHEEL +0 -0
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.4.dist-info}/licenses/LICENSE.txt +0 -0
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.4.dist-info}/top_level.txt +0 -0
syntaxmatrix/routes.py
CHANGED
@@ -1,18 +1,28 @@
-
-
-import
-
-import io
+import os, zipfile, time, uuid, werkzeug, queue, html, ast, re
+import threading, textwrap, json, pandas as pd
+import contextlib
+
+import io as _std_io
+
 from io import BytesIO
+from scipy import io
+from flask import Blueprint, Response, request, send_file, session
+from flask import render_template, render_template_string, url_for, redirect, g
+from flask import flash, jsonify, send_from_directory, get_flashed_messages, stream_with_context
+
+from flask_login import current_user
+
 from PyPDF2 import PdfReader
 from markupsafe import Markup
-from urllib.parse import quote
+from urllib.parse import quote
+from datetime import datetime
+from prompt_toolkit import HTML
+from PyPDF2.errors import EmptyFileError
+import numpy as np
 from .auth import register_user, authenticate, login_required, admin_required, superadmin_required
-from flask import Blueprint, Response, request, send_file, session, render_template, render_template_string, redirect, url_for, flash, jsonify, send_from_directory, get_flashed_messages, stream_with_context

 from syntaxmatrix.themes import DEFAULT_THEMES
 from syntaxmatrix import db
-from syntaxmatrix.utils import *
 from syntaxmatrix.vector_db import add_pdf_chunk
 from syntaxmatrix.file_processor import *
 from syntaxmatrix.vectorizer import embed_text
@@ -22,16 +32,13 @@ from syntaxmatrix.history_store import SQLHistoryStore, PersistentHistoryStore
 from syntaxmatrix.kernel_manager import SyntaxMatrixKernelManager, execute_code_in_kernel
 from syntaxmatrix.vector_db import *
 from syntaxmatrix.settings.string_navbar import string_navbar_items
-from syntaxmatrix.settings.model_map import PROVIDERS_MODELS, MODEL_DESCRIPTIONS, PURPOSE_TAGS, EMBEDDING_MODELS
-from .project_root import detect_project_root
-from
-from
-from . import profiles as _prof
-from . import auth as _auth
+from syntaxmatrix.settings.model_map import GPT_MODELS_LATEST, PROVIDERS_MODELS, MODEL_DESCRIPTIONS, PURPOSE_TAGS, EMBEDDING_MODELS
+from syntaxmatrix.project_root import detect_project_root
+from syntaxmatrix import generate_page as _genpage
+from syntaxmatrix import auth as _auth
 from syntaxmatrix import profiles as _prof
 from syntaxmatrix.gpt_models_latest import set_args, extract_output_text as _out
-from
-import contextlib
+from syntaxmatrix.agentic.agents import classify_ml_job_agent, refine_question_agent, text_formatter_agent

 try:
 from pygments import highlight as _hl
@@ -41,10 +48,16 @@ try:
 except Exception:
 _HAVE_PYGMENTS = False

-from
-from
+# from syntaxmatrix.utils import *
+from syntaxmatrix.utils import (
+auto_inject_template, drop_bad_classification_metrics, ensure_accuracy_block,
+ensure_image_output, ensure_output, fix_plain_prints, fix_print_html, patch_fix_sentinel_plot_calls,
+patch_pairplot, fix_to_datetime_errors, harden_ai_code, patch_ensure_seaborn_import, get_plotting_imports,
+patch_fix_seaborn_palette_calls, patch_quiet_specific_warnings, fix_seaborn_barplot_nameerror, fix_seaborn_boxplot_nameerror, ensure_matplotlib_title, patch_plot_code, patch_prefix_seaborn_calls, fix_scatter_and_summary, inject_auto_preprocessing, fix_importance_groupby, patch_pie_chart, patch_rmse_calls, clean_llm_code
+)

-
+from syntaxmatrix.agentic.agent_tools import ToolRunner
+from syntaxmatrix.agentic.code_tools_registry import EARLY_SANITIZERS, SYNTAX_AND_REPAIR

 _CLIENT_DIR = detect_project_root()
 _stream_q = queue.Queue()
@@ -192,7 +205,6 @@ def setup_routes(smx):
 )
 return resp

-
 def head_html():
 # Determine a contrasting mobile text color based on the sidebar background.
 mobile_text_color = smx.theme["nav_text"]
@@ -625,8 +637,8 @@ def setup_routes(smx):

 desktop_nav = f"""
 <div class="nav-left">
-<a class="logo" href="/"
-<a class="logo" href="/" style="text-decoration:none;
+<a class="logo" href="/">{smx.site_logo}</a>
+<a class="logo" href="/" style="text-decoration:none; margin:0 24px 0 0; padding:0px; vertical-align:middle;">{smx.site_title}</a>
 <div class="nav-links" style="margin-left:24px;">
 {nav_links}
 </div>
@@ -3769,10 +3781,13 @@ def setup_routes(smx):


 # if any live cached profile on smx matches this name, clear it
-
+db_profiles = prof.get_profiles()
+# for attr in ("_chat_profile", "_admin_profile", "_coding_profile", "_classification_profile", "_summarization_profile", "_vision2text_profile"):
+for attr in ([db_profiles]):
 prof = getattr(smx, attr, None)
 if isinstance(prof, dict) and prof.get("name") == name:
 setattr(smx, attr, {})
+prof.refresh_profiles_cache()

 elif action == "add_model":
 prov = request.form.get("catalog_provider","").strip()
@@ -3944,7 +3959,7 @@ def setup_routes(smx):
 <label for="catalog_model">Model</label>
 <select id="catalog_model" name="catalog_model" required></select>

-<label for="catalog_purpose">
+<label for="catalog_purpose">Agency</label>
 <select id="catalog_purpose" name="catalog_purpose" required></select>

 <label class="form-label mb-1" style="display:block; position:relative;">
@@ -4066,7 +4081,7 @@ def setup_routes(smx):

 models_catalog_list_card = f"""
 <div class="card span-4">
-<h4>Models
+<h4>Models Catalogue</h4>
 <ul class="catalog-list">
 {cat_items or "<li class='li-row'>No models yet.</li>"}
 </ul>
@@ -4080,15 +4095,15 @@ def setup_routes(smx):
 <div class='card span-4'>
 <h4>Setup Profiles</h4>
 <form method="post" style="margin-bottom:0.5rem;">
-<label for="profile_name" class="form-label mb-1">
-Confirm
+<label for="profile_name" class="form-label mb-1" style="margin-bottom:12px;">
+Confirm Agency
 <button id="name-help" type="button" class="info-btn btn-link p-0 text-muted"
 style="font-size:0.8rem; line-height:1; padding:2px; display:inline-block;"
 aria-haspopup="true" aria-expanded="false"
-title="Click to see
+title="Click to see agencies">ⓘ</button>
 </label>
 <input id="profile_name" name="profile_name" type="text" class="form-control"
-placeholder="
+placeholder="Agency" required>

 <div id="name-suggestions" role="tooltip"
 class="suggestion-popover card shadow-sm p-2"
@@ -4175,9 +4190,9 @@ def setup_routes(smx):

 manage_sys_files_card = f"""
 <div class='card span-6'>
-<h4>Manage
+<h4>Manage Company Files</h4>
 <ul class="catalog-list" style="list-style:none; padding-left:0; margin:0;">
-{sys_files_html or "<li>No
+{sys_files_html or "<li>No company file has been uploaded yet.</li>"}
 </ul>
 </div>
 """
@@ -5114,7 +5129,7 @@ def setup_routes(smx):
 rows = _auth.list_role_audit(limit=limit)

 import io, csv, datetime
-buf =
+buf = _std_io.StringIO()
 writer = csv.writer(buf)
 writer.writerow(["timestamp", "actor", "target", "from_role", "to_role"])
 for r in rows:
@@ -5375,25 +5390,28 @@ def setup_routes(smx):
 # ────────────────────────────────────────────────────────────────────────────────────────
 # DASHBOARD
 # ────────────────────────────────────────────────────────────────────────────────────────
-# ── DASHBOARD VIEW DETAILS -----------------------------
 @smx.app.route("/dashboard", methods=["GET", "POST"])
 # @login_required
 def dashboard():
 DATA_FOLDER = os.path.join(_CLIENT_DIR, "uploads", "data")
 os.makedirs(DATA_FOLDER, exist_ok=True)
-
-####################################################################

-
-
-
-- No top-level statements between if/elif/else branches.
-- Regression must use either sklearn with train_test_split (then X_test exists) and R^2/MAE/RMSE, or statsmodels OLS. No accuracy_score in regression.
-- Keep all plotting + savefig + BytesIO + display inside the branch that created the figure.
-Return ONLY the corrected cell.
-"""
-
+max_rows = 5000
+max_cols = 80
+
 def _smx_repair_python_cell(py_code: str) -> str:
+
+_CELL_REPAIR_RULES = """
+You are an experienced Python code reviewer
+Fix the Python cell to satisfy:
+- Single valid cell; imports at the top.
+- Do not import or invoke or use 'python-dotenv' or 'dotenv' because it's not needed.
+- No top-level statements between if/elif/else branches.
+- Regression must use either sklearn with train_test_split (then X_test exists) and R^2/MAE/RMSE,
+or statsmodels OLS. No accuracy_score in regression.
+- Keep all plotting + savefig + BytesIO + display inside the branch that created the figure.
+- Return ONLY the corrected cell.
+"""
 code = textwrap.dedent(py_code or "").strip()
 needs_fix = False
 if re.search(r"\baccuracy_score\b", code) and re.search(r"\bLinearRegression\b|\bOLS\b", code):
@@ -5406,59 +5424,84 @@ def setup_routes(smx):
 needs_fix = True
 if not needs_fix:
 return code
-
+
+_prompt = f"```python\n{code}\n```"
+
+prof = _prof.get_profile("classification") or _prof.get_profile("admin")
 if not prof:
 return code
-
-
-_client =
+
+prof["client"] = _prof.get_client(prof)
+_client = prof["client"]
 _model = prof["model"]
-
-
-
-
-
-)
-).strip()
-
-elif prof["provider"] == "openai" and _model in smx.gpt_models_latest():
-args = set_args(model=prof.get("model"), instructions=_CELL_REPAIR_RULES,
-input=_prompt, previous_id=None, store=False,
-reasoning_effort="minimal", verbosity="low")
-fixed = _out(_client.responses.create(**args)).strip()
-
-elif prof["provider"] == "anthropic":
-fixed = _out(_client.messages.create(
-model=_model,
-max_tokens=1024,
-system=_CELL_REPAIR_RULES,
-messages=[{"role": "user", "content":_prompt}]
-)).strip()
-
-else:
-fixed = _out(_client.chat.completions.create(
-model=_model,
-messages=[
-{"role": "system", "content":_CELL_REPAIR_RULES},
-{"role": "user", "content":_prompt},
-]
-)
-).strip()
+_provider = prof["provider"].lower()
+
+#1 Google
+if _provider == "google":
+from google.genai import types

+fixed = _client.models.generate_content(
+model=_model,
+contents=_prompt,
+config=types.GenerateContentConfig(
+system_instruction=_CELL_REPAIR_RULES,
+temperature=0.8,
+max_output_tokens=1024,
+),
+)
+
+#2 Openai
+elif _provider == "openai" and _model in GPT_MODELS_LATEST:
+
+args = set_args(
+model=_model,
+instructions=_CELL_REPAIR_RULES,
+input=[{"role": "user", "content": _prompt}],
+previous_id=None,
+store=False,
+reasoning_effort="medium",
+verbosity="medium",
+)
+fixed = _out(_client.responses.create(**args))
+
+# Anthropic
+elif _provider == "anthropic":
+
+fixed = _client.messages.create(
+model=_model,
+max_tokens=1024,
+system=_CELL_REPAIR_RULES,
+messages=[{"role": "user", "content":_prompt}],
+stream=False,
+)
+
+# OpenAI SDK
+else:
+fixed = _client.chat.completions.create(
+model=_model,
+messages=[
+{"role": "system", "content":_CELL_REPAIR_RULES},
+{"role": "user", "content":_prompt},
+],
+max_tokens=1024,
+)
+
+fixed_txt = clean_llm_code(fixed)
+
 try:
-
-
+# Only accept the repaired cell if it's valid Python
+ast.parse(fixed_txt)
+return fixed_txt
+except Exception:
+# If the repaired version is still broken, fall back to the original code
 return code
-
-
+

 section = request.args.get("section", "explore")
 datasets = [f for f in os.listdir(DATA_FOLDER) if f.lower().endswith(".csv")]
 selected_dataset = request.form.get("dataset") or request.args.get("dataset")
 if not selected_dataset and datasets:
 selected_dataset = datasets[0]
-# selected_dataset = selected_dataset or ""
-
 # Handle file upload
 if request.method == "POST" and "dataset_file" in request.files:
 f = request.files["dataset_file"]
@@ -5470,7 +5513,7 @@ def setup_routes(smx):

 # Load dataframe if available
 df = pd.read_csv(os.path.join(DATA_FOLDER, selected_dataset)) if selected_dataset else None
-
+
 # --- Jupyter kernel management ---
 session_id = session.get('smx_kernel_id')
 if not session_id:
@@ -5481,38 +5524,84 @@ def setup_routes(smx):

 # --- Handle Ask AI ---
 ai_outputs = []
+dl_html = ""
 askai_question = ""
-refined_question =
+refined_question = ""
+tags = []
 ai_code = None
+eda_df = df
+llm_usage = None

 if request.method == "POST" and "askai_question" in request.form:
 askai_question = request.form["askai_question"].strip()
-if df is not None:
+if df is not None:
+CLEANED_FOLDER = str(selected_dataset).split(".")[0] + "_preprocessed"
+cleaned_path = os.path.join(DATA_FOLDER, CLEANED_FOLDER, "cleaned_df.csv")
+if os.path.exists(cleaned_path):
+df = pd.read_csv(cleaned_path, low_memory=False)
+else:
+from syntaxmatrix.dataset_preprocessing import ensure_cleaned_df
+df = ensure_cleaned_df(DATA_FOLDER, CLEANED_FOLDER, df)  # writes cleaned_df.csv
+
+# Build lightweight context
+columns_summary = ", ".join(df.columns.tolist())
+dataset_context = f"columns: {columns_summary}"
+dataset_profile = f"modality: tabular; columns: {columns_summary}"
+
+refined_question = refine_question_agent(askai_question, dataset_context)
+tags = classify_ml_job_agent(refined_question, dataset_profile)
+
+ai_code = smx.ai_generate_code(refined_question, tags, df)
+llm_usage = smx.get_last_llm_usage()
+ai_code = auto_inject_template(ai_code, tags, df)
+
+# --- 1) Strip dotenv ASAP (kill imports, %magics, !pip) ---
+ctx = {
+"question": refined_question,
+"df_columns": list(df.columns),
+}
+ai_code = ToolRunner(EARLY_SANITIZERS).run(ai_code, ctx)  # dotenv first

-
-intent = classify(refined_question)
-ai_code = smx.ai_generate_code(refined_question, intent, df)
-ai_code = auto_inject_template(ai_code, intent, df)
+# --- 2) Domain/Plotting patches ---
 ai_code = fix_scatter_and_summary(ai_code)
 ai_code = fix_importance_groupby(ai_code)
 ai_code = inject_auto_preprocessing(ai_code)
 ai_code = patch_plot_code(ai_code, df, refined_question)
+ai_code = ensure_matplotlib_title(ai_code)
+ai_code = patch_pie_chart(ai_code, df, refined_question)
 ai_code = patch_pairplot(ai_code, df)
+ai_code = fix_seaborn_boxplot_nameerror(ai_code)
+ai_code = fix_seaborn_barplot_nameerror(ai_code)
 ai_code = get_plotting_imports(ai_code)
-ai_code =
-ai_code =
+ai_code = patch_prefix_seaborn_calls(ai_code)
+ai_code = patch_fix_sentinel_plot_calls(ai_code)
+ai_code = patch_ensure_seaborn_import(ai_code)
+ai_code = patch_rmse_calls(ai_code)
+ai_code = patch_fix_seaborn_palette_calls(ai_code)
+ai_code = patch_quiet_specific_warnings(ai_code)
+ai_code = clean_llm_code(ai_code)
+ai_code = ensure_image_output(ai_code)
 ai_code = ensure_accuracy_block(ai_code)
 ai_code = ensure_output(ai_code)
 ai_code = fix_plain_prints(ai_code)
-ai_code = fix_print_html(ai_code)
+ai_code = fix_print_html(ai_code)
 ai_code = fix_to_datetime_errors(ai_code)
+
+# --- 3-4) Global syntax/data fixers (must run AFTER patches, BEFORE final repair) ---
+ai_code = ToolRunner(SYNTAX_AND_REPAIR).run(ai_code, ctx)
+
+# # --- 4) Final catch-all repair (run LAST) ---
 ai_code = _smx_repair_python_cell(ai_code)
+ai_code = harden_ai_code(ai_code)
+ai_code = drop_bad_classification_metrics(ai_code, df)
+ai_code = patch_fix_sentinel_plot_calls(ai_code)

-
+# Always make sure 'df' is in the kernel before running user code
 df_init_code = (
 f"import pandas as pd\n"
-f"df = pd.read_csv(r'''{os.path.join(
+f"df = pd.read_csv(r'''{os.path.join(cleaned_path)}''')"
 )
+
 execute_code_in_kernel(kc, df_init_code)

 outputs, errors = execute_code_in_kernel(kc, ai_code)
@@ -5525,7 +5614,6 @@ def setup_routes(smx):
 build_display_summary, phrase_commentary_vision, wrap_html
 )

-
 # Probe axes/labels/legend
 probe1_out, probe1_err = execute_code_in_kernel(kc, MPL_PROBE_SNIPPET)
 axes_info = parse_mpl_probe_output([str(x) for x in (probe1_out + probe1_err)])
@@ -5542,17 +5630,17 @@ def setup_routes(smx):
 ################################################################

 # ----- Build a single HTML with Result + Commentary + AI Code ----------
-_buf_out, _buf_err =
+_buf_out, _buf_err = _std_io.StringIO(), _std_io.StringIO()
 with contextlib.redirect_stdout(_buf_out), contextlib.redirect_stderr(_buf_err):
-#
+# Exact result blocks (already cleaned by kernel_manager)
 result_html = rendered_html if rendered_html.strip() else "<pre>No output.</pre>"

-#
+# Commentary (we already have the raw HTML via wrap_html)
 commentary_html = wrap_html(commentary_text)

 code_html = _render_code_block("AI Generated Code", ai_code)

-full_body_html = "\n" + askai_question + "\n" + result_html + "\n" +
+full_body_html = "\n" + askai_question + "\n" + result_html + "\n" + code_html + "\n" + commentary_html

 html_doc = (
 "<!doctype html>"
@@ -5576,7 +5664,7 @@ def setup_routes(smx):

 _last_result_html[session_id] = html_doc

-#
+# Append a single download button (explicit click → fetch → download)
 download_url = url_for("download_result_html", session_id=session_id)
 dl_html = f"""
 <a href="{download_url}">
@@ -5589,79 +5677,797 @@ def setup_routes(smx):
 """
 ai_outputs.append(Markup(dl_html))

-################################################################
-
-
 # --- EDA/static cells ---
+# Display helper: coerce integer-like float columns to Int64 just for rendering
+def _coerce_intlike_for_display(df_in: pd.DataFrame, per_cell: bool = False, eps: float = 1e-9) -> pd.DataFrame:
+import numpy as np
+out = df_in.copy()
+if per_cell:
+def _maybe(v):
+try:
+fv = float(v)
+except Exception:
+return v
+if pd.notnull(v) and np.isfinite(fv) and abs(fv - round(fv)) <= eps:
+return int(round(fv))
+return v
+return out.applymap(_maybe)
+# column-wise mode (original behaviour for previews)
+for c in out.columns:
+s = out[c]
+if pd.api.types.is_float_dtype(s):
+vals = s.dropna().to_numpy()
+if vals.size and np.isfinite(vals).all() and np.allclose(vals, np.round(vals), rtol=0, atol=eps):
+out[c] = s.round().astype("Int64")
+return out
+
 data_cells = []
+max_rows = 5000
+max_cols = 80
 if df is not None:
-
-ds = selected_dataset.replace("_"," ").replace(".csv","").capitalize()
+df = eda_df
+ds = (selected_dataset or "").replace("_", " ").replace(".csv", "").capitalize()
+
+# 1) Dataset Overview (stat cards)
+rows, cols = df.shape
+mem_bytes = int(df.memory_usage(deep=True).sum())
+mem_mb = round(mem_bytes / (1024 * 1024), 2)
+dup_rows = int(df.duplicated().sum())
+nunique_all = df.nunique(dropna=False)
+
+n = max(rows, 1)
+dtypes = df.dtypes.astype(str)
+nonnull = df.notnull().sum()
+miss_pct = (df.isnull().mean() * 100).round(1)
+uniques = df.nunique(dropna=True)
+uniq_ratio = (uniques / n).fillna(0.0)
+
+id_like, hi_card, consts, flags_col = [], [], [], []
+for c in df.columns:
+flags = []
+if uniques.get(c, 0) <= 1:
+flags.append("constant"); consts.append(c)
+if uniq_ratio.get(c, 0) >= 0.95 and "datetime" not in dtypes[c].lower():
+flags.append("id-like"); id_like.append(c)
+if dtypes[c].startswith("object") and uniq_ratio.get(c, 0) > 0.5 and c not in id_like:
+flags.append("high-card"); hi_card.append(c)
+flags_col.append(", ".join(flags))
+
+_stats_code = (
+"rows, cols = df.shape\n"
+"mem_bytes = int(df.memory_usage(deep=True).sum())\n"
+"mem_mb = round(mem_bytes / (1024*1024), 2)\n"
+)
+
+_stats_html = f"""
+<style>
+.smx-statwrap{{display:grid;grid-template-columns:repeat(auto-fit,minmax(140px,1fr));gap:10px}}
+.smx-stat{{background:#fff;border:1px solid #e5e7eb;border-radius:10px;padding:10px 12px;text-align:center}}
+.smx-stat h4{{margin:0 0 4px;font-size:.9rem}}
+.smx-stat div{{font-weight:700;font-size:1.05rem}}
+</style>
+<div class="smx-statwrap">
+<div class="smx-stat"><h4>Rows</h4><div>{rows:,}</div></div>
+<div class="smx-stat"><h4>Columns</h4><div>{cols:,}</div></div>
+<div class="smx-stat"><h4>Memory (MB)</h4><div>{mem_mb}</div></div>
+</div>
+"""
+data_cells.append({
+"title": f"{ds} Overview",
+"output": Markup(_stats_html),
+"code": _stats_code,
+"span":"eda-col-8"
+})
+
+# 2) Integrity Notes — with "Show all" toggle
+notes = []
+if id_like:
+notes.append(f"ID-like columns: {', '.join(map(str, id_like[:6]))}{'…' if len(id_like)>6 else ''}")
+if hi_card:
+notes.append(f"High-cardinality categoricals: {', '.join(map(str, hi_card[:6]))}{'…' if len(hi_card)>6 else ''}")
+if consts:
+notes.append(f"Constant columns: {', '.join(map(str, consts[:6]))}{'…' if len(consts)>6 else ''}")
+
+# Build full flagged table
+flag_rows = []
+for c in df.columns:
+f = []
+if c in id_like: f.append("id-like")
+if c in hi_card: f.append("high-card")
+if c in consts: f.append("constant")
+if f:
+flag_rows.append({
+"Column": c,
+"Flags": ", ".join(f),
+"Type": dtypes[c],
+"Unique Values": int(uniques.get(c, 0)),
+"Unique Ratio": float(uniq_ratio.get(c, 0)),
+"Missing (%)": float(miss_pct.get(c, 0)),
+})
+flagged_df = pd.DataFrame(flag_rows)
+flagged_df = flagged_df.sort_values(["Flags","Column"]) if not flagged_df.empty else flagged_df
+
+# Render notes + toggle
+notes_html = (
+"<ul style='margin:0;padding-left:18px;'>" +
+"".join([f"<li>{n}</li>" for n in notes]) +
+"</ul>"
+) if notes else "<em>No obvious integrity flags.</em>"
+
+if not flagged_df.empty:
+table_html = datatable_box(flagged_df)
+body_html = (
+notes_html +
+f"<details style='margin-top:8px;'><summary>Show all flagged columns ({len(flagged_df)})</summary>"
+f"<div style='margin-top:8px;'>{table_html}</div></details>"
+)
+else:
+body_html = notes_html
+
 data_cells.append({
-"title":
-"output":
-"code":
+"title": "Integrity Notes",
+"output": Markup(body_html),
+"code": (
+"# Build Integrity Notes lists and full flagged table\n"
+"flag_rows = []\n"
+"for c in df.columns:\n"
+" f = []\n"
+" if c in id_like: f.append('id-like')\n"
+" if c in hi_card: f.append('high-card')\n"
+" if c in consts: f.append('constant')\n"
+" if f:\n"
+" flag_rows.append({\n"
+" 'Column': c,\n"
+" 'Flags': ', '.join(f),\n"
+" 'Type': dtypes[c],\n"
+" 'Unique Values': int(uniques.get(c,0)),\n"
+" 'Unique Ratio': float(uniq_ratio.get(c,0)),\n"
+" 'Missing (%)': float(miss_pct.get(c,0))\n"
+" })\n"
+"flagged_df = pd.DataFrame(flag_rows)\n"
+"flagged_df"
+),
+"span":"eda-col-4"
 })
-
+
+# 3) Data Preview
+preview_cols = df.columns
+preview_df = _coerce_intlike_for_display(df[preview_cols].head(8))
 data_cells.append({
 "title": "Data Preview",
-"output": Markup(datatable_box(
-"code": f"df[{list(preview_cols)}].head(8)"
+"output": Markup(datatable_box(preview_df)),
+"code": f"df[{list(preview_cols)}].head(8)",
+"span": "eda-col-6"
 })
+
+# 4) Summary Statistics
+summary_cols = df.columns
+summary_df = _coerce_intlike_for_display(df[summary_cols].describe())
 data_cells.append({
 "title": "Summary Statistics",
-"output": Markup(datatable_box(
-"code": "df.describe()"
+"output": Markup(datatable_box(summary_df)),
+"code": f"df[{list(summary_cols)}].describe()",
+"span": "eda-col-6"
+})
+
+# 5) Column Profile
+def _sample_vals(s, k=3):
+try:
+vals = pd.unique(s.dropna().astype(str))[:k]
+return ", ".join(map(str, vals))
+except Exception:
+return ""
+
+profile_df = pd.DataFrame({
+"Column": df.columns,
+"Type": dtypes.values,
+"Non-Null Count": nonnull.values,
+"Missing (%)": miss_pct.values,
+"Unique Values": uniques.values,
+"Sample Values": [ _sample_vals(df[c]) for c in df.columns ],
+"Flags": flags_col
+})
+data_cells.append({
+"title": "Column Profile",
+"output": Markup(datatable_box(profile_df)),
+"code": (
+"dtypes = df.dtypes.astype(str)\n"
+"nonnull = df.notnull().sum()\n"
+"miss_pct = (df.isnull().mean()*100).round(1)\n"
+"uniques = df.nunique(dropna=True)\n"
+"n = max(len(df), 1)\n"
+"uniq_ratio = (uniques / n).fillna(0.0)\n"
+"def _sample_vals(s, k=3):\n"
+" vals = pd.unique(s.dropna().astype(str))[:k]\n"
+" return ', '.join(map(str, vals)) if len(vals) else ''\n"
+"flags_col = []\n"
+"for c in df.columns:\n"
+" flags=[]\n"
+" if uniques.get(c,0) <= 1: flags.append('constant')\n"
+" if uniq_ratio.get(c,0) >= 0.95 and 'datetime' not in dtypes[c].lower(): flags.append('id-like')\n"
+" if dtypes[c].startswith('object') and uniq_ratio.get(c,0) > 0.5 and 'id-like' not in flags: flags.append('high-card')\n"
+" flags_col.append(', '.join(flags))\n"
+"profile_df = pd.DataFrame({\n"
+" 'Column': df.columns,\n"
+" 'Type': dtypes.values,\n"
+" 'Non-Null Count': nonnull.values,\n"
+" 'Missing (%)': miss_pct.values,\n"
+" 'Unique Values': uniques.values,\n"
+" 'Sample Values': [ _sample_vals(df[c]) for c in df.columns ],\n"
+" 'Flags': flags_col\n"
+"})\n"
+"profile_df"
+),
+"span":"eda-col-6"
 })
+
+# 6) Column Types
+dtype_df = pd.DataFrame({
+"Column": df.columns,
+"Type": df.dtypes.astype(str).values,
+"Non-Null Count": df.notnull().sum().values,
+"Unique Values": df.nunique().values
+})
+data_cells.append({
+"title": "Column Types",
+"output": Markup(datatable_box(dtype_df)),
+"code": (
+"pd.DataFrame({\n"
+" 'Column': df.columns,\n"
+" 'Type': df.dtypes.astype(str).values,\n"
+" 'Non-Null Count': df.notnull().sum().values,\n"
+" 'Unique Values': df.nunique().values\n"
+"})"
+),
+"span":"eda-col-6"
+})
+
+# 7) Outliers — Top 3 records (robust MAD score, capped 5k×80)
+try:
+import numpy as np
+
+num_cols_all = df.select_dtypes(include="number").columns.tolist()
+if len(num_cols_all) >= 1:
+num_cols = num_cols_all[:max_cols]  # use your cap (80)
+df_num = df[num_cols].copy()
+
+# cap rows for speed (5k)
+if len(df_num) > max_rows:
+df_num = df_num.sample(max_rows, random_state=0)
+
+# robust z: 0.6745 * (x - median) / MAD (MAD==0 → NaN)
+med = df_num.median(numeric_only=True)
+mad = (df_num - med).abs().median(numeric_only=True)
+rz = 0.6745 * (df_num - med) / mad.replace(0, np.nan)
+
+abs_rz = rz.abs()
+row_score = abs_rz.max(axis=1, skipna=True)  # strongest dev across features
+top_idx = row_score.nlargest(3).index.tolist()
+
+# Build compact, mobile-friendly cards for the top 3 rows
+cards_html = []
+for ridx in top_idx:
+# top contributing columns for this row
+contrib = abs_rz.loc[ridx].dropna().sort_values(ascending=False).head(5)
+maxv = float(contrib.iloc[0]) if len(contrib) else 0.0
+
+bars = []
+for c, v in contrib.items():
+pct = 0.0 if maxv <= 0 else min(100.0, float(v) / maxv * 100.0)
+bars.append(f"""
+<div class="barrow">
+<span class="cname">{html.escape(str(c))}</span>
+<div class="bar"><div class="fill" style="width:{pct:.1f}%"></div></div>
+<span class="score">{v:.2f}</span>
+</div>
+""")
+
+bars_html = "".join(bars) if bars else "<em>No strong single-column contributors.</em>"
+
+# show the full record (all columns) with horizontal scroll
+row_vals = df.loc[ridx, :].to_dict()
+row_tbl = datatable_box(pd.DataFrame([row_vals]))
+
+score_val = float(row_score.loc[ridx]) if pd.notnull(row_score.loc[ridx]) else 0.0
+title_idx = int(ridx) if isinstance(ridx, (int, np.integer)) else html.escape(str(ridx))
+
+cards_html.append(f"""
+<div class="mad-card">
+<div class="mad-title">Row index: {title_idx} · score: {score_val:.2f}</div>
+<div class="mad-bars">{bars_html}</div>
+<div class="mad-row">{row_tbl}</div>
+</div>
+""")
+
+grid_html = f"""
+<style>
+.mad-grid{{display:grid;grid-template-columns:repeat(3,minmax(0,1fr));gap:10px}}
+@media(max-width:1024px){{.mad-grid{{grid-template-columns:repeat(2,minmax(0,1fr))}}}}
+@media(max-width:640px){{.mad-grid{{grid-template-columns:repeat(1,minmax(0,1fr))}}}}
+.mad-card{{background:#fff;border:1px solid #e5e7eb;border-radius:10px;padding:8px 10px}}
+.mad-title{{font-weight:600;margin-bottom:6px}}
+.mad-bars .barrow{{display:grid;grid-template-columns:140px 1fr 46px;gap:6px;align-items:center;margin:4px 0}}
+.mad-bars .bar{{background:#eef2f7;border-radius:6px;height:8px;overflow:hidden}}
+.mad-bars .fill{{background:#0b8ae5;height:8px}}
+.mad-bars .cname{{font-size:12px;color:#444;white-space:nowrap;overflow:hidden;text-overflow:ellipsis}}
+.mad-bars .score{{font-size:12px;color:#333;text-align:right}}
+.mad-row .smx-table{{font-size:12px}}
+</style>
+<div class="mad-grid">{''.join(cards_html)}</div>
+"""
+
+data_cells.append({
+"title": "Outliers — Top 3 records",
+"output": Markup(grid_html),
+"code": (
+"num_cols = df.select_dtypes(include='number').columns.tolist()[:max_cols]\n"
+"df_num = df[num_cols]\n"
+"df_num = df_num.sample(max_rows, random_state=0) if len(df_num) > max_rows else df_num\n"
+"med = df_num.median(); mad = (df_num - med).abs().median()\n"
+"rz = 0.6745 * (df_num - med) / mad.replace(0, np.nan)\n"
+"row_score = rz.abs().max(axis=1)\n"
+"top3 = row_score.nlargest(3)\n"
+),
+"span": "eda-col-12"
+})
+else:
+data_cells.append({
+"title": "Outliers — Top 3 records (robust MAD score)",
+"output": "<em>No numeric columns available.</em>",
+"code": "# no numeric columns",
+"span": "eda-col-6"
+})
+except Exception as _e:
+data_cells.append({
+"title": "Outliers — Top 3 records (robust MAD score)",
+"output": f"<em>Could not compute robust outliers: {html.escape(str(_e))}</em>",
+"code": "# error during robust outlier computation",
+"span": "eda-col-6"
+})
+
+# 8) Outliers — Violin + Box (Top 3 numerics by IQR outliers, capped 5k×80)
+try:
+num_outliers = 3
+num_cols_all = df.select_dtypes(include="number").columns.tolist()
+if len(num_cols_all) >= 1:
+num_cols = num_cols_all[:max_cols]
+dfn = df[num_cols].copy()
+
+# cap rows for speed (5k)
+if len(dfn) > max_rows:
+dfn = dfn.sample(max_rows, random_state=0)
+
+# rank columns by number of Tukey outliers (1.5*IQR)
+ranks = []
+for c in dfn.columns:
+s = pd.to_numeric(dfn[c], errors="coerce").dropna()
+if s.empty:
+ranks.append((c, 0, 0.0))
+continue
+q1 = s.quantile(0.25); q3 = s.quantile(0.75)
+iqr = float(q3 - q1)
+if iqr <= 0:
+ranks.append((c, 0, 0.0))
+continue
+lower = q1 - 1.5 * iqr
+upper = q3 + 1.5 * iqr
+out_count = int(((s < lower) | (s > upper)).sum())
+ranks.append((c, out_count, float(iqr)))
+
+# choose top 6 (break ties by IQR spread)
+sel_cols = [c for c, _, _ in sorted(ranks, key=lambda x: (-x[1], -x[2]))[:num_outliers]]
+if not sel_cols:
+raise ValueError("No numeric columns have spread for violin plots.")
+
+# package data for JS (values only; thresholds for display)
+charts = []
+for c in sel_cols:
+s = pd.to_numeric(dfn[c], errors="coerce").dropna()
+if s.empty:
+continue
+q1 = s.quantile(0.25); q3 = s.quantile(0.75); iqr = q3 - q1
+lower = float(q1 - 1.5 * iqr); upper = float(q3 + 1.5 * iqr)
+out_count = int(((s < lower) | (s > upper)).sum())
+charts.append({
+"name": str(c),
+"values": [float(v) for v in s.tolist()],
+"lower": lower,
+"upper": upper,
+"n": int(s.size),
+"out": out_count
+})
+
+container_id = f"violgrid_{uuid.uuid4().hex}"
+sub_divs = "\n".join([f'<div id="{container_id}_{i}" class="vplot"></div>' for i in range(len(charts))])
+
+plot_html = f"""
+<style>
+/* mini-grid 3x2 → 2x? → 1x? */
+#{container_id}{{display:grid;grid-template-columns:repeat(3,minmax(0,1fr));gap:10px}}
+@media(max-width:1024px){{#{container_id}{{grid-template-columns:repeat(2,minmax(0,1fr))}}}}
+@media(max-width:640px){{#{container_id}{{grid-template-columns:repeat(1,minmax(0,1fr))}}}}
+/* each plot container – height set via JS for monotonic responsiveness */
+#{container_id} .vplot{{width:100%;}}
+</style>
+<div id="{container_id}">
+{sub_divs}
+</div>
+<script>
+(function(){{
+var charts = {json.dumps(charts)};
+
+function calcHeight(el){{
+var w = (el && el.clientWidth) || (el && el.parentElement && el.parentElement.clientWidth) || 360;
+// smooth, monotone: ~0.55×width, clamped
+return Math.round(Math.max(220, Math.min(360, w * 0.55)));
+}}
+
+function drawOne(target, data){{
+var el = document.getElementById(target);
+if(!el) return;
+var h = calcHeight(el);
+el.style.setProperty('height', h + 'px', 'important'); // defeat global height:auto
+
+var trace = {{
+type: 'violin',
+y: data.values,
+name: data.name,
+box: {{ visible: true }},
+meanline: {{ visible: true }},
+points: 'suspectedoutliers',
+hovertemplate: '%{{y}}<extra></extra>',
+showlegend: false
+}};
+
+var layout = {{
+margin: {{ l: 40, r: 10, t: 26, b: 28 }},
+title: {{ text: data.name + ' (n=' + data.n + ', out=' + data.out + ')', font: {{ size: 12 }} }},
+yaxis: {{ automargin: true }}
+}};
+
+var config = {{ displayModeBar: true, responsive: true }};
+if(window.Plotly && Plotly.newPlot){{
+Plotly.newPlot(el, [trace], layout, config).then(function(){{
+if(Plotly.Plots && Plotly.Plots.resize) Plotly.Plots.resize(el);
+}});
+}} else {{
+var p=document.createElement('div'); p.style.color='crimson'; p.style.marginTop='8px';
+p.textContent='Plotly is not loaded.'; el.appendChild(p);
+}}
+}}
+
+function drawAll(){{
+for(var i=0;i<charts.length;i++) drawOne("{container_id}_" + i, charts[i]);
+}}
+drawAll();
+window.addEventListener('resize', drawAll);
+}})();
+</script>
+"""
+
+data_cells.append({
+"title": "Outliers — Violin + Box (Top 3 numerics by IQR outliers)",
+"output": Markup(plot_html),
+"code": (
+"dfn = df.select_dtypes(include='number').iloc[:, :max_cols]\n"
+"dfn = dfn.sample(max_rows, random_state=0) if len(dfn) > max_rows else dfn\n"
+"# rank columns by Tukey outliers (1.5*IQR) and plot violins with inner box"
+),
+"span": "eda-col-12"
+})
+
+else:
+data_cells.append({
+"title": "Outliers — Violin + Box",
+"output": "<em>No numeric columns available.</em>",
+"code": "# no numeric columns",
+"span": "eda-col-6"
+})
+except Exception as _e:
+data_cells.append({
+"title": "Outliers — Violin + Box",
+"output": f"<em>Could not render violins: {html.escape(str(_e))}</em>",
+"code": "# error during violin rendering",
+"span": "eda-col-6"
+})
+
+# 9) Missing Values table
 nulls = df.isnull().sum()
 nulls_pct = (df.isnull().mean() * 100).round(1)
 missing_df = pd.DataFrame({
-"
-"Missing
+"Column": df.columns,
+"Missing Values": nulls.values,
+"Missing (%)": nulls_pct.values
 })
-missing = missing_df[missing_df["Missing Values"] > 0]
+missing = missing_df[missing_df["Missing Values"] > 0]
 data_cells.append({
 "title": "Missing Values",
 "output": Markup(datatable_box(missing)) if not missing.empty else "<em>No missing values detected.</em>",
 "code": (
 "nulls = df.isnull().sum()\n"
 "nulls_pct = (df.isnull().mean() * 100).round(1)\n"
-"missing_df = pd.DataFrame({
+"missing_df = pd.DataFrame({\n"
+" 'Column': df.columns,\n"
+" 'Missing Values': nulls.values,\n"
+" 'Missing (%)': nulls_pct.values\n"
+"})\\n"
 "missing_df[missing_df['Missing Values'] > 0]"
-)
+),
+"span":"eda-col-4"
 })
-dtype_df = pd.DataFrame({
-"Type": df.dtypes.astype(str),
-"Non-Null Count": df.notnull().sum(),
-"Unique Values": df.nunique()
-})
-data_cells.append({
-"title": "Column Types",

-
-
-
-"
-"
-"
-
+# 9) Missingness (Top 20) – Plotly bar chart
+if not missing.empty:
+top_miss = (
+missing_df[missing_df["Missing Values"] > 0]
+.sort_values("Missing (%)", ascending=False)
+.loc[:, ["Column", "Missing (%)"]]
+.head(20)
+.reset_index(drop=True)
 )
-
+
+container_id = f"miss_plot_{uuid.uuid4().hex}"
+x_vals = [html.escape(str(c)) for c in top_miss["Column"].tolist()]
+y_vals = [float(v) for v in top_miss["Missing (%)"].tolist()]
+
+plot_html = f"""
+<div id="{container_id}" style="width:100%;height:340px;"></div>
+<script>
+(function(){{
+var x = {json.dumps(x_vals)};
+var y = {json.dumps(y_vals)};
+var data = [{{
+type: 'bar',
+x: x,
+y: y,
+hovertemplate: '%{{x}}<br>Missing: %{{y:.1f}}%<extra></extra>'
+}}];
+var layout = {{
+margin: {{l:50, r:20, t:10, b:100}},
+yaxis: {{ title: 'Missing (%)', rangemode: 'tozero' }},
+xaxis: {{ title: 'Column', tickangle: -45 }}
+}};
+if (window.Plotly && Plotly.newPlot) {{
+Plotly.newPlot("{container_id}", data, layout, {{displayModeBar:true, responsive:true}});
+}} else {{
+var p=document.createElement('div'); p.style.color='crimson'; p.style.marginTop='8px';
+p.textContent='Plotly is not loaded.'; document.getElementById("{container_id}").appendChild(p);
+}}
+}})();
+</script>
+"""
+data_cells.append({
+"title": "Missingness (Top 20)",
+"output": Markup(plot_html),
+"code": (
+"nulls = df.isnull().sum();\n"
+"nulls_pct = (\n"
+" df.isnull().mean()*100\n"
+").round(1)\n"
+"missing_df = pd.DataFrame({\n"
+" 'Column': df.columns,\n"
+" 'Missing Values': nulls.values,\n"
+" 'Missing (%)': nulls_pct.values\n"
+"})\n\n"
+"top_miss = (\n"
+" missing_df[missing_df['Missing Values'] > 0]\n"
+" .sort_values('Missing (%)', ascending=False)\n"
+" .loc[:, ['Column', 'Missing (%)']]\n"
+" .head(20)\n"
+" .reset_index(drop=True)\n"
+")\n"
+"top_miss"
+),
+"span":"eda-col-4"
+})
+
+# 11 Category Distribution — 3D doughnut (dataset-agnostic, capped 5k)
+try:
+# 1) Column universe: object / category / bool (integers remain numeric)
+cat_cols_all = df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
+
+# 2) Honour user pick if categorical; otherwise auto-pick a sensible default
+dist_param = (request.args.get("dist") or request.form.get("dist") or "").strip()
+if dist_param and dist_param in cat_cols_all:
+dist_col = dist_param
+else:
+# Auto-pick preference: 3–20 unique values excluding obvious ID-like;
+# else allow 2-level; else first categorical.
+n_total = len(df)
+uniques_loc = df.nunique(dropna=True)
+miss_pct_loc = (df.isnull().mean() * 100).round(1)
+id_like_loc = {c for c in cat_cols_all if n_total > 0 and (uniques_loc.get(c, 0) / n_total) >= 0.95}
+
+multilevel = [c for c in cat_cols_all
+if (3 <= int(uniques_loc.get(c, df[c].nunique(dropna=True))) <= 20)
+and (c not in id_like_loc)]
+if multilevel:
+# score nearer 8 levels and lower missingness
+best, best_score = "", -1e9
+for c in multilevel:
+k = int(uniques_loc.get(c, df[c].nunique(dropna=True)))
+miss = float(miss_pct_loc.get(c, (df[c].isna().mean() * 100)))
+score = -abs(k - 8) - (miss / 10.0)
+if score > best_score:
+best, best_score = c, score
+dist_col = best
+else:
+twolevel = [c for c in cat_cols_all if int(uniques_loc.get(c, df[c].nunique(dropna=True))) == 2]
+dist_col = (twolevel[0] if twolevel else (cat_cols_all[0] if cat_cols_all else ""))
+
+# 3) Build options AFTER dist_col is final (so selection sticks)
+opts = []
+for c in cat_cols_all:
+sel = " selected" if c == dist_col else ""
+opts.append(f'<option value="{html.escape(str(c))}"{sel}>{html.escape(str(c))}</option>')
+opts_html = "\n".join(opts)
+
+form_html = f"""
+<a id="dist3d"></a>
+<form method="get" action="/dashboard#dist3d"
+style="display:flex; flex-wrap:wrap; gap:10px; align-items:center; margin-bottom:8px;">
+<input type="hidden" name="section" value="explore">
+<input type="hidden" name="dataset" value="{html.escape(str(selected_dataset or ''))}">
+<label><strong>Distribution column:</strong></label>
+<select name="dist" onchange="this.form.submit()" style="min-width:200px; height:28px;">
+{opts_html}
+</select>
+</form>
+"""
+
+if dist_col:
+s = df[dist_col]
+# cap cheap counting to 5k
+if len(s) > 5000:
+s = s.sample(5000, random_state=0)
+
+# 4) Robust counting: treat NaN as "Missing", stringify labels for safety
+s = s.astype("object")
+s = s.where(~s.isna(), other="Missing")
+vc = s.value_counts(dropna=False)
+
+if vc.empty:
+raise ValueError("No values to display for the selected column.")
+
+# Top-8 + 'Other' (excluding 'Missing' which we keep separate)
+top_k = 8
+non_missing = vc.drop(index=["Missing"], errors="ignore") if "Missing" in vc.index else vc
+head = non_missing.sort_values(ascending=False).head(top_k)
+other = int(non_missing.iloc[top_k:].sum()) if len(non_missing) > top_k else 0
+miss = int(vc.get("Missing", 0))
+
+labels = [str(x) for x in head.index.tolist()]
+values = [int(v) for v in head.values.tolist()]
+if other > 0:
+labels.append("Other"); values.append(other)
+if miss > 0:
+labels.append("Missing"); values.append(miss)
+
+# colours for faux 3D (no external deps)
+k = len(labels)
+def _hsl(i, n, l=0.58, s=0.62):
+h = (i / max(1, n)) * 360.0
+return f"hsl({int(h)}, {int(s*100)}%, {int(l*100)}%)"
+top_colors = [_hsl(i, k, l=0.58) for i in range(k)]
+base_colors = [_hsl(i, k, l=0.40) for i in range(k)]
+
+container_id = f"dist3d_{uuid.uuid4().hex}"
+total = int(sum(values))
+
+plot_html = f"""
+<div id="{container_id}" class="dist3d-chart"></div>
+<script>
+(function(){{
+var el = document.getElementById("{container_id}");
+var labels = {json.dumps(labels)};
+var values = {json.dumps(values)};
+var total = {total};
+
+var base = {{
+type: 'pie', labels: labels, values: values,
+hole: 0.64, sort: false, textinfo: 'none', hoverinfo: 'skip',
+marker: {{ colors: {json.dumps(base_colors)} }},
+showlegend: false
+}};
+var top = {{
+type: 'pie', labels: labels, values: values,
+hole: 0.52, sort: false,
+textinfo: 'percent', textposition: 'inside', insidetextorientation: 'radial',
+hovertemplate: '%{{label}}<br>%{{value}} of {total:,} (%{{percent}})<extra></extra>',
+marker: {{ colors: {json.dumps(top_colors)}, line: {{ width: 1, color: 'rgba(0,0,0,0.25)' }} }},
+showlegend: true, legendgroup: 'dist'
+}};
+
+function parentWidth(){{
+return (el && el.parentElement ? el.parentElement.clientWidth : (window.innerWidth||360));
+}}
+
+// Smooth, monotonic: height = 0.65 * width, clamped [220, 520].
+function chartHeight(){{
+var w = parentWidth();
+return Math.round(Math.max(220, Math.min(520, w * 0.65)));
+}}
+
+function legendOrientation(){{
+return parentWidth() < 640 ? 'h' : 'v';
+}}
+
+function makeLayout(){{
+return {{
+margin: {{ l:10, r:10, t:10, b:10 }},
+legend: {{ orientation: legendOrientation(), x:1, xanchor:'right', y:1 }},
+uniformtext: {{ mode: 'hide', minsize: 10 }}
+}};
+}}
+
+function applySize(){{
+// Override global .plotly-graph-div {{ height:auto !important }}
+el.style.setProperty('height', chartHeight() + 'px', 'important');
+if (window.Plotly) {{
+Plotly.relayout(el, {{ 'legend.orientation': legendOrientation() }});
+Plotly.Plots.resize(el);
+}}
+}}
+
+if (window.Plotly && Plotly.newPlot) {{
+// Initial explicit height before draw
+el.style.setProperty('height', chartHeight() + 'px', 'important');
+Plotly.newPlot(el, [base, top], makeLayout(), {{ displayModeBar:true, responsive:true }})
+.then(function(){{ applySize(); }});
+window.addEventListener('resize', applySize);
+}} else {{
+var p=document.createElement('div'); p.style.color='crimson'; p.style.marginTop='8px';
+p.textContent='Plotly is not loaded.'; el.appendChild(p);
+}}
+}})();
+</script>
+"""
+
+data_cells.append({
+"title": f"Category Distribution — ({html.escape(dist_col)})",
+"output": Markup(form_html + plot_html),
+"code": (
+"dist_col = '<chosen categorical>'\n"
+"s = df[dist_col].astype('object').where(~df[dist_col].isna(), other='Missing')\n"
+"vc = s.value_counts(dropna=False)\n"
+"top_k = 8  # Top-8 + Other (+ Missing)\n"
+),
+"span": "eda-col-4"
+})
+else:
+data_cells.append({
+"title": "Category Distribution — 3D doughnut",
+"output": "<em>No categorical columns found.</em>",
+"code": "# no categorical columns",
+"span": "eda-col-4"
+})
+except Exception as _e:
+data_cells.append({
+"title": "Category Distribution — 3D doughnut",
+"output": f"<em>Could not render distribution: {html.escape(str(_e))}</em>",
+"code": "# error during distribution rendering",
+"span": "eda-col-4"
+})
+
 for cell in data_cells:
-
+cell["highlighted_code"] = Markup(_pygmentize(cell["code"]))
+
 highlighted_ai_code = _pygmentize(ai_code)
+
 return render_template(
 "dashboard.html",
 section=section,
 datasets=datasets,
 selected_dataset=selected_dataset,
 ai_outputs=ai_outputs,
-ai_code=ai_code,
+ai_code=ai_code,
 highlighted_ai_code=highlighted_ai_code if ai_code else None,
 askai_question=smx.sanitize_rough_to_markdown_task(askai_question),
-refined_question=refined_question,
+refined_question=refined_question,
+tasks=tags,
 data_cells=data_cells,
 session_id=session_id,
+llm_usage=llm_usage
 )


@@ -5672,7 +6478,7 @@ def setup_routes(smx):
 if not html_doc:
 return ("No result available.", 404)

-buf =
+buf = _std_io.BytesIO(html_doc.encode("utf-8"))
 buf.seek(0)

 # keep a copy if you wish, or free it:
@@ -5744,7 +6550,7 @@ def setup_routes(smx):
 text = re.sub(r"<[^>]+>", " ", text)
 text = re.sub(r"\n{3,}", "\n\n", text)
 text = html.unescape(text).strip()
-buf =
+buf = _std_io.BytesIO()
 doc = SimpleDocTemplate(buf, pagesize=A4, leftMargin=16*mm, rightMargin=16*mm, topMargin=16*mm, bottomMargin=16*mm)
 styles = getSampleStyleSheet()
 flow = []