syntaxmatrix 2.5.1__py3-none-any.whl → 2.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
syntaxmatrix/routes.py CHANGED
@@ -1,18 +1,28 @@
1
- from prompt_toolkit import HTML
2
- from scipy import io
3
- import os, zipfile, time, uuid, werkzeug, queue, html, ast, re, threading, textwrap, json, pandas as pd
4
- from PyPDF2.errors import EmptyFileError
5
- import io
1
+ import os, zipfile, time, uuid, werkzeug, queue, html, ast, re
2
+ import threading, textwrap, json, pandas as pd
3
+ import contextlib
4
+
5
+ import io as _std_io
6
+
6
7
  from io import BytesIO
8
+ from scipy import io
9
+ from flask import Blueprint, Response, request, send_file, session
10
+ from flask import render_template, render_template_string, url_for, redirect, g
11
+ from flask import flash, jsonify, send_from_directory, get_flashed_messages, stream_with_context
12
+
13
+ from flask_login import current_user
14
+
7
15
  from PyPDF2 import PdfReader
8
16
  from markupsafe import Markup
9
- from urllib.parse import quote
17
+ from urllib.parse import quote
18
+ from datetime import datetime
19
+ from prompt_toolkit import HTML
20
+ from PyPDF2.errors import EmptyFileError
21
+ import numpy as np
10
22
  from .auth import register_user, authenticate, login_required, admin_required, superadmin_required
11
- from flask import Blueprint, Response, request, send_file, session, render_template, render_template_string, redirect, url_for, flash, jsonify, send_from_directory, get_flashed_messages, stream_with_context
12
23
 
13
24
  from syntaxmatrix.themes import DEFAULT_THEMES
14
25
  from syntaxmatrix import db
15
- from syntaxmatrix.utils import *
16
26
  from syntaxmatrix.vector_db import add_pdf_chunk
17
27
  from syntaxmatrix.file_processor import *
18
28
  from syntaxmatrix.vectorizer import embed_text
@@ -22,14 +32,13 @@ from syntaxmatrix.history_store import SQLHistoryStore, PersistentHistoryStore
22
32
  from syntaxmatrix.kernel_manager import SyntaxMatrixKernelManager, execute_code_in_kernel
23
33
  from syntaxmatrix.vector_db import *
24
34
  from syntaxmatrix.settings.string_navbar import string_navbar_items
25
- from syntaxmatrix.settings.model_map import PROVIDERS_MODELS, MODEL_DESCRIPTIONS, PURPOSE_TAGS, EMBEDDING_MODELS
26
- from .project_root import detect_project_root
27
- from . import generate_page as _genpage
28
- from . import auth as _auth
35
+ from syntaxmatrix.settings.model_map import GPT_MODELS_LATEST, PROVIDERS_MODELS, MODEL_DESCRIPTIONS, PURPOSE_TAGS, EMBEDDING_MODELS
36
+ from syntaxmatrix.project_root import detect_project_root
37
+ from syntaxmatrix import generate_page as _genpage
38
+ from syntaxmatrix import auth as _auth
29
39
  from syntaxmatrix import profiles as _prof
30
40
  from syntaxmatrix.gpt_models_latest import set_args, extract_output_text as _out
31
- from datetime import datetime
32
- import contextlib
41
+ from syntaxmatrix.agentic.agents import classify_ml_job_agent, refine_question_agent, text_formatter_agent
33
42
 
34
43
  try:
35
44
  from pygments import highlight as _hl
@@ -39,10 +48,16 @@ try:
39
48
  except Exception:
40
49
  _HAVE_PYGMENTS = False
41
50
 
42
- from flask_login import current_user
43
- from flask import g
51
+ # from syntaxmatrix.utils import *
52
+ from syntaxmatrix.utils import (
53
+ auto_inject_template, drop_bad_classification_metrics, ensure_accuracy_block,
54
+ ensure_image_output, ensure_output, fix_plain_prints, fix_print_html, patch_fix_sentinel_plot_calls,
55
+ patch_pairplot, fix_to_datetime_errors, harden_ai_code, patch_ensure_seaborn_import, get_plotting_imports,
56
+ patch_fix_seaborn_palette_calls, patch_quiet_specific_warnings, fix_seaborn_barplot_nameerror, fix_seaborn_boxplot_nameerror, ensure_matplotlib_title, patch_plot_code, patch_prefix_seaborn_calls, fix_scatter_and_summary, inject_auto_preprocessing, fix_importance_groupby, patch_pie_chart, patch_rmse_calls, clean_llm_code
57
+ )
44
58
 
45
- # app = Flask(__name__)
59
+ from syntaxmatrix.agentic.agent_tools import ToolRunner
60
+ from syntaxmatrix.agentic.code_tools_registry import EARLY_SANITIZERS, SYNTAX_AND_REPAIR
46
61
 
47
62
  _CLIENT_DIR = detect_project_root()
48
63
  _stream_q = queue.Queue()
@@ -482,9 +497,8 @@ def setup_routes(smx):
482
497
  padding: 2px 8px;
483
498
  color:cyan;
484
499
  }}
485
-
486
500
  </style>
487
-
501
+
488
502
  <!-- Add MathJax -->
489
503
  <script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
490
504
  <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
@@ -623,8 +637,8 @@ def setup_routes(smx):
623
637
 
624
638
  desktop_nav = f"""
625
639
  <div class="nav-left">
626
- <a class="logo" href="/" style="margin:0; padding:0;">{smx.site_logo}</a>
627
- <a class="logo" href="/" style="text-decoration:none; vertical-align="middle; margin:0 24px 0 0; padding:0px;">{smx.site_title}</a>
640
+ <a class="logo" href="/">{smx.site_logo}</a>
641
+ <a class="logo" href="/" style="text-decoration:none; margin:0 24px 0 0; padding:0px; vertical-align:middle;">{smx.site_title}</a>
628
642
  <div class="nav-links" style="margin-left:24px;">
629
643
  {nav_links}
630
644
  </div>
@@ -3767,10 +3781,13 @@ def setup_routes(smx):
3767
3781
 
3768
3782
 
3769
3783
  # if any live cached profile on smx matches this name, clear it
3770
- for attr in ("_chat_profile", "_admin_profile", "_coding_profile", "_classification_profile", "_summarization_profile", "_vision2text_profile"):
3784
+ db_profiles = prof.get_profiles()
3785
+ # for attr in ("_chat_profile", "_admin_profile", "_coding_profile", "_classification_profile", "_summarization_profile", "_vision2text_profile"):
3786
+ for attr in ([db_profiles]):
3771
3787
  prof = getattr(smx, attr, None)
3772
3788
  if isinstance(prof, dict) and prof.get("name") == name:
3773
3789
  setattr(smx, attr, {})
3790
+ prof.refresh_profiles_cache()
3774
3791
 
3775
3792
  elif action == "add_model":
3776
3793
  prov = request.form.get("catalog_provider","").strip()
@@ -5112,7 +5129,7 @@ def setup_routes(smx):
5112
5129
  rows = _auth.list_role_audit(limit=limit)
5113
5130
 
5114
5131
  import io, csv, datetime
5115
- buf = io.StringIO()
5132
+ buf = _std_io.StringIO()
5116
5133
  writer = csv.writer(buf)
5117
5134
  writer.writerow(["timestamp", "actor", "target", "from_role", "to_role"])
5118
5135
  for r in rows:
@@ -5373,25 +5390,28 @@ def setup_routes(smx):
5373
5390
  # ────────────────────────────────────────────────────────────────────────────────────────
5374
5391
  # DASHBOARD
5375
5392
  # ────────────────────────────────────────────────────────────────────────────────────────
5376
- # ── DASHBOARD VIEW DETAILS -----------------------------
5377
5393
  @smx.app.route("/dashboard", methods=["GET", "POST"])
5378
5394
  # @login_required
5379
5395
  def dashboard():
5380
5396
  DATA_FOLDER = os.path.join(_CLIENT_DIR, "uploads", "data")
5381
5397
  os.makedirs(DATA_FOLDER, exist_ok=True)
5382
-
5383
- ####################################################################
5384
5398
 
5385
- _CELL_REPAIR_RULES = """
5386
- Fix the Python cell to satisfy:
5387
- - Single valid cell; imports at the top.
5388
- - No top-level statements between if/elif/else branches.
5389
- - Regression must use either sklearn with train_test_split (then X_test exists) and R^2/MAE/RMSE, or statsmodels OLS. No accuracy_score in regression.
5390
- - Keep all plotting + savefig + BytesIO + display inside the branch that created the figure.
5391
- Return ONLY the corrected cell.
5392
- """
5393
-
5399
+ max_rows = 5000
5400
+ max_cols = 80
5401
+
5394
5402
  def _smx_repair_python_cell(py_code: str) -> str:
5403
+
5404
+ _CELL_REPAIR_RULES = """
5405
+ You are an experienced Python code reviewer
5406
+ Fix the Python cell to satisfy:
5407
+ - Single valid cell; imports at the top.
5408
+ - Do not import or invoke or use 'python-dotenv' or 'dotenv' because it's not needed.
5409
+ - No top-level statements between if/elif/else branches.
5410
+ - Regression must use either sklearn with train_test_split (then X_test exists) and R^2/MAE/RMSE,
5411
+ or statsmodels OLS. No accuracy_score in regression.
5412
+ - Keep all plotting + savefig + BytesIO + display inside the branch that created the figure.
5413
+ - Return ONLY the corrected cell.
5414
+ """
5395
5415
  code = textwrap.dedent(py_code or "").strip()
5396
5416
  needs_fix = False
5397
5417
  if re.search(r"\baccuracy_score\b", code) and re.search(r"\bLinearRegression\b|\bOLS\b", code):
@@ -5404,66 +5424,84 @@ def setup_routes(smx):
5404
5424
  needs_fix = True
5405
5425
  if not needs_fix:
5406
5426
  return code
5407
- prof = _prof.get_profile("coding") or _prof.get_profile("admin")
5427
+
5428
+ _prompt = f"```python\n{code}\n```"
5429
+
5430
+ prof = _prof.get_profile("classification") or _prof.get_profile("admin")
5408
5431
  if not prof:
5409
5432
  return code
5410
-
5411
- _prompt = f"```python\n{code}\n```"
5412
- _client = _prof.get_client(prof)
5433
+
5434
+ prof["client"] = _prof.get_client(prof)
5435
+ _client = prof["client"]
5413
5436
  _model = prof["model"]
5414
-
5415
- if prof['provider'] == "google":
5416
- fixed = _out(_client.models.generate_content(
5417
- model=_model,
5418
- contents=f"{_CELL_REPAIR_RULES}\n\n{_prompt}",
5419
- )
5420
- ).strip()
5437
+ _provider = prof["provider"].lower()
5438
+
5439
+ #1 Google
5440
+ if _provider == "google":
5441
+ from google.genai import types
5421
5442
 
5422
- elif prof["provider"] == "openai" and _model in smx.gpt_models_latest():
5443
+ fixed = _client.models.generate_content(
5444
+ model=_model,
5445
+ contents=_prompt,
5446
+ config=types.GenerateContentConfig(
5447
+ system_instruction=_CELL_REPAIR_RULES,
5448
+ temperature=0.8,
5449
+ max_output_tokens=1024,
5450
+ ),
5451
+ )
5452
+
5453
+ #2 Openai
5454
+ elif _provider == "openai" and _model in GPT_MODELS_LATEST:
5455
+
5423
5456
  args = set_args(
5424
- model=prof.get("model"),
5457
+ model=_model,
5425
5458
  instructions=_CELL_REPAIR_RULES,
5426
- input=_prompt,
5427
- previous_id=None,
5428
- store=False,
5429
- reasoning_effort="minimal",
5430
- verbosity="low"
5459
+ input=[{"role": "user", "content": _prompt}],
5460
+ previous_id=None,
5461
+ store=False,
5462
+ reasoning_effort="medium",
5463
+ verbosity="medium",
5431
5464
  )
5432
- fixed = _out(_client.responses.create(**args)).strip()
5433
-
5434
- elif prof["provider"] == "anthropic":
5435
- fixed = _out(_client.messages.create(
5436
- model=_model,
5437
- max_tokens=1024,
5438
- system=_CELL_REPAIR_RULES,
5439
- messages=[{"role": "user", "content":_prompt}]
5440
- )).strip()
5441
-
5442
- else:
5443
- fixed = _out(_client.chat.completions.create(
5444
- model=_model,
5445
- messages=[
5446
- {"role": "system", "content":_CELL_REPAIR_RULES},
5447
- {"role": "user", "content":_prompt},
5448
- ]
5449
- )
5450
- ).strip()
5451
-
5465
+ fixed = _out(_client.responses.create(**args))
5466
+
5467
+ # Anthropic
5468
+ elif _provider == "anthropic":
5469
+
5470
+ fixed = _client.messages.create(
5471
+ model=_model,
5472
+ max_tokens=1024,
5473
+ system=_CELL_REPAIR_RULES,
5474
+ messages=[{"role": "user", "content":_prompt}],
5475
+ stream=False,
5476
+ )
5477
+
5478
+ # OpenAI SDK
5479
+ else:
5480
+ fixed = _client.chat.completions.create(
5481
+ model=_model,
5482
+ messages=[
5483
+ {"role": "system", "content":_CELL_REPAIR_RULES},
5484
+ {"role": "user", "content":_prompt},
5485
+ ],
5486
+ max_tokens=1024,
5487
+ )
5488
+
5489
+ fixed_txt = clean_llm_code(fixed)
5490
+
5452
5491
  try:
5453
- ast.parse(fixed);
5454
- return fixed
5455
- except SyntaxError:
5492
+ # Only accept the repaired cell if it's valid Python
5493
+ ast.parse(fixed_txt)
5494
+ return fixed_txt
5495
+ except Exception:
5496
+ # If the repaired version is still broken, fall back to the original code
5456
5497
  return code
5457
- ################################################################
5458
-
5498
+
5459
5499
  section = request.args.get("section", "explore")
5460
5500
  datasets = [f for f in os.listdir(DATA_FOLDER) if f.lower().endswith(".csv")]
5461
5501
  selected_dataset = request.form.get("dataset") or request.args.get("dataset")
5462
5502
  if not selected_dataset and datasets:
5463
5503
  selected_dataset = datasets[0]
5464
5504
 
5465
- # selected_dataset = selected_dataset or ""
5466
-
5467
5505
  # Handle file upload
5468
5506
  if request.method == "POST" and "dataset_file" in request.files:
5469
5507
  f = request.files["dataset_file"]
@@ -5475,7 +5513,7 @@ def setup_routes(smx):
5475
5513
 
5476
5514
  # Load dataframe if available
5477
5515
  df = pd.read_csv(os.path.join(DATA_FOLDER, selected_dataset)) if selected_dataset else None
5478
-
5516
+
5479
5517
  # --- Jupyter kernel management ---
5480
5518
  session_id = session.get('smx_kernel_id')
5481
5519
  if not session_id:
@@ -5486,38 +5524,84 @@ def setup_routes(smx):
5486
5524
 
5487
5525
  # --- Handle Ask AI ---
5488
5526
  ai_outputs = []
5527
+ dl_html = ""
5489
5528
  askai_question = ""
5490
- refined_question = None
5529
+ refined_question = ""
5530
+ tags = []
5491
5531
  ai_code = None
5532
+ eda_df = df
5533
+ llm_usage = None
5492
5534
 
5493
5535
  if request.method == "POST" and "askai_question" in request.form:
5494
5536
  askai_question = request.form["askai_question"].strip()
5495
- if df is not None:
5537
+ if df is not None:
5538
+ CLEANED_FOLDER = str(selected_dataset).split(".")[0] + "_preprocessed"
5539
+ cleaned_path = os.path.join(DATA_FOLDER, CLEANED_FOLDER, "cleaned_df.csv")
5540
+ if os.path.exists(cleaned_path):
5541
+ df = pd.read_csv(cleaned_path, low_memory=False)
5542
+ else:
5543
+ from syntaxmatrix.dataset_preprocessing import ensure_cleaned_df
5544
+ df = ensure_cleaned_df(DATA_FOLDER, CLEANED_FOLDER, df) # writes cleaned_df.csv
5545
+
5546
+ # Build lightweight context
5547
+ columns_summary = ", ".join(df.columns.tolist())
5548
+ dataset_context = f"columns: {columns_summary}"
5549
+ dataset_profile = f"modality: tabular; columns: {columns_summary}"
5550
+
5551
+ refined_question = refine_question_agent(askai_question, dataset_context)
5552
+ tags = classify_ml_job_agent(refined_question, dataset_profile)
5553
+
5554
+ ai_code = smx.ai_generate_code(refined_question, tags, df)
5555
+ llm_usage = smx.get_last_llm_usage()
5556
+ ai_code = auto_inject_template(ai_code, tags, df)
5557
+
5558
+ # --- 1) Strip dotenv ASAP (kill imports, %magics, !pip) ---
5559
+ ctx = {
5560
+ "question": refined_question,
5561
+ "df_columns": list(df.columns),
5562
+ }
5563
+ ai_code = ToolRunner(EARLY_SANITIZERS).run(ai_code, ctx) # dotenv first
5496
5564
 
5497
- refined_question = refine_eda_question(askai_question, df)
5498
- intent = classify_ml_job(refined_question)
5499
- ai_code = smx.ai_generate_code(refined_question, intent, df)
5500
- ai_code = auto_inject_template(ai_code, intent, df)
5565
+ # --- 2) Domain/Plotting patches ---
5501
5566
  ai_code = fix_scatter_and_summary(ai_code)
5502
5567
  ai_code = fix_importance_groupby(ai_code)
5503
5568
  ai_code = inject_auto_preprocessing(ai_code)
5504
5569
  ai_code = patch_plot_code(ai_code, df, refined_question)
5570
+ ai_code = ensure_matplotlib_title(ai_code)
5571
+ ai_code = patch_pie_chart(ai_code, df, refined_question)
5505
5572
  ai_code = patch_pairplot(ai_code, df)
5573
+ ai_code = fix_seaborn_boxplot_nameerror(ai_code)
5574
+ ai_code = fix_seaborn_barplot_nameerror(ai_code)
5506
5575
  ai_code = get_plotting_imports(ai_code)
5507
- ai_code = ensure_image_output(ai_code)
5508
- ai_code = fix_numeric_sum(ai_code)
5576
+ ai_code = patch_prefix_seaborn_calls(ai_code)
5577
+ ai_code = patch_fix_sentinel_plot_calls(ai_code)
5578
+ ai_code = patch_ensure_seaborn_import(ai_code)
5579
+ ai_code = patch_rmse_calls(ai_code)
5580
+ ai_code = patch_fix_seaborn_palette_calls(ai_code)
5581
+ ai_code = patch_quiet_specific_warnings(ai_code)
5582
+ ai_code = clean_llm_code(ai_code)
5583
+ ai_code = ensure_image_output(ai_code)
5509
5584
  ai_code = ensure_accuracy_block(ai_code)
5510
5585
  ai_code = ensure_output(ai_code)
5511
5586
  ai_code = fix_plain_prints(ai_code)
5512
- ai_code = fix_print_html(ai_code)
5587
+ ai_code = fix_print_html(ai_code)
5513
5588
  ai_code = fix_to_datetime_errors(ai_code)
5589
+
5590
+ # --- 3-4) Global syntax/data fixers (must run AFTER patches, BEFORE final repair) ---
5591
+ ai_code = ToolRunner(SYNTAX_AND_REPAIR).run(ai_code, ctx)
5592
+
5593
+ # # --- 4) Final catch-all repair (run LAST) ---
5514
5594
  ai_code = _smx_repair_python_cell(ai_code)
5595
+ ai_code = harden_ai_code(ai_code)
5596
+ ai_code = drop_bad_classification_metrics(ai_code, df)
5597
+ ai_code = patch_fix_sentinel_plot_calls(ai_code)
5515
5598
 
5516
- # Always make sure 'df' is in the kernel before running user code
5599
+ # Always make sure 'df' is in the kernel before running user code
5517
5600
  df_init_code = (
5518
5601
  f"import pandas as pd\n"
5519
- f"df = pd.read_csv(r'''{os.path.join(DATA_FOLDER, selected_dataset)}''')"
5602
+ f"df = pd.read_csv(r'''{os.path.join(cleaned_path)}''')"
5520
5603
  )
5604
+
5521
5605
  execute_code_in_kernel(kc, df_init_code)
5522
5606
 
5523
5607
  outputs, errors = execute_code_in_kernel(kc, ai_code)
@@ -5530,7 +5614,6 @@ def setup_routes(smx):
5530
5614
  build_display_summary, phrase_commentary_vision, wrap_html
5531
5615
  )
5532
5616
 
5533
-
5534
5617
  # Probe axes/labels/legend
5535
5618
  probe1_out, probe1_err = execute_code_in_kernel(kc, MPL_PROBE_SNIPPET)
5536
5619
  axes_info = parse_mpl_probe_output([str(x) for x in (probe1_out + probe1_err)])
@@ -5547,7 +5630,7 @@ def setup_routes(smx):
5547
5630
  ################################################################
5548
5631
 
5549
5632
  # ----- Build a single HTML with Result + Commentary + AI Code ----------
5550
- _buf_out, _buf_err = io.StringIO(), io.StringIO()
5633
+ _buf_out, _buf_err = _std_io.StringIO(), _std_io.StringIO()
5551
5634
  with contextlib.redirect_stdout(_buf_out), contextlib.redirect_stderr(_buf_err):
5552
5635
  # Exact result blocks (already cleaned by kernel_manager)
5553
5636
  result_html = rendered_html if rendered_html.strip() else "<pre>No output.</pre>"
@@ -5594,79 +5677,797 @@ def setup_routes(smx):
5594
5677
  """
5595
5678
  ai_outputs.append(Markup(dl_html))
5596
5679
 
5597
- ################################################################
5598
-
5599
-
5600
5680
  # --- EDA/static cells ---
5681
+ # Display helper: coerce integer-like float columns to Int64 just for rendering
5682
+ def _coerce_intlike_for_display(df_in: pd.DataFrame, per_cell: bool = False, eps: float = 1e-9) -> pd.DataFrame:
5683
+ import numpy as np
5684
+ out = df_in.copy()
5685
+ if per_cell:
5686
+ def _maybe(v):
5687
+ try:
5688
+ fv = float(v)
5689
+ except Exception:
5690
+ return v
5691
+ if pd.notnull(v) and np.isfinite(fv) and abs(fv - round(fv)) <= eps:
5692
+ return int(round(fv))
5693
+ return v
5694
+ return out.applymap(_maybe)
5695
+ # column-wise mode (original behaviour for previews)
5696
+ for c in out.columns:
5697
+ s = out[c]
5698
+ if pd.api.types.is_float_dtype(s):
5699
+ vals = s.dropna().to_numpy()
5700
+ if vals.size and np.isfinite(vals).all() and np.allclose(vals, np.round(vals), rtol=0, atol=eps):
5701
+ out[c] = s.round().astype("Int64")
5702
+ return out
5703
+
5601
5704
  data_cells = []
5705
+ max_rows = 5000
5706
+ max_cols = 80
5602
5707
  if df is not None:
5603
- num_records = df.shape
5604
- ds = selected_dataset.replace("_"," ").replace(".csv","").capitalize()
5708
+ df = eda_df
5709
+ ds = (selected_dataset or "").replace("_", " ").replace(".csv", "").capitalize()
5710
+
5711
+ # 1) Dataset Overview (stat cards)
5712
+ rows, cols = df.shape
5713
+ mem_bytes = int(df.memory_usage(deep=True).sum())
5714
+ mem_mb = round(mem_bytes / (1024 * 1024), 2)
5715
+ dup_rows = int(df.duplicated().sum())
5716
+ nunique_all = df.nunique(dropna=False)
5717
+
5718
+ n = max(rows, 1)
5719
+ dtypes = df.dtypes.astype(str)
5720
+ nonnull = df.notnull().sum()
5721
+ miss_pct = (df.isnull().mean() * 100).round(1)
5722
+ uniques = df.nunique(dropna=True)
5723
+ uniq_ratio = (uniques / n).fillna(0.0)
5724
+
5725
+ id_like, hi_card, consts, flags_col = [], [], [], []
5726
+ for c in df.columns:
5727
+ flags = []
5728
+ if uniques.get(c, 0) <= 1:
5729
+ flags.append("constant"); consts.append(c)
5730
+ if uniq_ratio.get(c, 0) >= 0.95 and "datetime" not in dtypes[c].lower():
5731
+ flags.append("id-like"); id_like.append(c)
5732
+ if dtypes[c].startswith("object") and uniq_ratio.get(c, 0) > 0.5 and c not in id_like:
5733
+ flags.append("high-card"); hi_card.append(c)
5734
+ flags_col.append(", ".join(flags))
5735
+
5736
+ _stats_code = (
5737
+ "rows, cols = df.shape\n"
5738
+ "mem_bytes = int(df.memory_usage(deep=True).sum())\n"
5739
+ "mem_mb = round(mem_bytes / (1024*1024), 2)\n"
5740
+ )
5741
+
5742
+ _stats_html = f"""
5743
+ <style>
5744
+ .smx-statwrap{{display:grid;grid-template-columns:repeat(auto-fit,minmax(140px,1fr));gap:10px}}
5745
+ .smx-stat{{background:#fff;border:1px solid #e5e7eb;border-radius:10px;padding:10px 12px;text-align:center}}
5746
+ .smx-stat h4{{margin:0 0 4px;font-size:.9rem}}
5747
+ .smx-stat div{{font-weight:700;font-size:1.05rem}}
5748
+ </style>
5749
+ <div class="smx-statwrap">
5750
+ <div class="smx-stat"><h4>Rows</h4><div>{rows:,}</div></div>
5751
+ <div class="smx-stat"><h4>Columns</h4><div>{cols:,}</div></div>
5752
+ <div class="smx-stat"><h4>Memory (MB)</h4><div>{mem_mb}</div></div>
5753
+ </div>
5754
+ """
5605
5755
  data_cells.append({
5606
- "title": f"{ds} size",
5607
- "output": num_records,
5608
- "code": "df.shape"
5756
+ "title": f"{ds} Overview",
5757
+ "output": Markup(_stats_html),
5758
+ "code": _stats_code,
5759
+ "span":"eda-col-8"
5609
5760
  })
5761
+
5762
+ # 2) Integrity Notes — with "Show all" toggle
5763
+ notes = []
5764
+ if id_like:
5765
+ notes.append(f"ID-like columns: {', '.join(map(str, id_like[:6]))}{'…' if len(id_like)>6 else ''}")
5766
+ if hi_card:
5767
+ notes.append(f"High-cardinality categoricals: {', '.join(map(str, hi_card[:6]))}{'…' if len(hi_card)>6 else ''}")
5768
+ if consts:
5769
+ notes.append(f"Constant columns: {', '.join(map(str, consts[:6]))}{'…' if len(consts)>6 else ''}")
5770
+
5771
+ # Build full flagged table
5772
+ flag_rows = []
5773
+ for c in df.columns:
5774
+ f = []
5775
+ if c in id_like: f.append("id-like")
5776
+ if c in hi_card: f.append("high-card")
5777
+ if c in consts: f.append("constant")
5778
+ if f:
5779
+ flag_rows.append({
5780
+ "Column": c,
5781
+ "Flags": ", ".join(f),
5782
+ "Type": dtypes[c],
5783
+ "Unique Values": int(uniques.get(c, 0)),
5784
+ "Unique Ratio": float(uniq_ratio.get(c, 0)),
5785
+ "Missing (%)": float(miss_pct.get(c, 0)),
5786
+ })
5787
+ flagged_df = pd.DataFrame(flag_rows)
5788
+ flagged_df = flagged_df.sort_values(["Flags","Column"]) if not flagged_df.empty else flagged_df
5789
+
5790
+ # Render notes + toggle
5791
+ notes_html = (
5792
+ "<ul style='margin:0;padding-left:18px;'>" +
5793
+ "".join([f"<li>{n}</li>" for n in notes]) +
5794
+ "</ul>"
5795
+ ) if notes else "<em>No obvious integrity flags.</em>"
5796
+
5797
+ if not flagged_df.empty:
5798
+ table_html = datatable_box(flagged_df)
5799
+ body_html = (
5800
+ notes_html +
5801
+ f"<details style='margin-top:8px;'><summary>Show all flagged columns ({len(flagged_df)})</summary>"
5802
+ f"<div style='margin-top:8px;'>{table_html}</div></details>"
5803
+ )
5804
+ else:
5805
+ body_html = notes_html
5806
+
5807
+ data_cells.append({
5808
+ "title": "Integrity Notes",
5809
+ "output": Markup(body_html),
5810
+ "code": (
5811
+ "# Build Integrity Notes lists and full flagged table\n"
5812
+ "flag_rows = []\n"
5813
+ "for c in df.columns:\n"
5814
+ " f = []\n"
5815
+ " if c in id_like: f.append('id-like')\n"
5816
+ " if c in hi_card: f.append('high-card')\n"
5817
+ " if c in consts: f.append('constant')\n"
5818
+ " if f:\n"
5819
+ " flag_rows.append({\n"
5820
+ " 'Column': c,\n"
5821
+ " 'Flags': ', '.join(f),\n"
5822
+ " 'Type': dtypes[c],\n"
5823
+ " 'Unique Values': int(uniques.get(c,0)),\n"
5824
+ " 'Unique Ratio': float(uniq_ratio.get(c,0)),\n"
5825
+ " 'Missing (%)': float(miss_pct.get(c,0))\n"
5826
+ " })\n"
5827
+ "flagged_df = pd.DataFrame(flag_rows)\n"
5828
+ "flagged_df"
5829
+ ),
5830
+ "span":"eda-col-4"
5831
+ })
5832
+
5833
+ # 3) Data Preview
5610
5834
  preview_cols = df.columns
5835
+ preview_df = _coerce_intlike_for_display(df[preview_cols].head(8))
5611
5836
  data_cells.append({
5612
5837
  "title": "Data Preview",
5613
- "output": Markup(datatable_box(df[preview_cols].head(8))),
5614
- "code": f"df[{list(preview_cols)}].head(8)"
5838
+ "output": Markup(datatable_box(preview_df)),
5839
+ "code": f"df[{list(preview_cols)}].head(8)",
5840
+ "span": "eda-col-6"
5615
5841
  })
5842
+
5843
+ # 4) Summary Statistics
5844
+ summary_cols = df.columns
5845
+ summary_df = _coerce_intlike_for_display(df[summary_cols].describe())
5616
5846
  data_cells.append({
5617
5847
  "title": "Summary Statistics",
5618
- "output": Markup(datatable_box(df.describe())),
5619
- "code": "df.describe()"
5848
+ "output": Markup(datatable_box(summary_df)),
5849
+ "code": f"df[{list(summary_cols)}].describe()",
5850
+ "span": "eda-col-6"
5851
+ })
5852
+
5853
+ # 5) Column Profile
5854
+ def _sample_vals(s, k=3):
5855
+ try:
5856
+ vals = pd.unique(s.dropna().astype(str))[:k]
5857
+ return ", ".join(map(str, vals))
5858
+ except Exception:
5859
+ return ""
5860
+
5861
+ profile_df = pd.DataFrame({
5862
+ "Column": df.columns,
5863
+ "Type": dtypes.values,
5864
+ "Non-Null Count": nonnull.values,
5865
+ "Missing (%)": miss_pct.values,
5866
+ "Unique Values": uniques.values,
5867
+ "Sample Values": [ _sample_vals(df[c]) for c in df.columns ],
5868
+ "Flags": flags_col
5869
+ })
5870
+ data_cells.append({
5871
+ "title": "Column Profile",
5872
+ "output": Markup(datatable_box(profile_df)),
5873
+ "code": (
5874
+ "dtypes = df.dtypes.astype(str)\n"
5875
+ "nonnull = df.notnull().sum()\n"
5876
+ "miss_pct = (df.isnull().mean()*100).round(1)\n"
5877
+ "uniques = df.nunique(dropna=True)\n"
5878
+ "n = max(len(df), 1)\n"
5879
+ "uniq_ratio = (uniques / n).fillna(0.0)\n"
5880
+ "def _sample_vals(s, k=3):\n"
5881
+ " vals = pd.unique(s.dropna().astype(str))[:k]\n"
5882
+ " return ', '.join(map(str, vals)) if len(vals) else ''\n"
5883
+ "flags_col = []\n"
5884
+ "for c in df.columns:\n"
5885
+ " flags=[]\n"
5886
+ " if uniques.get(c,0) <= 1: flags.append('constant')\n"
5887
+ " if uniq_ratio.get(c,0) >= 0.95 and 'datetime' not in dtypes[c].lower(): flags.append('id-like')\n"
5888
+ " if dtypes[c].startswith('object') and uniq_ratio.get(c,0) > 0.5 and 'id-like' not in flags: flags.append('high-card')\n"
5889
+ " flags_col.append(', '.join(flags))\n"
5890
+ "profile_df = pd.DataFrame({\n"
5891
+ " 'Column': df.columns,\n"
5892
+ " 'Type': dtypes.values,\n"
5893
+ " 'Non-Null Count': nonnull.values,\n"
5894
+ " 'Missing (%)': miss_pct.values,\n"
5895
+ " 'Unique Values': uniques.values,\n"
5896
+ " 'Sample Values': [ _sample_vals(df[c]) for c in df.columns ],\n"
5897
+ " 'Flags': flags_col\n"
5898
+ "})\n"
5899
+ "profile_df"
5900
+ ),
5901
+ "span":"eda-col-6"
5620
5902
  })
5903
+
5904
+ # 6) Column Types
5905
+ dtype_df = pd.DataFrame({
5906
+ "Column": df.columns,
5907
+ "Type": df.dtypes.astype(str).values,
5908
+ "Non-Null Count": df.notnull().sum().values,
5909
+ "Unique Values": df.nunique().values
5910
+ })
5911
+ data_cells.append({
5912
+ "title": "Column Types",
5913
+ "output": Markup(datatable_box(dtype_df)),
5914
+ "code": (
5915
+ "pd.DataFrame({\n"
5916
+ " 'Column': df.columns,\n"
5917
+ " 'Type': df.dtypes.astype(str).values,\n"
5918
+ " 'Non-Null Count': df.notnull().sum().values,\n"
5919
+ " 'Unique Values': df.nunique().values\n"
5920
+ "})"
5921
+ ),
5922
+ "span":"eda-col-6"
5923
+ })
5924
+
5925
+ # 7) Outliers — Top 3 records (robust MAD score, capped 5k×80)
5926
+ try:
5927
+ import numpy as np
5928
+
5929
+ num_cols_all = df.select_dtypes(include="number").columns.tolist()
5930
+ if len(num_cols_all) >= 1:
5931
+ num_cols = num_cols_all[:max_cols] # use your cap (80)
5932
+ df_num = df[num_cols].copy()
5933
+
5934
+ # cap rows for speed (5k)
5935
+ if len(df_num) > max_rows:
5936
+ df_num = df_num.sample(max_rows, random_state=0)
5937
+
5938
+ # robust z: 0.6745 * (x - median) / MAD (MAD==0 → NaN)
5939
+ med = df_num.median(numeric_only=True)
5940
+ mad = (df_num - med).abs().median(numeric_only=True)
5941
+ rz = 0.6745 * (df_num - med) / mad.replace(0, np.nan)
5942
+
5943
+ abs_rz = rz.abs()
5944
+ row_score = abs_rz.max(axis=1, skipna=True) # strongest dev across features
5945
+ top_idx = row_score.nlargest(3).index.tolist()
5946
+
5947
+ # Build compact, mobile-friendly cards for the top 3 rows
5948
+ cards_html = []
5949
+ for ridx in top_idx:
5950
+ # top contributing columns for this row
5951
+ contrib = abs_rz.loc[ridx].dropna().sort_values(ascending=False).head(5)
5952
+ maxv = float(contrib.iloc[0]) if len(contrib) else 0.0
5953
+
5954
+ bars = []
5955
+ for c, v in contrib.items():
5956
+ pct = 0.0 if maxv <= 0 else min(100.0, float(v) / maxv * 100.0)
5957
+ bars.append(f"""
5958
+ <div class="barrow">
5959
+ <span class="cname">{html.escape(str(c))}</span>
5960
+ <div class="bar"><div class="fill" style="width:{pct:.1f}%"></div></div>
5961
+ <span class="score">{v:.2f}</span>
5962
+ </div>
5963
+ """)
5964
+
5965
+ bars_html = "".join(bars) if bars else "<em>No strong single-column contributors.</em>"
5966
+
5967
+ # show the full record (all columns) with horizontal scroll
5968
+ row_vals = df.loc[ridx, :].to_dict()
5969
+ row_tbl = datatable_box(pd.DataFrame([row_vals]))
5970
+
5971
+ score_val = float(row_score.loc[ridx]) if pd.notnull(row_score.loc[ridx]) else 0.0
5972
+ title_idx = int(ridx) if isinstance(ridx, (int, np.integer)) else html.escape(str(ridx))
5973
+
5974
+ cards_html.append(f"""
5975
+ <div class="mad-card">
5976
+ <div class="mad-title">Row index: {title_idx} · score: {score_val:.2f}</div>
5977
+ <div class="mad-bars">{bars_html}</div>
5978
+ <div class="mad-row">{row_tbl}</div>
5979
+ </div>
5980
+ """)
5981
+
5982
+ grid_html = f"""
5983
+ <style>
5984
+ .mad-grid{{display:grid;grid-template-columns:repeat(3,minmax(0,1fr));gap:10px}}
5985
+ @media(max-width:1024px){{.mad-grid{{grid-template-columns:repeat(2,minmax(0,1fr))}}}}
5986
+ @media(max-width:640px){{.mad-grid{{grid-template-columns:repeat(1,minmax(0,1fr))}}}}
5987
+ .mad-card{{background:#fff;border:1px solid #e5e7eb;border-radius:10px;padding:8px 10px}}
5988
+ .mad-title{{font-weight:600;margin-bottom:6px}}
5989
+ .mad-bars .barrow{{display:grid;grid-template-columns:140px 1fr 46px;gap:6px;align-items:center;margin:4px 0}}
5990
+ .mad-bars .bar{{background:#eef2f7;border-radius:6px;height:8px;overflow:hidden}}
5991
+ .mad-bars .fill{{background:#0b8ae5;height:8px}}
5992
+ .mad-bars .cname{{font-size:12px;color:#444;white-space:nowrap;overflow:hidden;text-overflow:ellipsis}}
5993
+ .mad-bars .score{{font-size:12px;color:#333;text-align:right}}
5994
+ .mad-row .smx-table{{font-size:12px}}
5995
+ </style>
5996
+ <div class="mad-grid">{''.join(cards_html)}</div>
5997
+ """
5998
+
5999
+ data_cells.append({
6000
+ "title": "Outliers — Top 3 records",
6001
+ "output": Markup(grid_html),
6002
+ "code": (
6003
+ "num_cols = df.select_dtypes(include='number').columns.tolist()[:max_cols]\n"
6004
+ "df_num = df[num_cols]\n"
6005
+ "df_num = df_num.sample(max_rows, random_state=0) if len(df_num) > max_rows else df_num\n"
6006
+ "med = df_num.median(); mad = (df_num - med).abs().median()\n"
6007
+ "rz = 0.6745 * (df_num - med) / mad.replace(0, np.nan)\n"
6008
+ "row_score = rz.abs().max(axis=1)\n"
6009
+ "top3 = row_score.nlargest(3)\n"
6010
+ ),
6011
+ "span": "eda-col-12"
6012
+ })
6013
+ else:
6014
+ data_cells.append({
6015
+ "title": "Outliers — Top 3 records (robust MAD score)",
6016
+ "output": "<em>No numeric columns available.</em>",
6017
+ "code": "# no numeric columns",
6018
+ "span": "eda-col-6"
6019
+ })
6020
+ except Exception as _e:
6021
+ data_cells.append({
6022
+ "title": "Outliers — Top 3 records (robust MAD score)",
6023
+ "output": f"<em>Could not compute robust outliers: {html.escape(str(_e))}</em>",
6024
+ "code": "# error during robust outlier computation",
6025
+ "span": "eda-col-6"
6026
+ })
6027
+
6028
+ # 8) Outliers — Violin + Box (Top 3 numerics by IQR outliers, capped 5k×80)
6029
+ try:
6030
+ num_outliers = 3
6031
+ num_cols_all = df.select_dtypes(include="number").columns.tolist()
6032
+ if len(num_cols_all) >= 1:
6033
+ num_cols = num_cols_all[:max_cols]
6034
+ dfn = df[num_cols].copy()
6035
+
6036
+ # cap rows for speed (5k)
6037
+ if len(dfn) > max_rows:
6038
+ dfn = dfn.sample(max_rows, random_state=0)
6039
+
6040
+ # rank columns by number of Tukey outliers (1.5*IQR)
6041
+ ranks = []
6042
+ for c in dfn.columns:
6043
+ s = pd.to_numeric(dfn[c], errors="coerce").dropna()
6044
+ if s.empty:
6045
+ ranks.append((c, 0, 0.0))
6046
+ continue
6047
+ q1 = s.quantile(0.25); q3 = s.quantile(0.75)
6048
+ iqr = float(q3 - q1)
6049
+ if iqr <= 0:
6050
+ ranks.append((c, 0, 0.0))
6051
+ continue
6052
+ lower = q1 - 1.5 * iqr
6053
+ upper = q3 + 1.5 * iqr
6054
+ out_count = int(((s < lower) | (s > upper)).sum())
6055
+ ranks.append((c, out_count, float(iqr)))
6056
+
6057
+ # choose the top `num_outliers` columns by outlier count (break ties by IQR spread)
6058
+ sel_cols = [c for c, _, _ in sorted(ranks, key=lambda x: (-x[1], -x[2]))[:num_outliers]]
6059
+ if not sel_cols:
6060
+ raise ValueError("No numeric columns have spread for violin plots.")
6061
+
6062
+ # package data for JS (values only; thresholds for display)
6063
+ charts = []
6064
+ for c in sel_cols:
6065
+ s = pd.to_numeric(dfn[c], errors="coerce").dropna()
6066
+ if s.empty:
6067
+ continue
6068
+ q1 = s.quantile(0.25); q3 = s.quantile(0.75); iqr = q3 - q1
6069
+ lower = float(q1 - 1.5 * iqr); upper = float(q3 + 1.5 * iqr)
6070
+ out_count = int(((s < lower) | (s > upper)).sum())
6071
+ charts.append({
6072
+ "name": str(c),
6073
+ "values": [float(v) for v in s.tolist()],
6074
+ "lower": lower,
6075
+ "upper": upper,
6076
+ "n": int(s.size),
6077
+ "out": out_count
6078
+ })
6079
+
6080
+ container_id = f"violgrid_{uuid.uuid4().hex}"
6081
+ sub_divs = "\n".join([f'<div id="{container_id}_{i}" class="vplot"></div>' for i in range(len(charts))])
6082
+
6083
+ plot_html = f"""
6084
+ <style>
6085
+ /* mini-grid 3x2 → 2x? → 1x? */
6086
+ #{container_id}{{display:grid;grid-template-columns:repeat(3,minmax(0,1fr));gap:10px}}
6087
+ @media(max-width:1024px){{#{container_id}{{grid-template-columns:repeat(2,minmax(0,1fr))}}}}
6088
+ @media(max-width:640px){{#{container_id}{{grid-template-columns:repeat(1,minmax(0,1fr))}}}}
6089
+ /* each plot container – height set via JS for monotonic responsiveness */
6090
+ #{container_id} .vplot{{width:100%;}}
6091
+ </style>
6092
+ <div id="{container_id}">
6093
+ {sub_divs}
6094
+ </div>
6095
+ <script>
6096
+ (function(){{
6097
+ var charts = {json.dumps(charts)};
6098
+
6099
+ function calcHeight(el){{
6100
+ var w = (el && el.clientWidth) || (el && el.parentElement && el.parentElement.clientWidth) || 360;
6101
+ // smooth, monotone: ~0.55×width, clamped
6102
+ return Math.round(Math.max(220, Math.min(360, w * 0.55)));
6103
+ }}
6104
+
6105
+ function drawOne(target, data){{
6106
+ var el = document.getElementById(target);
6107
+ if(!el) return;
6108
+ var h = calcHeight(el);
6109
+ el.style.setProperty('height', h + 'px', 'important'); // defeat global height:auto
6110
+
6111
+ var trace = {{
6112
+ type: 'violin',
6113
+ y: data.values,
6114
+ name: data.name,
6115
+ box: {{ visible: true }},
6116
+ meanline: {{ visible: true }},
6117
+ points: 'suspectedoutliers',
6118
+ hovertemplate: '%{{y}}<extra></extra>',
6119
+ showlegend: false
6120
+ }};
6121
+
6122
+ var layout = {{
6123
+ margin: {{ l: 40, r: 10, t: 26, b: 28 }},
6124
+ title: {{ text: data.name + ' (n=' + data.n + ', out=' + data.out + ')', font: {{ size: 12 }} }},
6125
+ yaxis: {{ automargin: true }}
6126
+ }};
6127
+
6128
+ var config = {{ displayModeBar: true, responsive: true }};
6129
+ if(window.Plotly && Plotly.newPlot){{
6130
+ Plotly.newPlot(el, [trace], layout, config).then(function(){{
6131
+ if(Plotly.Plots && Plotly.Plots.resize) Plotly.Plots.resize(el);
6132
+ }});
6133
+ }} else {{
6134
+ var p=document.createElement('div'); p.style.color='crimson'; p.style.marginTop='8px';
6135
+ p.textContent='Plotly is not loaded.'; el.appendChild(p);
6136
+ }}
6137
+ }}
6138
+
6139
+ function drawAll(){{
6140
+ for(var i=0;i<charts.length;i++) drawOne("{container_id}_" + i, charts[i]);
6141
+ }}
6142
+ drawAll();
6143
+ window.addEventListener('resize', drawAll);
6144
+ }})();
6145
+ </script>
6146
+ """
6147
+
6148
+ data_cells.append({
6149
+ "title": "Outliers — Violin + Box (Top 3 numerics by IQR outliers)",
6150
+ "output": Markup(plot_html),
6151
+ "code": (
6152
+ "dfn = df.select_dtypes(include='number').iloc[:, :max_cols]\n"
6153
+ "dfn = dfn.sample(max_rows, random_state=0) if len(dfn) > max_rows else dfn\n"
6154
+ "# rank columns by Tukey outliers (1.5*IQR) and plot violins with inner box"
6155
+ ),
6156
+ "span": "eda-col-12"
6157
+ })
6158
+
6159
+ else:
6160
+ data_cells.append({
6161
+ "title": "Outliers — Violin + Box",
6162
+ "output": "<em>No numeric columns available.</em>",
6163
+ "code": "# no numeric columns",
6164
+ "span": "eda-col-6"
6165
+ })
6166
+ except Exception as _e:
6167
+ data_cells.append({
6168
+ "title": "Outliers — Violin + Box",
6169
+ "output": f"<em>Could not render violins: {html.escape(str(_e))}</em>",
6170
+ "code": "# error during violin rendering",
6171
+ "span": "eda-col-6"
6172
+ })
6173
+
6174
+ # 9) Missing Values table
5621
6175
  nulls = df.isnull().sum()
5622
6176
  nulls_pct = (df.isnull().mean() * 100).round(1)
5623
6177
  missing_df = pd.DataFrame({
5624
- "Missing Values": nulls,
5625
- "Missing (%)": nulls_pct
6178
+ "Column": df.columns,
6179
+ "Missing Values": nulls.values,
6180
+ "Missing (%)": nulls_pct.values
5626
6181
  })
5627
- missing = missing_df[missing_df["Missing Values"] > 0]
6182
+ missing = missing_df[missing_df["Missing Values"] > 0]
5628
6183
  data_cells.append({
5629
6184
  "title": "Missing Values",
5630
6185
  "output": Markup(datatable_box(missing)) if not missing.empty else "<em>No missing values detected.</em>",
5631
6186
  "code": (
5632
6187
  "nulls = df.isnull().sum()\n"
5633
6188
  "nulls_pct = (df.isnull().mean() * 100).round(1)\n"
5634
- "missing_df = pd.DataFrame({'Missing Values': nulls, 'Missing (%)': nulls_pct})\n"
6189
+ "missing_df = pd.DataFrame({\n"
6190
+ " 'Column': df.columns,\n"
6191
+ " 'Missing Values': nulls.values,\n"
6192
+ " 'Missing (%)': nulls_pct.values\n"
6193
+ "})\\n"
5635
6194
  "missing_df[missing_df['Missing Values'] > 0]"
5636
- )
6195
+ ),
6196
+ "span":"eda-col-4"
5637
6197
  })
5638
- dtype_df = pd.DataFrame({
5639
- "Type": df.dtypes.astype(str),
5640
- "Non-Null Count": df.notnull().sum(),
5641
- "Unique Values": df.nunique()
5642
- })
5643
- data_cells.append({
5644
- "title": "Column Types",
5645
6198
 
5646
- "output": Markup(datatable_box(dtype_df)),
5647
- "code": (
5648
- "pd.DataFrame({\n"
5649
- " 'Type': df.dtypes.astype(str),\n"
5650
- " 'Non-Null Count': df.notnull().sum(),\n"
5651
- " 'Unique Values': df.nunique()\n"
5652
- "})"
6199
+ # 10) Missingness (Top 20) – Plotly bar chart
6200
+ if not missing.empty:
6201
+ top_miss = (
6202
+ missing_df[missing_df["Missing Values"] > 0]
6203
+ .sort_values("Missing (%)", ascending=False)
6204
+ .loc[:, ["Column", "Missing (%)"]]
6205
+ .head(20)
6206
+ .reset_index(drop=True)
5653
6207
  )
5654
- })
6208
+
6209
+ container_id = f"miss_plot_{uuid.uuid4().hex}"
6210
+ x_vals = [html.escape(str(c)) for c in top_miss["Column"].tolist()]
6211
+ y_vals = [float(v) for v in top_miss["Missing (%)"].tolist()]
6212
+
6213
+ plot_html = f"""
6214
+ <div id="{container_id}" style="width:100%;height:340px;"></div>
6215
+ <script>
6216
+ (function(){{
6217
+ var x = {json.dumps(x_vals)};
6218
+ var y = {json.dumps(y_vals)};
6219
+ var data = [{{
6220
+ type: 'bar',
6221
+ x: x,
6222
+ y: y,
6223
+ hovertemplate: '%{{x}}<br>Missing: %{{y:.1f}}%<extra></extra>'
6224
+ }}];
6225
+ var layout = {{
6226
+ margin: {{l:50, r:20, t:10, b:100}},
6227
+ yaxis: {{ title: 'Missing (%)', rangemode: 'tozero' }},
6228
+ xaxis: {{ title: 'Column', tickangle: -45 }}
6229
+ }};
6230
+ if (window.Plotly && Plotly.newPlot) {{
6231
+ Plotly.newPlot("{container_id}", data, layout, {{displayModeBar:true, responsive:true}});
6232
+ }} else {{
6233
+ var p=document.createElement('div'); p.style.color='crimson'; p.style.marginTop='8px';
6234
+ p.textContent='Plotly is not loaded.'; document.getElementById("{container_id}").appendChild(p);
6235
+ }}
6236
+ }})();
6237
+ </script>
6238
+ """
6239
+ data_cells.append({
6240
+ "title": "Missingness (Top 20)",
6241
+ "output": Markup(plot_html),
6242
+ "code": (
6243
+ "nulls = df.isnull().sum();\n"
6244
+ "nulls_pct = (\n"
6245
+ " df.isnull().mean()*100\n"
6246
+ ").round(1)\n"
6247
+ "missing_df = pd.DataFrame({\n"
6248
+ " 'Column': df.columns,\n"
6249
+ " 'Missing Values': nulls.values,\n"
6250
+ " 'Missing (%)': nulls_pct.values\n"
6251
+ "})\n\n"
6252
+ "top_miss = (\n"
6253
+ " missing_df[missing_df['Missing Values'] > 0]\n"
6254
+ " .sort_values('Missing (%)', ascending=False)\n"
6255
+ " .loc[:, ['Column', 'Missing (%)']]\n"
6256
+ " .head(20)\n"
6257
+ " .reset_index(drop=True)\n"
6258
+ ")\n"
6259
+ "top_miss"
6260
+ ),
6261
+ "span":"eda-col-4"
6262
+ })
6263
+
6264
+ # 11) Category Distribution — 3D doughnut (dataset-agnostic, capped 5k)
6265
+ try:
6266
+ # 1) Column universe: object / category / bool (integers remain numeric)
6267
+ cat_cols_all = df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
6268
+
6269
+ # 2) Honour user pick if categorical; otherwise auto-pick a sensible default
6270
+ dist_param = (request.args.get("dist") or request.form.get("dist") or "").strip()
6271
+ if dist_param and dist_param in cat_cols_all:
6272
+ dist_col = dist_param
6273
+ else:
6274
+ # Auto-pick preference: 3–20 unique values excluding obvious ID-like;
6275
+ # else allow 2-level; else first categorical.
6276
+ n_total = len(df)
6277
+ uniques_loc = df.nunique(dropna=True)
6278
+ miss_pct_loc = (df.isnull().mean() * 100).round(1)
6279
+ id_like_loc = {c for c in cat_cols_all if n_total > 0 and (uniques_loc.get(c, 0) / n_total) >= 0.95}
6280
+
6281
+ multilevel = [c for c in cat_cols_all
6282
+ if (3 <= int(uniques_loc.get(c, df[c].nunique(dropna=True))) <= 20)
6283
+ and (c not in id_like_loc)]
6284
+ if multilevel:
6285
+ # score nearer 8 levels and lower missingness
6286
+ best, best_score = "", -1e9
6287
+ for c in multilevel:
6288
+ k = int(uniques_loc.get(c, df[c].nunique(dropna=True)))
6289
+ miss = float(miss_pct_loc.get(c, (df[c].isna().mean() * 100)))
6290
+ score = -abs(k - 8) - (miss / 10.0)
6291
+ if score > best_score:
6292
+ best, best_score = c, score
6293
+ dist_col = best
6294
+ else:
6295
+ twolevel = [c for c in cat_cols_all if int(uniques_loc.get(c, df[c].nunique(dropna=True))) == 2]
6296
+ dist_col = (twolevel[0] if twolevel else (cat_cols_all[0] if cat_cols_all else ""))
6297
+
6298
+ # 3) Build options AFTER dist_col is final (so selection sticks)
6299
+ opts = []
6300
+ for c in cat_cols_all:
6301
+ sel = " selected" if c == dist_col else ""
6302
+ opts.append(f'<option value="{html.escape(str(c))}"{sel}>{html.escape(str(c))}</option>')
6303
+ opts_html = "\n".join(opts)
6304
+
6305
+ form_html = f"""
6306
+ <a id="dist3d"></a>
6307
+ <form method="get" action="/dashboard#dist3d"
6308
+ style="display:flex; flex-wrap:wrap; gap:10px; align-items:center; margin-bottom:8px;">
6309
+ <input type="hidden" name="section" value="explore">
6310
+ <input type="hidden" name="dataset" value="{html.escape(str(selected_dataset or ''))}">
6311
+ <label><strong>Distribution column:</strong></label>
6312
+ <select name="dist" onchange="this.form.submit()" style="min-width:200px; height:28px;">
6313
+ {opts_html}
6314
+ </select>
6315
+ </form>
6316
+ """
6317
+
6318
+ if dist_col:
6319
+ s = df[dist_col]
6320
+ # cap cheap counting to 5k
6321
+ if len(s) > 5000:
6322
+ s = s.sample(5000, random_state=0)
6323
+
6324
+ # 4) Robust counting: treat NaN as "Missing", stringify labels for safety
6325
+ s = s.astype("object")
6326
+ s = s.where(~s.isna(), other="Missing")
6327
+ vc = s.value_counts(dropna=False)
6328
+
6329
+ if vc.empty:
6330
+ raise ValueError("No values to display for the selected column.")
6331
+
6332
+ # Top-8 + 'Other' (excluding 'Missing' which we keep separate)
6333
+ top_k = 8
6334
+ non_missing = vc.drop(index=["Missing"], errors="ignore") if "Missing" in vc.index else vc
6335
+ head = non_missing.sort_values(ascending=False).head(top_k)
6336
+ other = int(non_missing.iloc[top_k:].sum()) if len(non_missing) > top_k else 0
6337
+ miss = int(vc.get("Missing", 0))
6338
+
6339
+ labels = [str(x) for x in head.index.tolist()]
6340
+ values = [int(v) for v in head.values.tolist()]
6341
+ if other > 0:
6342
+ labels.append("Other"); values.append(other)
6343
+ if miss > 0:
6344
+ labels.append("Missing"); values.append(miss)
6345
+
6346
+ # colours for faux 3D (no external deps)
6347
+ k = len(labels)
6348
+ def _hsl(i, n, l=0.58, s=0.62):
6349
+ h = (i / max(1, n)) * 360.0
6350
+ return f"hsl({int(h)}, {int(s*100)}%, {int(l*100)}%)"
6351
+ top_colors = [_hsl(i, k, l=0.58) for i in range(k)]
6352
+ base_colors = [_hsl(i, k, l=0.40) for i in range(k)]
6353
+
6354
+ container_id = f"dist3d_{uuid.uuid4().hex}"
6355
+ total = int(sum(values))
6356
+
6357
+ plot_html = f"""
6358
+ <div id="{container_id}" class="dist3d-chart"></div>
6359
+ <script>
6360
+ (function(){{
6361
+ var el = document.getElementById("{container_id}");
6362
+ var labels = {json.dumps(labels)};
6363
+ var values = {json.dumps(values)};
6364
+ var total = {total};
6365
+
6366
+ var base = {{
6367
+ type: 'pie', labels: labels, values: values,
6368
+ hole: 0.64, sort: false, textinfo: 'none', hoverinfo: 'skip',
6369
+ marker: {{ colors: {json.dumps(base_colors)} }},
6370
+ showlegend: false
6371
+ }};
6372
+ var top = {{
6373
+ type: 'pie', labels: labels, values: values,
6374
+ hole: 0.52, sort: false,
6375
+ textinfo: 'percent', textposition: 'inside', insidetextorientation: 'radial',
6376
+ hovertemplate: '%{{label}}<br>%{{value}} of {total:,} (%{{percent}})<extra></extra>',
6377
+ marker: {{ colors: {json.dumps(top_colors)}, line: {{ width: 1, color: 'rgba(0,0,0,0.25)' }} }},
6378
+ showlegend: true, legendgroup: 'dist'
6379
+ }};
6380
+
6381
+ function parentWidth(){{
6382
+ return (el && el.parentElement ? el.parentElement.clientWidth : (window.innerWidth||360));
6383
+ }}
6384
+
6385
+ // Smooth, monotonic: height = 0.65 * width, clamped [220, 520].
6386
+ function chartHeight(){{
6387
+ var w = parentWidth();
6388
+ return Math.round(Math.max(220, Math.min(520, w * 0.65)));
6389
+ }}
6390
+
6391
+ function legendOrientation(){{
6392
+ return parentWidth() < 640 ? 'h' : 'v';
6393
+ }}
6394
+
6395
+ function makeLayout(){{
6396
+ return {{
6397
+ margin: {{ l:10, r:10, t:10, b:10 }},
6398
+ legend: {{ orientation: legendOrientation(), x:1, xanchor:'right', y:1 }},
6399
+ uniformtext: {{ mode: 'hide', minsize: 10 }}
6400
+ }};
6401
+ }}
6402
+
6403
+ function applySize(){{
6404
+ // Override global .plotly-graph-div {{ height:auto !important }}
6405
+ el.style.setProperty('height', chartHeight() + 'px', 'important');
6406
+ if (window.Plotly) {{
6407
+ Plotly.relayout(el, {{ 'legend.orientation': legendOrientation() }});
6408
+ Plotly.Plots.resize(el);
6409
+ }}
6410
+ }}
6411
+
6412
+ if (window.Plotly && Plotly.newPlot) {{
6413
+ // Initial explicit height before draw
6414
+ el.style.setProperty('height', chartHeight() + 'px', 'important');
6415
+ Plotly.newPlot(el, [base, top], makeLayout(), {{ displayModeBar:true, responsive:true }})
6416
+ .then(function(){{ applySize(); }});
6417
+ window.addEventListener('resize', applySize);
6418
+ }} else {{
6419
+ var p=document.createElement('div'); p.style.color='crimson'; p.style.marginTop='8px';
6420
+ p.textContent='Plotly is not loaded.'; el.appendChild(p);
6421
+ }}
6422
+ }})();
6423
+ </script>
6424
+ """
6425
+
6426
+ data_cells.append({
6427
+ "title": f"Category Distribution — ({html.escape(dist_col)})",
6428
+ "output": Markup(form_html + plot_html),
6429
+ "code": (
6430
+ "dist_col = '<chosen categorical>'\n"
6431
+ "s = df[dist_col].astype('object').where(~df[dist_col].isna(), other='Missing')\n"
6432
+ "vc = s.value_counts(dropna=False)\n"
6433
+ "top_k = 8 # Top-8 + Other (+ Missing)\n"
6434
+ ),
6435
+ "span": "eda-col-4"
6436
+ })
6437
+ else:
6438
+ data_cells.append({
6439
+ "title": "Category Distribution — 3D doughnut",
6440
+ "output": "<em>No categorical columns found.</em>",
6441
+ "code": "# no categorical columns",
6442
+ "span": "eda-col-4"
6443
+ })
6444
+ except Exception as _e:
6445
+ data_cells.append({
6446
+ "title": "Category Distribution — 3D doughnut",
6447
+ "output": f"<em>Could not render distribution: {html.escape(str(_e))}</em>",
6448
+ "code": "# error during distribution rendering",
6449
+ "span": "eda-col-4"
6450
+ })
6451
+
5655
6452
  for cell in data_cells:
5656
- cell["highlighted_code"] = Markup(_pygmentize(cell["code"]))
6453
+ cell["highlighted_code"] = Markup(_pygmentize(cell["code"]))
6454
+
5657
6455
  highlighted_ai_code = _pygmentize(ai_code)
6456
+
5658
6457
  return render_template(
5659
6458
  "dashboard.html",
5660
6459
  section=section,
5661
6460
  datasets=datasets,
5662
6461
  selected_dataset=selected_dataset,
5663
6462
  ai_outputs=ai_outputs,
5664
- ai_code=ai_code, # AI-generated code for toggle
6463
+ ai_code=ai_code,
5665
6464
  highlighted_ai_code=highlighted_ai_code if ai_code else None,
5666
6465
  askai_question=smx.sanitize_rough_to_markdown_task(askai_question),
5667
- refined_question=refined_question, # Refined question
6466
+ refined_question=refined_question,
6467
+ tasks=tags,
5668
6468
  data_cells=data_cells,
5669
6469
  session_id=session_id,
6470
+ llm_usage=llm_usage
5670
6471
  )
5671
6472
 
5672
6473
 
@@ -5677,7 +6478,7 @@ def setup_routes(smx):
5677
6478
  if not html_doc:
5678
6479
  return ("No result available.", 404)
5679
6480
 
5680
- buf = io.BytesIO(html_doc.encode("utf-8"))
6481
+ buf = _std_io.BytesIO(html_doc.encode("utf-8"))
5681
6482
  buf.seek(0)
5682
6483
 
5683
6484
  # keep a copy if you wish, or free it:
@@ -5749,7 +6550,7 @@ def setup_routes(smx):
5749
6550
  text = re.sub(r"<[^>]+>", " ", text)
5750
6551
  text = re.sub(r"\n{3,}", "\n\n", text)
5751
6552
  text = html.unescape(text).strip()
5752
- buf = io.BytesIO()
6553
+ buf = _std_io.BytesIO()
5753
6554
  doc = SimpleDocTemplate(buf, pagesize=A4, leftMargin=16*mm, rightMargin=16*mm, topMargin=16*mm, bottomMargin=16*mm)
5754
6555
  styles = getSampleStyleSheet()
5755
6556
  flow = []