syntaxmatrix 2.3.5__py3-none-any.whl → 2.5.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
syntaxmatrix/routes.py CHANGED
@@ -1,18 +1,28 @@
1
- from prompt_toolkit import HTML
2
- from scipy import io
3
- import os, zipfile, time, uuid, werkzeug, queue, html, ast, re, threading, textwrap, json, pandas as pd
4
- from PyPDF2.errors import EmptyFileError
5
- import io
1
+ import os, zipfile, time, uuid, werkzeug, queue, html, ast, re
2
+ import threading, textwrap, json, pandas as pd
3
+ import contextlib
4
+
5
+ import io as _std_io
6
+
6
7
  from io import BytesIO
8
+ from scipy import io
9
+ from flask import Blueprint, Response, request, send_file, session
10
+ from flask import render_template, render_template_string, url_for, redirect, g
11
+ from flask import flash, jsonify, send_from_directory, get_flashed_messages, stream_with_context
12
+
13
+ from flask_login import current_user
14
+
7
15
  from PyPDF2 import PdfReader
8
16
  from markupsafe import Markup
9
- from urllib.parse import quote
17
+ from urllib.parse import quote
18
+ from datetime import datetime
19
+ from prompt_toolkit import HTML
20
+ from PyPDF2.errors import EmptyFileError
21
+ import numpy as np
10
22
  from .auth import register_user, authenticate, login_required, admin_required, superadmin_required
11
- from flask import Blueprint, Response, request, send_file, session, render_template, render_template_string, redirect, url_for, flash, jsonify, send_from_directory, get_flashed_messages, stream_with_context
12
23
 
13
24
  from syntaxmatrix.themes import DEFAULT_THEMES
14
25
  from syntaxmatrix import db
15
- from syntaxmatrix.utils import *
16
26
  from syntaxmatrix.vector_db import add_pdf_chunk
17
27
  from syntaxmatrix.file_processor import *
18
28
  from syntaxmatrix.vectorizer import embed_text
@@ -22,16 +32,13 @@ from syntaxmatrix.history_store import SQLHistoryStore, PersistentHistoryStore
22
32
  from syntaxmatrix.kernel_manager import SyntaxMatrixKernelManager, execute_code_in_kernel
23
33
  from syntaxmatrix.vector_db import *
24
34
  from syntaxmatrix.settings.string_navbar import string_navbar_items
25
- from syntaxmatrix.settings.model_map import PROVIDERS_MODELS, MODEL_DESCRIPTIONS, PURPOSE_TAGS, EMBEDDING_MODELS
26
- from .project_root import detect_project_root
27
- from . import profiles as _prof
28
- from . import generate_page as _genpage
29
- from . import profiles as _prof
30
- from . import auth as _auth
35
+ from syntaxmatrix.settings.model_map import GPT_MODELS_LATEST, PROVIDERS_MODELS, MODEL_DESCRIPTIONS, PURPOSE_TAGS, EMBEDDING_MODELS
36
+ from syntaxmatrix.project_root import detect_project_root
37
+ from syntaxmatrix import generate_page as _genpage
38
+ from syntaxmatrix import auth as _auth
31
39
  from syntaxmatrix import profiles as _prof
32
40
  from syntaxmatrix.gpt_models_latest import set_args, extract_output_text as _out
33
- from datetime import datetime
34
- import contextlib
41
+ from syntaxmatrix.agentic.agents import classify_ml_job_agent, refine_question_agent, text_formatter_agent
35
42
 
36
43
  try:
37
44
  from pygments import highlight as _hl
@@ -41,10 +48,16 @@ try:
41
48
  except Exception:
42
49
  _HAVE_PYGMENTS = False
43
50
 
44
- from flask_login import current_user
45
- from flask import g
51
+ # from syntaxmatrix.utils import *
52
+ from syntaxmatrix.utils import (
53
+ auto_inject_template, drop_bad_classification_metrics, ensure_accuracy_block,
54
+ ensure_image_output, ensure_output, fix_plain_prints, fix_print_html, patch_fix_sentinel_plot_calls,
55
+ patch_pairplot, fix_to_datetime_errors, harden_ai_code, patch_ensure_seaborn_import, get_plotting_imports,
56
+ patch_fix_seaborn_palette_calls, patch_quiet_specific_warnings, fix_seaborn_barplot_nameerror, fix_seaborn_boxplot_nameerror, ensure_matplotlib_title, patch_plot_code, patch_prefix_seaborn_calls, fix_scatter_and_summary, inject_auto_preprocessing, fix_importance_groupby, patch_pie_chart, patch_rmse_calls, clean_llm_code
57
+ )
46
58
 
47
- # app = Flask(__name__)
59
+ from syntaxmatrix.agentic.agent_tools import ToolRunner
60
+ from syntaxmatrix.agentic.code_tools_registry import EARLY_SANITIZERS, SYNTAX_AND_REPAIR
48
61
 
49
62
  _CLIENT_DIR = detect_project_root()
50
63
  _stream_q = queue.Queue()
@@ -192,7 +205,6 @@ def setup_routes(smx):
192
205
  )
193
206
  return resp
194
207
 
195
-
196
208
  def head_html():
197
209
  # Determine a contrasting mobile text color based on the sidebar background.
198
210
  mobile_text_color = smx.theme["nav_text"]
@@ -625,8 +637,8 @@ def setup_routes(smx):
625
637
 
626
638
  desktop_nav = f"""
627
639
  <div class="nav-left">
628
- <a class="logo" href="/" style="margin:0; padding:0;">{smx.site_logo}</a>
629
- <a class="logo" href="/" style="text-decoration:none; vertical-align="middle; margin:0 24px 0 0; padding:0px;">{smx.site_title}</a>
640
+ <a class="logo" href="/">{smx.site_logo}</a>
641
+ <a class="logo" href="/" style="text-decoration:none; margin:0 24px 0 0; padding:0px; vertical-align:middle;">{smx.site_title}</a>
630
642
  <div class="nav-links" style="margin-left:24px;">
631
643
  {nav_links}
632
644
  </div>
@@ -3769,10 +3781,13 @@ def setup_routes(smx):
3769
3781
 
3770
3782
 
3771
3783
  # if any live cached profile on smx matches this name, clear it
3772
- for attr in ("_chat_profile", "_admin_profile", "_coding_profile", "_classification_profile", "_summarization_profile"):
3784
+ db_profiles = prof.get_profiles()
3785
+ # for attr in ("_chat_profile", "_admin_profile", "_coding_profile", "_classification_profile", "_summarization_profile", "_vision2text_profile"):
3786
+ for attr in ([db_profiles]):
3773
3787
  prof = getattr(smx, attr, None)
3774
3788
  if isinstance(prof, dict) and prof.get("name") == name:
3775
3789
  setattr(smx, attr, {})
3790
+ prof.refresh_profiles_cache()
3776
3791
 
3777
3792
  elif action == "add_model":
3778
3793
  prov = request.form.get("catalog_provider","").strip()
@@ -3944,7 +3959,7 @@ def setup_routes(smx):
3944
3959
  <label for="catalog_model">Model</label>
3945
3960
  <select id="catalog_model" name="catalog_model" required></select>
3946
3961
 
3947
- <label for="catalog_purpose">Purpose</label>
3962
+ <label for="catalog_purpose">Agency</label>
3948
3963
  <select id="catalog_purpose" name="catalog_purpose" required></select>
3949
3964
 
3950
3965
  <label class="form-label mb-1" style="display:block; position:relative;">
@@ -4066,7 +4081,7 @@ def setup_routes(smx):
4066
4081
 
4067
4082
  models_catalog_list_card = f"""
4068
4083
  <div class="card span-4">
4069
- <h4>Models Catalog</h4>
4084
+ <h4>Models Catalogue</h4>
4070
4085
  <ul class="catalog-list">
4071
4086
  {cat_items or "<li class='li-row'>No models yet.</li>"}
4072
4087
  </ul>
@@ -4080,15 +4095,15 @@ def setup_routes(smx):
4080
4095
  <div class='card span-4'>
4081
4096
  <h4>Setup Profiles</h4>
4082
4097
  <form method="post" style="margin-bottom:0.5rem;">
4083
- <label for="profile_name" class="form-label mb-1">
4084
- Confirm purpose
4098
+ <label for="profile_name" class="form-label mb-1" style="margin-bottom:12px;">
4099
+ Confirm Agency
4085
4100
  <button id="name-help" type="button" class="info-btn btn-link p-0 text-muted"
4086
4101
  style="font-size:0.8rem; line-height:1; padding:2px; display:inline-block;"
4087
4102
  aria-haspopup="true" aria-expanded="false"
4088
- title="Click to see naming suggestions">ⓘ</button>
4103
+ title="Click to see agencies">ⓘ</button>
4089
4104
  </label>
4090
4105
  <input id="profile_name" name="profile_name" type="text" class="form-control"
4091
- placeholder="Purpose" required>
4106
+ placeholder="Agency" required>
4092
4107
 
4093
4108
  <div id="name-suggestions" role="tooltip"
4094
4109
  class="suggestion-popover card shadow-sm p-2"
@@ -4175,9 +4190,9 @@ def setup_routes(smx):
4175
4190
 
4176
4191
  manage_sys_files_card = f"""
4177
4192
  <div class='card span-6'>
4178
- <h4>Manage System Files</h4>
4193
+ <h4>Manage Company Files</h4>
4179
4194
  <ul class="catalog-list" style="list-style:none; padding-left:0; margin:0;">
4180
- {sys_files_html or "<li>No system file has been uploaded yet.</li>"}
4195
+ {sys_files_html or "<li>No company file has been uploaded yet.</li>"}
4181
4196
  </ul>
4182
4197
  </div>
4183
4198
  """
@@ -5114,7 +5129,7 @@ def setup_routes(smx):
5114
5129
  rows = _auth.list_role_audit(limit=limit)
5115
5130
 
5116
5131
  import io, csv, datetime
5117
- buf = io.StringIO()
5132
+ buf = _std_io.StringIO()
5118
5133
  writer = csv.writer(buf)
5119
5134
  writer.writerow(["timestamp", "actor", "target", "from_role", "to_role"])
5120
5135
  for r in rows:
@@ -5375,25 +5390,28 @@ def setup_routes(smx):
5375
5390
  # ────────────────────────────────────────────────────────────────────────────────────────
5376
5391
  # DASHBOARD
5377
5392
  # ────────────────────────────────────────────────────────────────────────────────────────
5378
- # ── DASHBOARD VIEW DETAILS -----------------------------
5379
5393
  @smx.app.route("/dashboard", methods=["GET", "POST"])
5380
5394
  # @login_required
5381
5395
  def dashboard():
5382
5396
  DATA_FOLDER = os.path.join(_CLIENT_DIR, "uploads", "data")
5383
5397
  os.makedirs(DATA_FOLDER, exist_ok=True)
5384
-
5385
- ####################################################################
5386
5398
 
5387
- _CELL_REPAIR_RULES = """
5388
- Fix the Python cell to satisfy:
5389
- - Single valid cell; imports at the top.
5390
- - No top-level statements between if/elif/else branches.
5391
- - Regression must use either sklearn with train_test_split (then X_test exists) and R^2/MAE/RMSE, or statsmodels OLS. No accuracy_score in regression.
5392
- - Keep all plotting + savefig + BytesIO + display inside the branch that created the figure.
5393
- Return ONLY the corrected cell.
5394
- """
5395
-
5399
+ max_rows = 5000
5400
+ max_cols = 80
5401
+
5396
5402
  def _smx_repair_python_cell(py_code: str) -> str:
5403
+
5404
+ _CELL_REPAIR_RULES = """
5405
+ You are an experienced Python code reviewer
5406
+ Fix the Python cell to satisfy:
5407
+ - Single valid cell; imports at the top.
5408
+ - Do not import or invoke or use 'python-dotenv' or 'dotenv' because it's not needed.
5409
+ - No top-level statements between if/elif/else branches.
5410
+ - Regression must use either sklearn with train_test_split (then X_test exists) and R^2/MAE/RMSE,
5411
+ or statsmodels OLS. No accuracy_score in regression.
5412
+ - Keep all plotting + savefig + BytesIO + display inside the branch that created the figure.
5413
+ - Return ONLY the corrected cell.
5414
+ """
5397
5415
  code = textwrap.dedent(py_code or "").strip()
5398
5416
  needs_fix = False
5399
5417
  if re.search(r"\baccuracy_score\b", code) and re.search(r"\bLinearRegression\b|\bOLS\b", code):
@@ -5406,59 +5424,84 @@ def setup_routes(smx):
5406
5424
  needs_fix = True
5407
5425
  if not needs_fix:
5408
5426
  return code
5409
- prof = _prof.get_profile("coding") or _prof.get_profile("admin")
5427
+
5428
+ _prompt = f"```python\n{code}\n```"
5429
+
5430
+ prof = _prof.get_profile("classification") or _prof.get_profile("admin")
5410
5431
  if not prof:
5411
5432
  return code
5412
-
5413
- _prompt = f"```python\n{code}\n```"
5414
- _client = _prof.get_client(prof)
5433
+
5434
+ prof["client"] = _prof.get_client(prof)
5435
+ _client = prof["client"]
5415
5436
  _model = prof["model"]
5416
-
5417
- if prof['provider'] == "google":
5418
- fixed = _out(_client.models.generate_content(
5419
- model=_model,
5420
- contents=f"{_CELL_REPAIR_RULES}\n\n{_prompt}",
5421
- )
5422
- ).strip()
5423
-
5424
- elif prof["provider"] == "openai" and _model in smx.gpt_models_latest():
5425
- args = set_args(model=prof.get("model"), instructions=_CELL_REPAIR_RULES,
5426
- input=_prompt, previous_id=None, store=False,
5427
- reasoning_effort="minimal", verbosity="low")
5428
- fixed = _out(_client.responses.create(**args)).strip()
5429
-
5430
- elif prof["provider"] == "anthropic":
5431
- fixed = _out(_client.messages.create(
5432
- model=_model,
5433
- max_tokens=1024,
5434
- system=_CELL_REPAIR_RULES,
5435
- messages=[{"role": "user", "content":_prompt}]
5436
- )).strip()
5437
-
5438
- else:
5439
- fixed = _out(_client.chat.completions.create(
5440
- model=_model,
5441
- messages=[
5442
- {"role": "system", "content":_CELL_REPAIR_RULES},
5443
- {"role": "user", "content":_prompt},
5444
- ]
5445
- )
5446
- ).strip()
5437
+ _provider = prof["provider"].lower()
5438
+
5439
+ #1 Google
5440
+ if _provider == "google":
5441
+ from google.genai import types
5447
5442
 
5443
+ fixed = _client.models.generate_content(
5444
+ model=_model,
5445
+ contents=_prompt,
5446
+ config=types.GenerateContentConfig(
5447
+ system_instruction=_CELL_REPAIR_RULES,
5448
+ temperature=0.8,
5449
+ max_output_tokens=1024,
5450
+ ),
5451
+ )
5452
+
5453
+ #2 Openai
5454
+ elif _provider == "openai" and _model in GPT_MODELS_LATEST:
5455
+
5456
+ args = set_args(
5457
+ model=_model,
5458
+ instructions=_CELL_REPAIR_RULES,
5459
+ input=[{"role": "user", "content": _prompt}],
5460
+ previous_id=None,
5461
+ store=False,
5462
+ reasoning_effort="medium",
5463
+ verbosity="medium",
5464
+ )
5465
+ fixed = _out(_client.responses.create(**args))
5466
+
5467
+ # Anthropic
5468
+ elif _provider == "anthropic":
5469
+
5470
+ fixed = _client.messages.create(
5471
+ model=_model,
5472
+ max_tokens=1024,
5473
+ system=_CELL_REPAIR_RULES,
5474
+ messages=[{"role": "user", "content":_prompt}],
5475
+ stream=False,
5476
+ )
5477
+
5478
+ # OpenAI SDK
5479
+ else:
5480
+ fixed = _client.chat.completions.create(
5481
+ model=_model,
5482
+ messages=[
5483
+ {"role": "system", "content":_CELL_REPAIR_RULES},
5484
+ {"role": "user", "content":_prompt},
5485
+ ],
5486
+ max_tokens=1024,
5487
+ )
5488
+
5489
+ fixed_txt = clean_llm_code(fixed)
5490
+
5448
5491
  try:
5449
- ast.parse(fixed); return fixed
5450
- except SyntaxError:
5492
+ # Only accept the repaired cell if it's valid Python
5493
+ ast.parse(fixed_txt)
5494
+ return fixed_txt
5495
+ except Exception:
5496
+ # If the repaired version is still broken, fall back to the original code
5451
5497
  return code
5452
- ################################################################
5453
-
5498
+
5454
5499
  section = request.args.get("section", "explore")
5455
5500
  datasets = [f for f in os.listdir(DATA_FOLDER) if f.lower().endswith(".csv")]
5456
5501
  selected_dataset = request.form.get("dataset") or request.args.get("dataset")
5457
5502
  if not selected_dataset and datasets:
5458
5503
  selected_dataset = datasets[0]
5459
5504
 
5460
- # selected_dataset = selected_dataset or ""
5461
-
5462
5505
  # Handle file upload
5463
5506
  if request.method == "POST" and "dataset_file" in request.files:
5464
5507
  f = request.files["dataset_file"]
@@ -5470,7 +5513,7 @@ def setup_routes(smx):
5470
5513
 
5471
5514
  # Load dataframe if available
5472
5515
  df = pd.read_csv(os.path.join(DATA_FOLDER, selected_dataset)) if selected_dataset else None
5473
-
5516
+
5474
5517
  # --- Jupyter kernel management ---
5475
5518
  session_id = session.get('smx_kernel_id')
5476
5519
  if not session_id:
@@ -5481,38 +5524,84 @@ def setup_routes(smx):
5481
5524
 
5482
5525
  # --- Handle Ask AI ---
5483
5526
  ai_outputs = []
5527
+ dl_html = ""
5484
5528
  askai_question = ""
5485
- refined_question = None
5529
+ refined_question = ""
5530
+ tags = []
5486
5531
  ai_code = None
5532
+ eda_df = df
5533
+ llm_usage = None
5487
5534
 
5488
5535
  if request.method == "POST" and "askai_question" in request.form:
5489
5536
  askai_question = request.form["askai_question"].strip()
5490
- if df is not None:
5537
+ if df is not None:
5538
+ CLEANED_FOLDER = str(selected_dataset).split(".")[0] + "_preprocessed"
5539
+ cleaned_path = os.path.join(DATA_FOLDER, CLEANED_FOLDER, "cleaned_df.csv")
5540
+ if os.path.exists(cleaned_path):
5541
+ df = pd.read_csv(cleaned_path, low_memory=False)
5542
+ else:
5543
+ from syntaxmatrix.dataset_preprocessing import ensure_cleaned_df
5544
+ df = ensure_cleaned_df(DATA_FOLDER, CLEANED_FOLDER, df) # writes cleaned_df.csv
5545
+
5546
+ # Build lightweight context
5547
+ columns_summary = ", ".join(df.columns.tolist())
5548
+ dataset_context = f"columns: {columns_summary}"
5549
+ dataset_profile = f"modality: tabular; columns: {columns_summary}"
5550
+
5551
+ refined_question = refine_question_agent(askai_question, dataset_context)
5552
+ tags = classify_ml_job_agent(refined_question, dataset_profile)
5553
+
5554
+ ai_code = smx.ai_generate_code(refined_question, tags, df)
5555
+ llm_usage = smx.get_last_llm_usage()
5556
+ ai_code = auto_inject_template(ai_code, tags, df)
5557
+
5558
+ # --- 1) Strip dotenv ASAP (kill imports, %magics, !pip) ---
5559
+ ctx = {
5560
+ "question": refined_question,
5561
+ "df_columns": list(df.columns),
5562
+ }
5563
+ ai_code = ToolRunner(EARLY_SANITIZERS).run(ai_code, ctx) # dotenv first
5491
5564
 
5492
- refined_question = refine_eda_question(askai_question, df)
5493
- intent = classify(refined_question)
5494
- ai_code = smx.ai_generate_code(refined_question, intent, df)
5495
- ai_code = auto_inject_template(ai_code, intent, df)
5565
+ # --- 2) Domain/Plotting patches ---
5496
5566
  ai_code = fix_scatter_and_summary(ai_code)
5497
5567
  ai_code = fix_importance_groupby(ai_code)
5498
5568
  ai_code = inject_auto_preprocessing(ai_code)
5499
5569
  ai_code = patch_plot_code(ai_code, df, refined_question)
5570
+ ai_code = ensure_matplotlib_title(ai_code)
5571
+ ai_code = patch_pie_chart(ai_code, df, refined_question)
5500
5572
  ai_code = patch_pairplot(ai_code, df)
5573
+ ai_code = fix_seaborn_boxplot_nameerror(ai_code)
5574
+ ai_code = fix_seaborn_barplot_nameerror(ai_code)
5501
5575
  ai_code = get_plotting_imports(ai_code)
5502
- ai_code = ensure_image_output(ai_code)
5503
- ai_code = fix_numeric_sum(ai_code)
5576
+ ai_code = patch_prefix_seaborn_calls(ai_code)
5577
+ ai_code = patch_fix_sentinel_plot_calls(ai_code)
5578
+ ai_code = patch_ensure_seaborn_import(ai_code)
5579
+ ai_code = patch_rmse_calls(ai_code)
5580
+ ai_code = patch_fix_seaborn_palette_calls(ai_code)
5581
+ ai_code = patch_quiet_specific_warnings(ai_code)
5582
+ ai_code = clean_llm_code(ai_code)
5583
+ ai_code = ensure_image_output(ai_code)
5504
5584
  ai_code = ensure_accuracy_block(ai_code)
5505
5585
  ai_code = ensure_output(ai_code)
5506
5586
  ai_code = fix_plain_prints(ai_code)
5507
- ai_code = fix_print_html(ai_code)
5587
+ ai_code = fix_print_html(ai_code)
5508
5588
  ai_code = fix_to_datetime_errors(ai_code)
5589
+
5590
+ # --- 3-4) Global syntax/data fixers (must run AFTER patches, BEFORE final repair) ---
5591
+ ai_code = ToolRunner(SYNTAX_AND_REPAIR).run(ai_code, ctx)
5592
+
5593
+ # # --- 4) Final catch-all repair (run LAST) ---
5509
5594
  ai_code = _smx_repair_python_cell(ai_code)
5595
+ ai_code = harden_ai_code(ai_code)
5596
+ ai_code = drop_bad_classification_metrics(ai_code, df)
5597
+ ai_code = patch_fix_sentinel_plot_calls(ai_code)
5510
5598
 
5511
- # Always make sure 'df' is in the kernel before running user code
5599
+ # Always make sure 'df' is in the kernel before running user code
5512
5600
  df_init_code = (
5513
5601
  f"import pandas as pd\n"
5514
- f"df = pd.read_csv(r'''{os.path.join(DATA_FOLDER, selected_dataset)}''')"
5602
+ f"df = pd.read_csv(r'''{os.path.join(cleaned_path)}''')"
5515
5603
  )
5604
+
5516
5605
  execute_code_in_kernel(kc, df_init_code)
5517
5606
 
5518
5607
  outputs, errors = execute_code_in_kernel(kc, ai_code)
@@ -5525,7 +5614,6 @@ def setup_routes(smx):
5525
5614
  build_display_summary, phrase_commentary_vision, wrap_html
5526
5615
  )
5527
5616
 
5528
-
5529
5617
  # Probe axes/labels/legend
5530
5618
  probe1_out, probe1_err = execute_code_in_kernel(kc, MPL_PROBE_SNIPPET)
5531
5619
  axes_info = parse_mpl_probe_output([str(x) for x in (probe1_out + probe1_err)])
@@ -5542,17 +5630,17 @@ def setup_routes(smx):
5542
5630
  ################################################################
5543
5631
 
5544
5632
  # ----- Build a single HTML with Result + Commentary + AI Code ----------
5545
- _buf_out, _buf_err = io.StringIO(), io.StringIO()
5633
+ _buf_out, _buf_err = _std_io.StringIO(), _std_io.StringIO()
5546
5634
  with contextlib.redirect_stdout(_buf_out), contextlib.redirect_stderr(_buf_err):
5547
- # 1 Exact result blocks (already cleaned by kernel_manager)
5635
+ # Exact result blocks (already cleaned by kernel_manager)
5548
5636
  result_html = rendered_html if rendered_html.strip() else "<pre>No output.</pre>"
5549
5637
 
5550
- # 2 Commentary (we already have the raw HTML via wrap_html)
5638
+ # Commentary (we already have the raw HTML via wrap_html)
5551
5639
  commentary_html = wrap_html(commentary_text)
5552
5640
 
5553
5641
  code_html = _render_code_block("AI Generated Code", ai_code)
5554
5642
 
5555
- full_body_html = "\n" + askai_question + "\n" + result_html + "\n" + commentary_html + "\n" + code_html
5643
+ full_body_html = "\n" + askai_question + "\n" + result_html + "\n" + code_html + "\n" + commentary_html
5556
5644
 
5557
5645
  html_doc = (
5558
5646
  "<!doctype html>"
@@ -5576,7 +5664,7 @@ def setup_routes(smx):
5576
5664
 
5577
5665
  _last_result_html[session_id] = html_doc
5578
5666
 
5579
- # 2.4 Append a single download button (explicit click → fetch → download)
5667
+ # Append a single download button (explicit click → fetch → download)
5580
5668
  download_url = url_for("download_result_html", session_id=session_id)
5581
5669
  dl_html = f"""
5582
5670
  <a href="{download_url}">
@@ -5589,79 +5677,797 @@ def setup_routes(smx):
5589
5677
  """
5590
5678
  ai_outputs.append(Markup(dl_html))
5591
5679
 
5592
- ################################################################
5593
-
5594
-
5595
5680
  # --- EDA/static cells ---
5681
+ # Display helper: coerce integer-like float columns to Int64 just for rendering
5682
def _coerce_intlike_for_display(df_in: pd.DataFrame, per_cell: bool = False, eps: float = 1e-9) -> pd.DataFrame:
    """Return a copy of *df_in* in which integer-like floats render as integers.

    The input frame is never modified; this is a display-only helper.

    Args:
        df_in: Frame to prepare for rendering.
        per_cell: When True, every cell is inspected individually and any value
            that converts to a finite float within *eps* of a whole number is
            replaced by a Python ``int``. When False (default), only float
            columns whose non-null values are all whole numbers are converted,
            as a whole, to the nullable ``Int64`` dtype.
        eps: Absolute tolerance used to decide that a value is "whole".

    Returns:
        A new DataFrame with the coerced values.
    """
    import numpy as np

    out = df_in.copy()

    if per_cell:
        def _maybe(v):
            # float(True) == 1.0, so without this guard booleans would be
            # rendered as 1/0 instead of True/False.
            if isinstance(v, (bool, np.bool_)):
                return v
            try:
                fv = float(v)
            except Exception:
                # Not number-like (text, None, pd.NA, ...): show unchanged.
                return v
            if pd.notnull(v) and np.isfinite(fv) and abs(fv - round(fv)) <= eps:
                return int(round(fv))
            return v

        # DataFrame.applymap is deprecated since pandas 2.1 in favour of
        # DataFrame.map. Check the class (not the instance) so a column that
        # happens to be called "map" cannot be mistaken for the method.
        if hasattr(pd.DataFrame, "map"):
            return out.map(_maybe)
        return out.applymap(_maybe)

    # Column-wise mode: convert a float column only if every non-null value is
    # a finite whole number that also fits into int64.
    int64_limit = 2.0 ** 63
    for c in out.columns:
        s = out[c]
        if not pd.api.types.is_float_dtype(s):
            continue
        vals = s.dropna().to_numpy()
        if not vals.size or not np.isfinite(vals).all():
            continue
        if np.abs(vals).max() >= int64_limit:
            # Whole-valued but too large for Int64; casting would fail.
            continue
        if np.allclose(vals, np.round(vals), rtol=0, atol=eps):
            out[c] = s.round().astype("Int64")
    return out
5703
+
5596
5704
  data_cells = []
5705
+ max_rows = 5000
5706
+ max_cols = 80
5597
5707
  if df is not None:
5598
- num_records = df.shape
5599
- ds = selected_dataset.replace("_"," ").replace(".csv","").capitalize()
5708
+ df = eda_df
5709
+ ds = (selected_dataset or "").replace("_", " ").replace(".csv", "").capitalize()
5710
+
5711
+ # 1) Dataset Overview (stat cards)
5712
+ rows, cols = df.shape
5713
+ mem_bytes = int(df.memory_usage(deep=True).sum())
5714
+ mem_mb = round(mem_bytes / (1024 * 1024), 2)
5715
+ dup_rows = int(df.duplicated().sum())
5716
+ nunique_all = df.nunique(dropna=False)
5717
+
5718
+ n = max(rows, 1)
5719
+ dtypes = df.dtypes.astype(str)
5720
+ nonnull = df.notnull().sum()
5721
+ miss_pct = (df.isnull().mean() * 100).round(1)
5722
+ uniques = df.nunique(dropna=True)
5723
+ uniq_ratio = (uniques / n).fillna(0.0)
5724
+
5725
+ id_like, hi_card, consts, flags_col = [], [], [], []
5726
+ for c in df.columns:
5727
+ flags = []
5728
+ if uniques.get(c, 0) <= 1:
5729
+ flags.append("constant"); consts.append(c)
5730
+ if uniq_ratio.get(c, 0) >= 0.95 and "datetime" not in dtypes[c].lower():
5731
+ flags.append("id-like"); id_like.append(c)
5732
+ if dtypes[c].startswith("object") and uniq_ratio.get(c, 0) > 0.5 and c not in id_like:
5733
+ flags.append("high-card"); hi_card.append(c)
5734
+ flags_col.append(", ".join(flags))
5735
+
5736
+ _stats_code = (
5737
+ "rows, cols = df.shape\n"
5738
+ "mem_bytes = int(df.memory_usage(deep=True).sum())\n"
5739
+ "mem_mb = round(mem_bytes / (1024*1024), 2)\n"
5740
+ )
5741
+
5742
+ _stats_html = f"""
5743
+ <style>
5744
+ .smx-statwrap{{display:grid;grid-template-columns:repeat(auto-fit,minmax(140px,1fr));gap:10px}}
5745
+ .smx-stat{{background:#fff;border:1px solid #e5e7eb;border-radius:10px;padding:10px 12px;text-align:center}}
5746
+ .smx-stat h4{{margin:0 0 4px;font-size:.9rem}}
5747
+ .smx-stat div{{font-weight:700;font-size:1.05rem}}
5748
+ </style>
5749
+ <div class="smx-statwrap">
5750
+ <div class="smx-stat"><h4>Rows</h4><div>{rows:,}</div></div>
5751
+ <div class="smx-stat"><h4>Columns</h4><div>{cols:,}</div></div>
5752
+ <div class="smx-stat"><h4>Memory (MB)</h4><div>{mem_mb}</div></div>
5753
+ </div>
5754
+ """
5755
+ data_cells.append({
5756
+ "title": f"{ds} Overview",
5757
+ "output": Markup(_stats_html),
5758
+ "code": _stats_code,
5759
+ "span":"eda-col-8"
5760
+ })
5761
+
5762
+ # 2) Integrity Notes — with "Show all" toggle
5763
+ notes = []
5764
+ if id_like:
5765
+ notes.append(f"ID-like columns: {', '.join(map(str, id_like[:6]))}{'…' if len(id_like)>6 else ''}")
5766
+ if hi_card:
5767
+ notes.append(f"High-cardinality categoricals: {', '.join(map(str, hi_card[:6]))}{'…' if len(hi_card)>6 else ''}")
5768
+ if consts:
5769
+ notes.append(f"Constant columns: {', '.join(map(str, consts[:6]))}{'…' if len(consts)>6 else ''}")
5770
+
5771
+ # Build full flagged table
5772
+ flag_rows = []
5773
+ for c in df.columns:
5774
+ f = []
5775
+ if c in id_like: f.append("id-like")
5776
+ if c in hi_card: f.append("high-card")
5777
+ if c in consts: f.append("constant")
5778
+ if f:
5779
+ flag_rows.append({
5780
+ "Column": c,
5781
+ "Flags": ", ".join(f),
5782
+ "Type": dtypes[c],
5783
+ "Unique Values": int(uniques.get(c, 0)),
5784
+ "Unique Ratio": float(uniq_ratio.get(c, 0)),
5785
+ "Missing (%)": float(miss_pct.get(c, 0)),
5786
+ })
5787
+ flagged_df = pd.DataFrame(flag_rows)
5788
+ flagged_df = flagged_df.sort_values(["Flags","Column"]) if not flagged_df.empty else flagged_df
5789
+
5790
+ # Render notes + toggle
5791
+ notes_html = (
5792
+ "<ul style='margin:0;padding-left:18px;'>" +
5793
+ "".join([f"<li>{n}</li>" for n in notes]) +
5794
+ "</ul>"
5795
+ ) if notes else "<em>No obvious integrity flags.</em>"
5796
+
5797
+ if not flagged_df.empty:
5798
+ table_html = datatable_box(flagged_df)
5799
+ body_html = (
5800
+ notes_html +
5801
+ f"<details style='margin-top:8px;'><summary>Show all flagged columns ({len(flagged_df)})</summary>"
5802
+ f"<div style='margin-top:8px;'>{table_html}</div></details>"
5803
+ )
5804
+ else:
5805
+ body_html = notes_html
5806
+
5600
5807
  data_cells.append({
5601
- "title": f"{ds} size",
5602
- "output": num_records,
5603
- "code": "df.shape"
5808
+ "title": "Integrity Notes",
5809
+ "output": Markup(body_html),
5810
+ "code": (
5811
+ "# Build Integrity Notes lists and full flagged table\n"
5812
+ "flag_rows = []\n"
5813
+ "for c in df.columns:\n"
5814
+ " f = []\n"
5815
+ " if c in id_like: f.append('id-like')\n"
5816
+ " if c in hi_card: f.append('high-card')\n"
5817
+ " if c in consts: f.append('constant')\n"
5818
+ " if f:\n"
5819
+ " flag_rows.append({\n"
5820
+ " 'Column': c,\n"
5821
+ " 'Flags': ', '.join(f),\n"
5822
+ " 'Type': dtypes[c],\n"
5823
+ " 'Unique Values': int(uniques.get(c,0)),\n"
5824
+ " 'Unique Ratio': float(uniq_ratio.get(c,0)),\n"
5825
+ " 'Missing (%)': float(miss_pct.get(c,0))\n"
5826
+ " })\n"
5827
+ "flagged_df = pd.DataFrame(flag_rows)\n"
5828
+ "flagged_df"
5829
+ ),
5830
+ "span":"eda-col-4"
5604
5831
  })
5605
- preview_cols = df.columns[:8]
5832
+
5833
+ # 3) Data Preview
5834
+ preview_cols = df.columns
5835
+ preview_df = _coerce_intlike_for_display(df[preview_cols].head(8))
5606
5836
  data_cells.append({
5607
5837
  "title": "Data Preview",
5608
- "output": Markup(datatable_box(df[preview_cols].head(8))),
5609
- "code": f"df[{list(preview_cols)}].head(8)"
5838
+ "output": Markup(datatable_box(preview_df)),
5839
+ "code": f"df[{list(preview_cols)}].head(8)",
5840
+ "span": "eda-col-6"
5610
5841
  })
5842
+
5843
+ # 4) Summary Statistics
5844
+ summary_cols = df.columns
5845
+ summary_df = _coerce_intlike_for_display(df[summary_cols].describe())
5611
5846
  data_cells.append({
5612
5847
  "title": "Summary Statistics",
5613
- "output": Markup(datatable_box(df.describe())),
5614
- "code": "df.describe()"
5848
+ "output": Markup(datatable_box(summary_df)),
5849
+ "code": f"df[{list(summary_cols)}].describe()",
5850
+ "span": "eda-col-6"
5851
+ })
5852
+
5853
+ # 5) Column Profile
5854
+ def _sample_vals(s, k=3):
5855
+ try:
5856
+ vals = pd.unique(s.dropna().astype(str))[:k]
5857
+ return ", ".join(map(str, vals))
5858
+ except Exception:
5859
+ return ""
5860
+
5861
+ profile_df = pd.DataFrame({
5862
+ "Column": df.columns,
5863
+ "Type": dtypes.values,
5864
+ "Non-Null Count": nonnull.values,
5865
+ "Missing (%)": miss_pct.values,
5866
+ "Unique Values": uniques.values,
5867
+ "Sample Values": [ _sample_vals(df[c]) for c in df.columns ],
5868
+ "Flags": flags_col
5869
+ })
5870
+ data_cells.append({
5871
+ "title": "Column Profile",
5872
+ "output": Markup(datatable_box(profile_df)),
5873
+ "code": (
5874
+ "dtypes = df.dtypes.astype(str)\n"
5875
+ "nonnull = df.notnull().sum()\n"
5876
+ "miss_pct = (df.isnull().mean()*100).round(1)\n"
5877
+ "uniques = df.nunique(dropna=True)\n"
5878
+ "n = max(len(df), 1)\n"
5879
+ "uniq_ratio = (uniques / n).fillna(0.0)\n"
5880
+ "def _sample_vals(s, k=3):\n"
5881
+ " vals = pd.unique(s.dropna().astype(str))[:k]\n"
5882
+ " return ', '.join(map(str, vals)) if len(vals) else ''\n"
5883
+ "flags_col = []\n"
5884
+ "for c in df.columns:\n"
5885
+ " flags=[]\n"
5886
+ " if uniques.get(c,0) <= 1: flags.append('constant')\n"
5887
+ " if uniq_ratio.get(c,0) >= 0.95 and 'datetime' not in dtypes[c].lower(): flags.append('id-like')\n"
5888
+ " if dtypes[c].startswith('object') and uniq_ratio.get(c,0) > 0.5 and 'id-like' not in flags: flags.append('high-card')\n"
5889
+ " flags_col.append(', '.join(flags))\n"
5890
+ "profile_df = pd.DataFrame({\n"
5891
+ " 'Column': df.columns,\n"
5892
+ " 'Type': dtypes.values,\n"
5893
+ " 'Non-Null Count': nonnull.values,\n"
5894
+ " 'Missing (%)': miss_pct.values,\n"
5895
+ " 'Unique Values': uniques.values,\n"
5896
+ " 'Sample Values': [ _sample_vals(df[c]) for c in df.columns ],\n"
5897
+ " 'Flags': flags_col\n"
5898
+ "})\n"
5899
+ "profile_df"
5900
+ ),
5901
+ "span":"eda-col-6"
5615
5902
  })
5903
+
5904
+ # 6) Column Types
5905
+ dtype_df = pd.DataFrame({
5906
+ "Column": df.columns,
5907
+ "Type": df.dtypes.astype(str).values,
5908
+ "Non-Null Count": df.notnull().sum().values,
5909
+ "Unique Values": df.nunique().values
5910
+ })
5911
+ data_cells.append({
5912
+ "title": "Column Types",
5913
+ "output": Markup(datatable_box(dtype_df)),
5914
+ "code": (
5915
+ "pd.DataFrame({\n"
5916
+ " 'Column': df.columns,\n"
5917
+ " 'Type': df.dtypes.astype(str).values,\n"
5918
+ " 'Non-Null Count': df.notnull().sum().values,\n"
5919
+ " 'Unique Values': df.nunique().values\n"
5920
+ "})"
5921
+ ),
5922
+ "span":"eda-col-6"
5923
+ })
5924
+
5925
+ # 7) Outliers — Top 3 records (robust MAD score, capped 5k×80)
5926
+ try:
5927
+ import numpy as np
5928
+
5929
+ num_cols_all = df.select_dtypes(include="number").columns.tolist()
5930
+ if len(num_cols_all) >= 1:
5931
+ num_cols = num_cols_all[:max_cols] # use your cap (80)
5932
+ df_num = df[num_cols].copy()
5933
+
5934
+ # cap rows for speed (5k)
5935
+ if len(df_num) > max_rows:
5936
+ df_num = df_num.sample(max_rows, random_state=0)
5937
+
5938
+ # robust z: 0.6745 * (x - median) / MAD (MAD==0 → NaN)
5939
+ med = df_num.median(numeric_only=True)
5940
+ mad = (df_num - med).abs().median(numeric_only=True)
5941
+ rz = 0.6745 * (df_num - med) / mad.replace(0, np.nan)
5942
+
5943
+ abs_rz = rz.abs()
5944
+ row_score = abs_rz.max(axis=1, skipna=True) # strongest dev across features
5945
+ top_idx = row_score.nlargest(3).index.tolist()
5946
+
5947
+ # Build compact, mobile-friendly cards for the top 3 rows
5948
+ cards_html = []
5949
+ for ridx in top_idx:
5950
+ # top contributing columns for this row
5951
+ contrib = abs_rz.loc[ridx].dropna().sort_values(ascending=False).head(5)
5952
+ maxv = float(contrib.iloc[0]) if len(contrib) else 0.0
5953
+
5954
+ bars = []
5955
+ for c, v in contrib.items():
5956
+ pct = 0.0 if maxv <= 0 else min(100.0, float(v) / maxv * 100.0)
5957
+ bars.append(f"""
5958
+ <div class="barrow">
5959
+ <span class="cname">{html.escape(str(c))}</span>
5960
+ <div class="bar"><div class="fill" style="width:{pct:.1f}%"></div></div>
5961
+ <span class="score">{v:.2f}</span>
5962
+ </div>
5963
+ """)
5964
+
5965
+ bars_html = "".join(bars) if bars else "<em>No strong single-column contributors.</em>"
5966
+
5967
+ # show the full record (all columns) with horizontal scroll
5968
+ row_vals = df.loc[ridx, :].to_dict()
5969
+ row_tbl = datatable_box(pd.DataFrame([row_vals]))
5970
+
5971
+ score_val = float(row_score.loc[ridx]) if pd.notnull(row_score.loc[ridx]) else 0.0
5972
+ title_idx = int(ridx) if isinstance(ridx, (int, np.integer)) else html.escape(str(ridx))
5973
+
5974
+ cards_html.append(f"""
5975
+ <div class="mad-card">
5976
+ <div class="mad-title">Row index: {title_idx} · score: {score_val:.2f}</div>
5977
+ <div class="mad-bars">{bars_html}</div>
5978
+ <div class="mad-row">{row_tbl}</div>
5979
+ </div>
5980
+ """)
5981
+
5982
+ grid_html = f"""
5983
+ <style>
5984
+ .mad-grid{{display:grid;grid-template-columns:repeat(3,minmax(0,1fr));gap:10px}}
5985
+ @media(max-width:1024px){{.mad-grid{{grid-template-columns:repeat(2,minmax(0,1fr))}}}}
5986
+ @media(max-width:640px){{.mad-grid{{grid-template-columns:repeat(1,minmax(0,1fr))}}}}
5987
+ .mad-card{{background:#fff;border:1px solid #e5e7eb;border-radius:10px;padding:8px 10px}}
5988
+ .mad-title{{font-weight:600;margin-bottom:6px}}
5989
+ .mad-bars .barrow{{display:grid;grid-template-columns:140px 1fr 46px;gap:6px;align-items:center;margin:4px 0}}
5990
+ .mad-bars .bar{{background:#eef2f7;border-radius:6px;height:8px;overflow:hidden}}
5991
+ .mad-bars .fill{{background:#0b8ae5;height:8px}}
5992
+ .mad-bars .cname{{font-size:12px;color:#444;white-space:nowrap;overflow:hidden;text-overflow:ellipsis}}
5993
+ .mad-bars .score{{font-size:12px;color:#333;text-align:right}}
5994
+ .mad-row .smx-table{{font-size:12px}}
5995
+ </style>
5996
+ <div class="mad-grid">{''.join(cards_html)}</div>
5997
+ """
5998
+
5999
+ data_cells.append({
6000
+ "title": "Outliers — Top 3 records",
6001
+ "output": Markup(grid_html),
6002
+ "code": (
6003
+ "num_cols = df.select_dtypes(include='number').columns.tolist()[:max_cols]\n"
6004
+ "df_num = df[num_cols]\n"
6005
+ "df_num = df_num.sample(max_rows, random_state=0) if len(df_num) > max_rows else df_num\n"
6006
+ "med = df_num.median(); mad = (df_num - med).abs().median()\n"
6007
+ "rz = 0.6745 * (df_num - med) / mad.replace(0, np.nan)\n"
6008
+ "row_score = rz.abs().max(axis=1)\n"
6009
+ "top3 = row_score.nlargest(3)\n"
6010
+ ),
6011
+ "span": "eda-col-12"
6012
+ })
6013
+ else:
6014
+ data_cells.append({
6015
+ "title": "Outliers — Top 3 records (robust MAD score)",
6016
+ "output": "<em>No numeric columns available.</em>",
6017
+ "code": "# no numeric columns",
6018
+ "span": "eda-col-6"
6019
+ })
6020
+ except Exception as _e:
6021
+ data_cells.append({
6022
+ "title": "Outliers — Top 3 records (robust MAD score)",
6023
+ "output": f"<em>Could not compute robust outliers: {html.escape(str(_e))}</em>",
6024
+ "code": "# error during robust outlier computation",
6025
+ "span": "eda-col-6"
6026
+ })
6027
+
6028
+ # 8) Outliers — Violin + Box (Top 3 numerics by IQR outliers, capped 5k×80)
6029
+ try:
6030
+ num_outliers = 3
6031
+ num_cols_all = df.select_dtypes(include="number").columns.tolist()
6032
+ if len(num_cols_all) >= 1:
6033
+ num_cols = num_cols_all[:max_cols]
6034
+ dfn = df[num_cols].copy()
6035
+
6036
+ # cap rows for speed (5k)
6037
+ if len(dfn) > max_rows:
6038
+ dfn = dfn.sample(max_rows, random_state=0)
6039
+
6040
+ # rank columns by number of Tukey outliers (1.5*IQR)
6041
+ ranks = []
6042
+ for c in dfn.columns:
6043
+ s = pd.to_numeric(dfn[c], errors="coerce").dropna()
6044
+ if s.empty:
6045
+ ranks.append((c, 0, 0.0))
6046
+ continue
6047
+ q1 = s.quantile(0.25); q3 = s.quantile(0.75)
6048
+ iqr = float(q3 - q1)
6049
+ if iqr <= 0:
6050
+ ranks.append((c, 0, 0.0))
6051
+ continue
6052
+ lower = q1 - 1.5 * iqr
6053
+ upper = q3 + 1.5 * iqr
6054
+ out_count = int(((s < lower) | (s > upper)).sum())
6055
+ ranks.append((c, out_count, float(iqr)))
6056
+
6057
+ # choose the top `num_outliers` columns (break ties by IQR spread)
6058
+ sel_cols = [c for c, _, _ in sorted(ranks, key=lambda x: (-x[1], -x[2]))[:num_outliers]]
6059
+ if not sel_cols:
6060
+ raise ValueError("No numeric columns have spread for violin plots.")
6061
+
6062
+ # package data for JS (values only; thresholds for display)
6063
+ charts = []
6064
+ for c in sel_cols:
6065
+ s = pd.to_numeric(dfn[c], errors="coerce").dropna()
6066
+ if s.empty:
6067
+ continue
6068
+ q1 = s.quantile(0.25); q3 = s.quantile(0.75); iqr = q3 - q1
6069
+ lower = float(q1 - 1.5 * iqr); upper = float(q3 + 1.5 * iqr)
6070
+ out_count = int(((s < lower) | (s > upper)).sum())
6071
+ charts.append({
6072
+ "name": str(c),
6073
+ "values": [float(v) for v in s.tolist()],
6074
+ "lower": lower,
6075
+ "upper": upper,
6076
+ "n": int(s.size),
6077
+ "out": out_count
6078
+ })
6079
+
6080
+ container_id = f"violgrid_{uuid.uuid4().hex}"
6081
+ sub_divs = "\n".join([f'<div id="{container_id}_{i}" class="vplot"></div>' for i in range(len(charts))])
6082
+
6083
+ plot_html = f"""
6084
+ <style>
6085
+ /* mini-grid 3x2 → 2x? → 1x? */
6086
+ #{container_id}{{display:grid;grid-template-columns:repeat(3,minmax(0,1fr));gap:10px}}
6087
+ @media(max-width:1024px){{#{container_id}{{grid-template-columns:repeat(2,minmax(0,1fr))}}}}
6088
+ @media(max-width:640px){{#{container_id}{{grid-template-columns:repeat(1,minmax(0,1fr))}}}}
6089
+ /* each plot container – height set via JS for monotonic responsiveness */
6090
+ #{container_id} .vplot{{width:100%;}}
6091
+ </style>
6092
+ <div id="{container_id}">
6093
+ {sub_divs}
6094
+ </div>
6095
+ <script>
6096
+ (function(){{
6097
+ var charts = {json.dumps(charts)};
6098
+
6099
+ function calcHeight(el){{
6100
+ var w = (el && el.clientWidth) || (el && el.parentElement && el.parentElement.clientWidth) || 360;
6101
+ // smooth, monotone: ~0.55×width, clamped
6102
+ return Math.round(Math.max(220, Math.min(360, w * 0.55)));
6103
+ }}
6104
+
6105
+ function drawOne(target, data){{
6106
+ var el = document.getElementById(target);
6107
+ if(!el) return;
6108
+ var h = calcHeight(el);
6109
+ el.style.setProperty('height', h + 'px', 'important'); // defeat global height:auto
6110
+
6111
+ var trace = {{
6112
+ type: 'violin',
6113
+ y: data.values,
6114
+ name: data.name,
6115
+ box: {{ visible: true }},
6116
+ meanline: {{ visible: true }},
6117
+ points: 'suspectedoutliers',
6118
+ hovertemplate: '%{{y}}<extra></extra>',
6119
+ showlegend: false
6120
+ }};
6121
+
6122
+ var layout = {{
6123
+ margin: {{ l: 40, r: 10, t: 26, b: 28 }},
6124
+ title: {{ text: data.name + ' (n=' + data.n + ', out=' + data.out + ')', font: {{ size: 12 }} }},
6125
+ yaxis: {{ automargin: true }}
6126
+ }};
6127
+
6128
+ var config = {{ displayModeBar: true, responsive: true }};
6129
+ if(window.Plotly && Plotly.newPlot){{
6130
+ Plotly.newPlot(el, [trace], layout, config).then(function(){{
6131
+ if(Plotly.Plots && Plotly.Plots.resize) Plotly.Plots.resize(el);
6132
+ }});
6133
+ }} else {{
6134
+ var p=document.createElement('div'); p.style.color='crimson'; p.style.marginTop='8px';
6135
+ p.textContent='Plotly is not loaded.'; el.appendChild(p);
6136
+ }}
6137
+ }}
6138
+
6139
+ function drawAll(){{
6140
+ for(var i=0;i<charts.length;i++) drawOne("{container_id}_" + i, charts[i]);
6141
+ }}
6142
+ drawAll();
6143
+ window.addEventListener('resize', drawAll);
6144
+ }})();
6145
+ </script>
6146
+ """
6147
+
6148
+ data_cells.append({
6149
+ "title": "Outliers — Violin + Box (Top 3 numerics by IQR outliers)",
6150
+ "output": Markup(plot_html),
6151
+ "code": (
6152
+ "dfn = df.select_dtypes(include='number').iloc[:, :max_cols]\n"
6153
+ "dfn = dfn.sample(max_rows, random_state=0) if len(dfn) > max_rows else dfn\n"
6154
+ "# rank columns by Tukey outliers (1.5*IQR) and plot violins with inner box"
6155
+ ),
6156
+ "span": "eda-col-12"
6157
+ })
6158
+
6159
+ else:
6160
+ data_cells.append({
6161
+ "title": "Outliers — Violin + Box",
6162
+ "output": "<em>No numeric columns available.</em>",
6163
+ "code": "# no numeric columns",
6164
+ "span": "eda-col-6"
6165
+ })
6166
+ except Exception as _e:
6167
+ data_cells.append({
6168
+ "title": "Outliers — Violin + Box",
6169
+ "output": f"<em>Could not render violins: {html.escape(str(_e))}</em>",
6170
+ "code": "# error during violin rendering",
6171
+ "span": "eda-col-6"
6172
+ })
6173
+
6174
+ # 9) Missing Values table
5616
6175
  nulls = df.isnull().sum()
5617
6176
  nulls_pct = (df.isnull().mean() * 100).round(1)
5618
6177
  missing_df = pd.DataFrame({
5619
- "Missing Values": nulls,
5620
- "Missing (%)": nulls_pct
6178
+ "Column": df.columns,
6179
+ "Missing Values": nulls.values,
6180
+ "Missing (%)": nulls_pct.values
5621
6181
  })
5622
- missing = missing_df[missing_df["Missing Values"] > 0]
6182
+ missing = missing_df[missing_df["Missing Values"] > 0]
5623
6183
  data_cells.append({
5624
6184
  "title": "Missing Values",
5625
6185
  "output": Markup(datatable_box(missing)) if not missing.empty else "<em>No missing values detected.</em>",
5626
6186
  "code": (
5627
6187
  "nulls = df.isnull().sum()\n"
5628
6188
  "nulls_pct = (df.isnull().mean() * 100).round(1)\n"
5629
- "missing_df = pd.DataFrame({'Missing Values': nulls, 'Missing (%)': nulls_pct})\n"
6189
+ "missing_df = pd.DataFrame({\n"
6190
+ " 'Column': df.columns,\n"
6191
+ " 'Missing Values': nulls.values,\n"
6192
+ " 'Missing (%)': nulls_pct.values\n"
6193
+ "})\\n"
5630
6194
  "missing_df[missing_df['Missing Values'] > 0]"
5631
- )
6195
+ ),
6196
+ "span":"eda-col-4"
5632
6197
  })
5633
- dtype_df = pd.DataFrame({
5634
- "Type": df.dtypes.astype(str),
5635
- "Non-Null Count": df.notnull().sum(),
5636
- "Unique Values": df.nunique()
5637
- })
5638
- data_cells.append({
5639
- "title": "Column Types",
5640
6198
 
5641
- "output": Markup(datatable_box(dtype_df)),
5642
- "code": (
5643
- "pd.DataFrame({\n"
5644
- " 'Type': df.dtypes.astype(str),\n"
5645
- " 'Non-Null Count': df.notnull().sum(),\n"
5646
- " 'Unique Values': df.nunique()\n"
5647
- "})"
6199
+ # 10) Missingness (Top 20) – Plotly bar chart
6200
+ if not missing.empty:
6201
+ top_miss = (
6202
+ missing_df[missing_df["Missing Values"] > 0]
6203
+ .sort_values("Missing (%)", ascending=False)
6204
+ .loc[:, ["Column", "Missing (%)"]]
6205
+ .head(20)
6206
+ .reset_index(drop=True)
5648
6207
  )
5649
- })
6208
+
6209
+ container_id = f"miss_plot_{uuid.uuid4().hex}"
6210
+ x_vals = [html.escape(str(c)) for c in top_miss["Column"].tolist()]
6211
+ y_vals = [float(v) for v in top_miss["Missing (%)"].tolist()]
6212
+
6213
+ plot_html = f"""
6214
+ <div id="{container_id}" style="width:100%;height:340px;"></div>
6215
+ <script>
6216
+ (function(){{
6217
+ var x = {json.dumps(x_vals)};
6218
+ var y = {json.dumps(y_vals)};
6219
+ var data = [{{
6220
+ type: 'bar',
6221
+ x: x,
6222
+ y: y,
6223
+ hovertemplate: '%{{x}}<br>Missing: %{{y:.1f}}%<extra></extra>'
6224
+ }}];
6225
+ var layout = {{
6226
+ margin: {{l:50, r:20, t:10, b:100}},
6227
+ yaxis: {{ title: 'Missing (%)', rangemode: 'tozero' }},
6228
+ xaxis: {{ title: 'Column', tickangle: -45 }}
6229
+ }};
6230
+ if (window.Plotly && Plotly.newPlot) {{
6231
+ Plotly.newPlot("{container_id}", data, layout, {{displayModeBar:true, responsive:true}});
6232
+ }} else {{
6233
+ var p=document.createElement('div'); p.style.color='crimson'; p.style.marginTop='8px';
6234
+ p.textContent='Plotly is not loaded.'; document.getElementById("{container_id}").appendChild(p);
6235
+ }}
6236
+ }})();
6237
+ </script>
6238
+ """
6239
+ data_cells.append({
6240
+ "title": "Missingness (Top 20)",
6241
+ "output": Markup(plot_html),
6242
+ "code": (
6243
+ "nulls = df.isnull().sum();\n"
6244
+ "nulls_pct = (\n"
6245
+ " df.isnull().mean()*100\n"
6246
+ ").round(1)\n"
6247
+ "missing_df = pd.DataFrame({\n"
6248
+ " 'Column': df.columns,\n"
6249
+ " 'Missing Values': nulls.values,\n"
6250
+ " 'Missing (%)': nulls_pct.values\n"
6251
+ "})\n\n"
6252
+ "top_miss = (\n"
6253
+ " missing_df[missing_df['Missing Values'] > 0]\n"
6254
+ " .sort_values('Missing (%)', ascending=False)\n"
6255
+ " .loc[:, ['Column', 'Missing (%)']]\n"
6256
+ " .head(20)\n"
6257
+ " .reset_index(drop=True)\n"
6258
+ ")\n"
6259
+ "top_miss"
6260
+ ),
6261
+ "span":"eda-col-4"
6262
+ })
6263
+
6264
+ # 11) Category Distribution — 3D doughnut (dataset-agnostic, capped 5k)
6265
+ try:
6266
+ # 1) Column universe: object / category / bool (integers remain numeric)
6267
+ cat_cols_all = df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
6268
+
6269
+ # 2) Honour user pick if categorical; otherwise auto-pick a sensible default
6270
+ dist_param = (request.args.get("dist") or request.form.get("dist") or "").strip()
6271
+ if dist_param and dist_param in cat_cols_all:
6272
+ dist_col = dist_param
6273
+ else:
6274
+ # Auto-pick preference: 3–20 unique values excluding obvious ID-like;
6275
+ # else allow 2-level; else first categorical.
6276
+ n_total = len(df)
6277
+ uniques_loc = df.nunique(dropna=True)
6278
+ miss_pct_loc = (df.isnull().mean() * 100).round(1)
6279
+ id_like_loc = {c for c in cat_cols_all if n_total > 0 and (uniques_loc.get(c, 0) / n_total) >= 0.95}
6280
+
6281
+ multilevel = [c for c in cat_cols_all
6282
+ if (3 <= int(uniques_loc.get(c, df[c].nunique(dropna=True))) <= 20)
6283
+ and (c not in id_like_loc)]
6284
+ if multilevel:
6285
+ # score nearer 8 levels and lower missingness
6286
+ best, best_score = "", -1e9
6287
+ for c in multilevel:
6288
+ k = int(uniques_loc.get(c, df[c].nunique(dropna=True)))
6289
+ miss = float(miss_pct_loc.get(c, (df[c].isna().mean() * 100)))
6290
+ score = -abs(k - 8) - (miss / 10.0)
6291
+ if score > best_score:
6292
+ best, best_score = c, score
6293
+ dist_col = best
6294
+ else:
6295
+ twolevel = [c for c in cat_cols_all if int(uniques_loc.get(c, df[c].nunique(dropna=True))) == 2]
6296
+ dist_col = (twolevel[0] if twolevel else (cat_cols_all[0] if cat_cols_all else ""))
6297
+
6298
+ # 3) Build options AFTER dist_col is final (so selection sticks)
6299
+ opts = []
6300
+ for c in cat_cols_all:
6301
+ sel = " selected" if c == dist_col else ""
6302
+ opts.append(f'<option value="{html.escape(str(c))}"{sel}>{html.escape(str(c))}</option>')
6303
+ opts_html = "\n".join(opts)
6304
+
6305
+ form_html = f"""
6306
+ <a id="dist3d"></a>
6307
+ <form method="get" action="/dashboard#dist3d"
6308
+ style="display:flex; flex-wrap:wrap; gap:10px; align-items:center; margin-bottom:8px;">
6309
+ <input type="hidden" name="section" value="explore">
6310
+ <input type="hidden" name="dataset" value="{html.escape(str(selected_dataset or ''))}">
6311
+ <label><strong>Distribution column:</strong></label>
6312
+ <select name="dist" onchange="this.form.submit()" style="min-width:200px; height:28px;">
6313
+ {opts_html}
6314
+ </select>
6315
+ </form>
6316
+ """
6317
+
6318
+ if dist_col:
6319
+ s = df[dist_col]
6320
+ # cap cheap counting to 5k
6321
+ if len(s) > 5000:
6322
+ s = s.sample(5000, random_state=0)
6323
+
6324
+ # 4) Robust counting: treat NaN as "Missing", stringify labels for safety
6325
+ s = s.astype("object")
6326
+ s = s.where(~s.isna(), other="Missing")
6327
+ vc = s.value_counts(dropna=False)
6328
+
6329
+ if vc.empty:
6330
+ raise ValueError("No values to display for the selected column.")
6331
+
6332
+ # Top-8 + 'Other' (excluding 'Missing' which we keep separate)
6333
+ top_k = 8
6334
+ non_missing = vc.drop(index=["Missing"], errors="ignore") if "Missing" in vc.index else vc
6335
+ head = non_missing.sort_values(ascending=False).head(top_k)
6336
+ other = int(non_missing.iloc[top_k:].sum()) if len(non_missing) > top_k else 0
6337
+ miss = int(vc.get("Missing", 0))
6338
+
6339
+ labels = [str(x) for x in head.index.tolist()]
6340
+ values = [int(v) for v in head.values.tolist()]
6341
+ if other > 0:
6342
+ labels.append("Other"); values.append(other)
6343
+ if miss > 0:
6344
+ labels.append("Missing"); values.append(miss)
6345
+
6346
+ # colours for faux 3D (no external deps)
6347
+ k = len(labels)
6348
+ def _hsl(i, n, l=0.58, s=0.62):
6349
+ h = (i / max(1, n)) * 360.0
6350
+ return f"hsl({int(h)}, {int(s*100)}%, {int(l*100)}%)"
6351
+ top_colors = [_hsl(i, k, l=0.58) for i in range(k)]
6352
+ base_colors = [_hsl(i, k, l=0.40) for i in range(k)]
6353
+
6354
+ container_id = f"dist3d_{uuid.uuid4().hex}"
6355
+ total = int(sum(values))
6356
+
6357
+ plot_html = f"""
6358
+ <div id="{container_id}" class="dist3d-chart"></div>
6359
+ <script>
6360
+ (function(){{
6361
+ var el = document.getElementById("{container_id}");
6362
+ var labels = {json.dumps(labels)};
6363
+ var values = {json.dumps(values)};
6364
+ var total = {total};
6365
+
6366
+ var base = {{
6367
+ type: 'pie', labels: labels, values: values,
6368
+ hole: 0.64, sort: false, textinfo: 'none', hoverinfo: 'skip',
6369
+ marker: {{ colors: {json.dumps(base_colors)} }},
6370
+ showlegend: false
6371
+ }};
6372
+ var top = {{
6373
+ type: 'pie', labels: labels, values: values,
6374
+ hole: 0.52, sort: false,
6375
+ textinfo: 'percent', textposition: 'inside', insidetextorientation: 'radial',
6376
+ hovertemplate: '%{{label}}<br>%{{value}} of {total:,} (%{{percent}})<extra></extra>',
6377
+ marker: {{ colors: {json.dumps(top_colors)}, line: {{ width: 1, color: 'rgba(0,0,0,0.25)' }} }},
6378
+ showlegend: true, legendgroup: 'dist'
6379
+ }};
6380
+
6381
+ function parentWidth(){{
6382
+ return (el && el.parentElement ? el.parentElement.clientWidth : (window.innerWidth||360));
6383
+ }}
6384
+
6385
+ // Smooth, monotonic: height = 0.65 * width, clamped [220, 520].
6386
+ function chartHeight(){{
6387
+ var w = parentWidth();
6388
+ return Math.round(Math.max(220, Math.min(520, w * 0.65)));
6389
+ }}
6390
+
6391
+ function legendOrientation(){{
6392
+ return parentWidth() < 640 ? 'h' : 'v';
6393
+ }}
6394
+
6395
+ function makeLayout(){{
6396
+ return {{
6397
+ margin: {{ l:10, r:10, t:10, b:10 }},
6398
+ legend: {{ orientation: legendOrientation(), x:1, xanchor:'right', y:1 }},
6399
+ uniformtext: {{ mode: 'hide', minsize: 10 }}
6400
+ }};
6401
+ }}
6402
+
6403
+ function applySize(){{
6404
+ // Override global .plotly-graph-div {{ height:auto !important }}
6405
+ el.style.setProperty('height', chartHeight() + 'px', 'important');
6406
+ if (window.Plotly) {{
6407
+ Plotly.relayout(el, {{ 'legend.orientation': legendOrientation() }});
6408
+ Plotly.Plots.resize(el);
6409
+ }}
6410
+ }}
6411
+
6412
+ if (window.Plotly && Plotly.newPlot) {{
6413
+ // Initial explicit height before draw
6414
+ el.style.setProperty('height', chartHeight() + 'px', 'important');
6415
+ Plotly.newPlot(el, [base, top], makeLayout(), {{ displayModeBar:true, responsive:true }})
6416
+ .then(function(){{ applySize(); }});
6417
+ window.addEventListener('resize', applySize);
6418
+ }} else {{
6419
+ var p=document.createElement('div'); p.style.color='crimson'; p.style.marginTop='8px';
6420
+ p.textContent='Plotly is not loaded.'; el.appendChild(p);
6421
+ }}
6422
+ }})();
6423
+ </script>
6424
+ """
6425
+
6426
+ data_cells.append({
6427
+ "title": f"Category Distribution — ({html.escape(dist_col)})",
6428
+ "output": Markup(form_html + plot_html),
6429
+ "code": (
6430
+ "dist_col = '<chosen categorical>'\n"
6431
+ "s = df[dist_col].astype('object').where(~df[dist_col].isna(), other='Missing')\n"
6432
+ "vc = s.value_counts(dropna=False)\n"
6433
+ "top_k = 8 # Top-8 + Other (+ Missing)\n"
6434
+ ),
6435
+ "span": "eda-col-4"
6436
+ })
6437
+ else:
6438
+ data_cells.append({
6439
+ "title": "Category Distribution — 3D doughnut",
6440
+ "output": "<em>No categorical columns found.</em>",
6441
+ "code": "# no categorical columns",
6442
+ "span": "eda-col-4"
6443
+ })
6444
+ except Exception as _e:
6445
+ data_cells.append({
6446
+ "title": "Category Distribution — 3D doughnut",
6447
+ "output": f"<em>Could not render distribution: {html.escape(str(_e))}</em>",
6448
+ "code": "# error during distribution rendering",
6449
+ "span": "eda-col-4"
6450
+ })
6451
+
5650
6452
  for cell in data_cells:
5651
- cell["highlighted_code"] = Markup(_pygmentize(cell["code"]))
6453
+ cell["highlighted_code"] = Markup(_pygmentize(cell["code"]))
6454
+
5652
6455
  highlighted_ai_code = _pygmentize(ai_code)
6456
+
5653
6457
  return render_template(
5654
6458
  "dashboard.html",
5655
6459
  section=section,
5656
6460
  datasets=datasets,
5657
6461
  selected_dataset=selected_dataset,
5658
6462
  ai_outputs=ai_outputs,
5659
- ai_code=ai_code, # AI-generated code for toggle
6463
+ ai_code=ai_code,
5660
6464
  highlighted_ai_code=highlighted_ai_code if ai_code else None,
5661
6465
  askai_question=smx.sanitize_rough_to_markdown_task(askai_question),
5662
- refined_question=refined_question, # Refined question
6466
+ refined_question=refined_question,
6467
+ tasks=tags,
5663
6468
  data_cells=data_cells,
5664
6469
  session_id=session_id,
6470
+ llm_usage=llm_usage
5665
6471
  )
5666
6472
 
5667
6473
 
@@ -5672,7 +6478,7 @@ def setup_routes(smx):
5672
6478
  if not html_doc:
5673
6479
  return ("No result available.", 404)
5674
6480
 
5675
- buf = io.BytesIO(html_doc.encode("utf-8"))
6481
+ buf = _std_io.BytesIO(html_doc.encode("utf-8"))
5676
6482
  buf.seek(0)
5677
6483
 
5678
6484
  # keep a copy if you wish, or free it:
@@ -5744,7 +6550,7 @@ def setup_routes(smx):
5744
6550
  text = re.sub(r"<[^>]+>", " ", text)
5745
6551
  text = re.sub(r"\n{3,}", "\n\n", text)
5746
6552
  text = html.unescape(text).strip()
5747
- buf = io.BytesIO()
6553
+ buf = _std_io.BytesIO()
5748
6554
  doc = SimpleDocTemplate(buf, pagesize=A4, leftMargin=16*mm, rightMargin=16*mm, topMargin=16*mm, bottomMargin=16*mm)
5749
6555
  styles = getSampleStyleSheet()
5750
6556
  flow = []