syntaxmatrix 2.5.1__py3-none-any.whl → 2.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- syntaxmatrix/__init__.py +3 -3
- syntaxmatrix/commentary.py +134 -112
- syntaxmatrix/core.py +449 -338
- syntaxmatrix/dataset_preprocessing.py +218 -0
- syntaxmatrix/display.py +89 -37
- syntaxmatrix/gpt_models_latest.py +5 -4
- syntaxmatrix/profiles.py +19 -4
- syntaxmatrix/routes.py +932 -131
- syntaxmatrix/settings/model_map.py +38 -30
- syntaxmatrix/static/icons/hero_bg.jpg +0 -0
- syntaxmatrix/templates/dashboard.html +256 -55
- syntaxmatrix/utils.py +2254 -84
- {syntaxmatrix-2.5.1.dist-info → syntaxmatrix-2.5.3.dist-info}/METADATA +3 -1
- {syntaxmatrix-2.5.1.dist-info → syntaxmatrix-2.5.3.dist-info}/RECORD +17 -18
- syntaxmatrix/model_templates.py +0 -29
- syntaxmatrix/smx_task_runner.py +0 -12
- syntaxmatrix/smx_usage_example.py +0 -4
- {syntaxmatrix-2.5.1.dist-info → syntaxmatrix-2.5.3.dist-info}/WHEEL +0 -0
- {syntaxmatrix-2.5.1.dist-info → syntaxmatrix-2.5.3.dist-info}/licenses/LICENSE.txt +0 -0
- {syntaxmatrix-2.5.1.dist-info → syntaxmatrix-2.5.3.dist-info}/top_level.txt +0 -0
syntaxmatrix/routes.py
CHANGED
|
@@ -1,18 +1,28 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
import
|
|
4
|
-
|
|
5
|
-
import io
|
|
1
|
+
import os, zipfile, time, uuid, werkzeug, queue, html, ast, re
|
|
2
|
+
import threading, textwrap, json, pandas as pd
|
|
3
|
+
import contextlib
|
|
4
|
+
|
|
5
|
+
import io as _std_io
|
|
6
|
+
|
|
6
7
|
from io import BytesIO
|
|
8
|
+
from scipy import io
|
|
9
|
+
from flask import Blueprint, Response, request, send_file, session
|
|
10
|
+
from flask import render_template, render_template_string, url_for, redirect, g
|
|
11
|
+
from flask import flash, jsonify, send_from_directory, get_flashed_messages, stream_with_context
|
|
12
|
+
|
|
13
|
+
from flask_login import current_user
|
|
14
|
+
|
|
7
15
|
from PyPDF2 import PdfReader
|
|
8
16
|
from markupsafe import Markup
|
|
9
|
-
from urllib.parse import quote
|
|
17
|
+
from urllib.parse import quote
|
|
18
|
+
from datetime import datetime
|
|
19
|
+
from prompt_toolkit import HTML
|
|
20
|
+
from PyPDF2.errors import EmptyFileError
|
|
21
|
+
import numpy as np
|
|
10
22
|
from .auth import register_user, authenticate, login_required, admin_required, superadmin_required
|
|
11
|
-
from flask import Blueprint, Response, request, send_file, session, render_template, render_template_string, redirect, url_for, flash, jsonify, send_from_directory, get_flashed_messages, stream_with_context
|
|
12
23
|
|
|
13
24
|
from syntaxmatrix.themes import DEFAULT_THEMES
|
|
14
25
|
from syntaxmatrix import db
|
|
15
|
-
from syntaxmatrix.utils import *
|
|
16
26
|
from syntaxmatrix.vector_db import add_pdf_chunk
|
|
17
27
|
from syntaxmatrix.file_processor import *
|
|
18
28
|
from syntaxmatrix.vectorizer import embed_text
|
|
@@ -22,14 +32,13 @@ from syntaxmatrix.history_store import SQLHistoryStore, PersistentHistoryStore
|
|
|
22
32
|
from syntaxmatrix.kernel_manager import SyntaxMatrixKernelManager, execute_code_in_kernel
|
|
23
33
|
from syntaxmatrix.vector_db import *
|
|
24
34
|
from syntaxmatrix.settings.string_navbar import string_navbar_items
|
|
25
|
-
from syntaxmatrix.settings.model_map import PROVIDERS_MODELS, MODEL_DESCRIPTIONS, PURPOSE_TAGS, EMBEDDING_MODELS
|
|
26
|
-
from .project_root import detect_project_root
|
|
27
|
-
from
|
|
28
|
-
from
|
|
35
|
+
from syntaxmatrix.settings.model_map import GPT_MODELS_LATEST, PROVIDERS_MODELS, MODEL_DESCRIPTIONS, PURPOSE_TAGS, EMBEDDING_MODELS
|
|
36
|
+
from syntaxmatrix.project_root import detect_project_root
|
|
37
|
+
from syntaxmatrix import generate_page as _genpage
|
|
38
|
+
from syntaxmatrix import auth as _auth
|
|
29
39
|
from syntaxmatrix import profiles as _prof
|
|
30
40
|
from syntaxmatrix.gpt_models_latest import set_args, extract_output_text as _out
|
|
31
|
-
from
|
|
32
|
-
import contextlib
|
|
41
|
+
from syntaxmatrix.agentic.agents import classify_ml_job_agent, refine_question_agent, text_formatter_agent
|
|
33
42
|
|
|
34
43
|
try:
|
|
35
44
|
from pygments import highlight as _hl
|
|
@@ -39,10 +48,16 @@ try:
|
|
|
39
48
|
except Exception:
|
|
40
49
|
_HAVE_PYGMENTS = False
|
|
41
50
|
|
|
42
|
-
from
|
|
43
|
-
from
|
|
51
|
+
# from syntaxmatrix.utils import *
|
|
52
|
+
from syntaxmatrix.utils import (
|
|
53
|
+
auto_inject_template, drop_bad_classification_metrics, ensure_accuracy_block,
|
|
54
|
+
ensure_image_output, ensure_output, fix_plain_prints, fix_print_html, patch_fix_sentinel_plot_calls,
|
|
55
|
+
patch_pairplot, fix_to_datetime_errors, harden_ai_code, patch_ensure_seaborn_import, get_plotting_imports,
|
|
56
|
+
patch_fix_seaborn_palette_calls, patch_quiet_specific_warnings, fix_seaborn_barplot_nameerror, fix_seaborn_boxplot_nameerror, ensure_matplotlib_title, patch_plot_code, patch_prefix_seaborn_calls, fix_scatter_and_summary, inject_auto_preprocessing, fix_importance_groupby, patch_pie_chart, patch_rmse_calls, clean_llm_code
|
|
57
|
+
)
|
|
44
58
|
|
|
45
|
-
|
|
59
|
+
from syntaxmatrix.agentic.agent_tools import ToolRunner
|
|
60
|
+
from syntaxmatrix.agentic.code_tools_registry import EARLY_SANITIZERS, SYNTAX_AND_REPAIR
|
|
46
61
|
|
|
47
62
|
_CLIENT_DIR = detect_project_root()
|
|
48
63
|
_stream_q = queue.Queue()
|
|
@@ -482,9 +497,8 @@ def setup_routes(smx):
|
|
|
482
497
|
padding: 2px 8px;
|
|
483
498
|
color:cyan;
|
|
484
499
|
}}
|
|
485
|
-
|
|
486
500
|
</style>
|
|
487
|
-
|
|
501
|
+
|
|
488
502
|
<!-- Add MathJax -->
|
|
489
503
|
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
|
|
490
504
|
<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
|
|
@@ -623,8 +637,8 @@ def setup_routes(smx):
|
|
|
623
637
|
|
|
624
638
|
desktop_nav = f"""
|
|
625
639
|
<div class="nav-left">
|
|
626
|
-
<a class="logo" href="/"
|
|
627
|
-
<a class="logo" href="/" style="text-decoration:none;
|
|
640
|
+
<a class="logo" href="/">{smx.site_logo}</a>
|
|
641
|
+
<a class="logo" href="/" style="text-decoration:none; margin:0 24px 0 0; padding:0px; vertical-align:middle;">{smx.site_title}</a>
|
|
628
642
|
<div class="nav-links" style="margin-left:24px;">
|
|
629
643
|
{nav_links}
|
|
630
644
|
</div>
|
|
@@ -3767,10 +3781,13 @@ def setup_routes(smx):
|
|
|
3767
3781
|
|
|
3768
3782
|
|
|
3769
3783
|
# if any live cached profile on smx matches this name, clear it
|
|
3770
|
-
|
|
3784
|
+
db_profiles = prof.get_profiles()
|
|
3785
|
+
# for attr in ("_chat_profile", "_admin_profile", "_coding_profile", "_classification_profile", "_summarization_profile", "_vision2text_profile"):
|
|
3786
|
+
for attr in ([db_profiles]):
|
|
3771
3787
|
prof = getattr(smx, attr, None)
|
|
3772
3788
|
if isinstance(prof, dict) and prof.get("name") == name:
|
|
3773
3789
|
setattr(smx, attr, {})
|
|
3790
|
+
prof.refresh_profiles_cache()
|
|
3774
3791
|
|
|
3775
3792
|
elif action == "add_model":
|
|
3776
3793
|
prov = request.form.get("catalog_provider","").strip()
|
|
@@ -5112,7 +5129,7 @@ def setup_routes(smx):
|
|
|
5112
5129
|
rows = _auth.list_role_audit(limit=limit)
|
|
5113
5130
|
|
|
5114
5131
|
import io, csv, datetime
|
|
5115
|
-
buf =
|
|
5132
|
+
buf = _std_io.StringIO()
|
|
5116
5133
|
writer = csv.writer(buf)
|
|
5117
5134
|
writer.writerow(["timestamp", "actor", "target", "from_role", "to_role"])
|
|
5118
5135
|
for r in rows:
|
|
@@ -5373,25 +5390,28 @@ def setup_routes(smx):
|
|
|
5373
5390
|
# ────────────────────────────────────────────────────────────────────────────────────────
|
|
5374
5391
|
# DASHBOARD
|
|
5375
5392
|
# ────────────────────────────────────────────────────────────────────────────────────────
|
|
5376
|
-
# ── DASHBOARD VIEW DETAILS -----------------------------
|
|
5377
5393
|
@smx.app.route("/dashboard", methods=["GET", "POST"])
|
|
5378
5394
|
# @login_required
|
|
5379
5395
|
def dashboard():
|
|
5380
5396
|
DATA_FOLDER = os.path.join(_CLIENT_DIR, "uploads", "data")
|
|
5381
5397
|
os.makedirs(DATA_FOLDER, exist_ok=True)
|
|
5382
|
-
|
|
5383
|
-
####################################################################
|
|
5384
5398
|
|
|
5385
|
-
|
|
5386
|
-
|
|
5387
|
-
|
|
5388
|
-
- No top-level statements between if/elif/else branches.
|
|
5389
|
-
- Regression must use either sklearn with train_test_split (then X_test exists) and R^2/MAE/RMSE, or statsmodels OLS. No accuracy_score in regression.
|
|
5390
|
-
- Keep all plotting + savefig + BytesIO + display inside the branch that created the figure.
|
|
5391
|
-
Return ONLY the corrected cell.
|
|
5392
|
-
"""
|
|
5393
|
-
|
|
5399
|
+
max_rows = 5000
|
|
5400
|
+
max_cols = 80
|
|
5401
|
+
|
|
5394
5402
|
def _smx_repair_python_cell(py_code: str) -> str:
|
|
5403
|
+
|
|
5404
|
+
_CELL_REPAIR_RULES = """
|
|
5405
|
+
You are an experienced Python code reviewer
|
|
5406
|
+
Fix the Python cell to satisfy:
|
|
5407
|
+
- Single valid cell; imports at the top.
|
|
5408
|
+
- Do not import or invoke or use 'python-dotenv' or 'dotenv' because it's not needed.
|
|
5409
|
+
- No top-level statements between if/elif/else branches.
|
|
5410
|
+
- Regression must use either sklearn with train_test_split (then X_test exists) and R^2/MAE/RMSE,
|
|
5411
|
+
or statsmodels OLS. No accuracy_score in regression.
|
|
5412
|
+
- Keep all plotting + savefig + BytesIO + display inside the branch that created the figure.
|
|
5413
|
+
- Return ONLY the corrected cell.
|
|
5414
|
+
"""
|
|
5395
5415
|
code = textwrap.dedent(py_code or "").strip()
|
|
5396
5416
|
needs_fix = False
|
|
5397
5417
|
if re.search(r"\baccuracy_score\b", code) and re.search(r"\bLinearRegression\b|\bOLS\b", code):
|
|
@@ -5404,66 +5424,84 @@ def setup_routes(smx):
|
|
|
5404
5424
|
needs_fix = True
|
|
5405
5425
|
if not needs_fix:
|
|
5406
5426
|
return code
|
|
5407
|
-
|
|
5427
|
+
|
|
5428
|
+
_prompt = f"```python\n{code}\n```"
|
|
5429
|
+
|
|
5430
|
+
prof = _prof.get_profile("classification") or _prof.get_profile("admin")
|
|
5408
5431
|
if not prof:
|
|
5409
5432
|
return code
|
|
5410
|
-
|
|
5411
|
-
|
|
5412
|
-
_client =
|
|
5433
|
+
|
|
5434
|
+
prof["client"] = _prof.get_client(prof)
|
|
5435
|
+
_client = prof["client"]
|
|
5413
5436
|
_model = prof["model"]
|
|
5414
|
-
|
|
5415
|
-
|
|
5416
|
-
|
|
5417
|
-
|
|
5418
|
-
|
|
5419
|
-
)
|
|
5420
|
-
).strip()
|
|
5437
|
+
_provider = prof["provider"].lower()
|
|
5438
|
+
|
|
5439
|
+
#1 Google
|
|
5440
|
+
if _provider == "google":
|
|
5441
|
+
from google.genai import types
|
|
5421
5442
|
|
|
5422
|
-
|
|
5443
|
+
fixed = _client.models.generate_content(
|
|
5444
|
+
model=_model,
|
|
5445
|
+
contents=_prompt,
|
|
5446
|
+
config=types.GenerateContentConfig(
|
|
5447
|
+
system_instruction=_CELL_REPAIR_RULES,
|
|
5448
|
+
temperature=0.8,
|
|
5449
|
+
max_output_tokens=1024,
|
|
5450
|
+
),
|
|
5451
|
+
)
|
|
5452
|
+
|
|
5453
|
+
#2 Openai
|
|
5454
|
+
elif _provider == "openai" and _model in GPT_MODELS_LATEST:
|
|
5455
|
+
|
|
5423
5456
|
args = set_args(
|
|
5424
|
-
model=
|
|
5457
|
+
model=_model,
|
|
5425
5458
|
instructions=_CELL_REPAIR_RULES,
|
|
5426
|
-
|
|
5427
|
-
|
|
5428
|
-
|
|
5429
|
-
|
|
5430
|
-
|
|
5459
|
+
input=[{"role": "user", "content": _prompt}],
|
|
5460
|
+
previous_id=None,
|
|
5461
|
+
store=False,
|
|
5462
|
+
reasoning_effort="medium",
|
|
5463
|
+
verbosity="medium",
|
|
5431
5464
|
)
|
|
5432
|
-
fixed = _out(_client.responses.create(**args))
|
|
5433
|
-
|
|
5434
|
-
|
|
5435
|
-
|
|
5436
|
-
|
|
5437
|
-
|
|
5438
|
-
|
|
5439
|
-
|
|
5440
|
-
|
|
5441
|
-
|
|
5442
|
-
|
|
5443
|
-
|
|
5444
|
-
|
|
5445
|
-
|
|
5446
|
-
|
|
5447
|
-
|
|
5448
|
-
|
|
5449
|
-
|
|
5450
|
-
|
|
5451
|
-
|
|
5465
|
+
fixed = _out(_client.responses.create(**args))
|
|
5466
|
+
|
|
5467
|
+
# Anthropic
|
|
5468
|
+
elif _provider == "anthropic":
|
|
5469
|
+
|
|
5470
|
+
fixed = _client.messages.create(
|
|
5471
|
+
model=_model,
|
|
5472
|
+
max_tokens=1024,
|
|
5473
|
+
system=_CELL_REPAIR_RULES,
|
|
5474
|
+
messages=[{"role": "user", "content":_prompt}],
|
|
5475
|
+
stream=False,
|
|
5476
|
+
)
|
|
5477
|
+
|
|
5478
|
+
# OpenAI SDK
|
|
5479
|
+
else:
|
|
5480
|
+
fixed = _client.chat.completions.create(
|
|
5481
|
+
model=_model,
|
|
5482
|
+
messages=[
|
|
5483
|
+
{"role": "system", "content":_CELL_REPAIR_RULES},
|
|
5484
|
+
{"role": "user", "content":_prompt},
|
|
5485
|
+
],
|
|
5486
|
+
max_tokens=1024,
|
|
5487
|
+
)
|
|
5488
|
+
|
|
5489
|
+
fixed_txt = clean_llm_code(fixed)
|
|
5490
|
+
|
|
5452
5491
|
try:
|
|
5453
|
-
|
|
5454
|
-
|
|
5455
|
-
|
|
5492
|
+
# Only accept the repaired cell if it's valid Python
|
|
5493
|
+
ast.parse(fixed_txt)
|
|
5494
|
+
return fixed_txt
|
|
5495
|
+
except Exception:
|
|
5496
|
+
# If the repaired version is still broken, fall back to the original code
|
|
5456
5497
|
return code
|
|
5457
|
-
|
|
5458
|
-
|
|
5498
|
+
|
|
5459
5499
|
section = request.args.get("section", "explore")
|
|
5460
5500
|
datasets = [f for f in os.listdir(DATA_FOLDER) if f.lower().endswith(".csv")]
|
|
5461
5501
|
selected_dataset = request.form.get("dataset") or request.args.get("dataset")
|
|
5462
5502
|
if not selected_dataset and datasets:
|
|
5463
5503
|
selected_dataset = datasets[0]
|
|
5464
5504
|
|
|
5465
|
-
# selected_dataset = selected_dataset or ""
|
|
5466
|
-
|
|
5467
5505
|
# Handle file upload
|
|
5468
5506
|
if request.method == "POST" and "dataset_file" in request.files:
|
|
5469
5507
|
f = request.files["dataset_file"]
|
|
@@ -5475,7 +5513,7 @@ def setup_routes(smx):
|
|
|
5475
5513
|
|
|
5476
5514
|
# Load dataframe if available
|
|
5477
5515
|
df = pd.read_csv(os.path.join(DATA_FOLDER, selected_dataset)) if selected_dataset else None
|
|
5478
|
-
|
|
5516
|
+
|
|
5479
5517
|
# --- Jupyter kernel management ---
|
|
5480
5518
|
session_id = session.get('smx_kernel_id')
|
|
5481
5519
|
if not session_id:
|
|
@@ -5486,38 +5524,84 @@ def setup_routes(smx):
|
|
|
5486
5524
|
|
|
5487
5525
|
# --- Handle Ask AI ---
|
|
5488
5526
|
ai_outputs = []
|
|
5527
|
+
dl_html = ""
|
|
5489
5528
|
askai_question = ""
|
|
5490
|
-
refined_question =
|
|
5529
|
+
refined_question = ""
|
|
5530
|
+
tags = []
|
|
5491
5531
|
ai_code = None
|
|
5532
|
+
eda_df = df
|
|
5533
|
+
llm_usage = None
|
|
5492
5534
|
|
|
5493
5535
|
if request.method == "POST" and "askai_question" in request.form:
|
|
5494
5536
|
askai_question = request.form["askai_question"].strip()
|
|
5495
|
-
if df is not None:
|
|
5537
|
+
if df is not None:
|
|
5538
|
+
CLEANED_FOLDER = str(selected_dataset).split(".")[0] + "_preprocessed"
|
|
5539
|
+
cleaned_path = os.path.join(DATA_FOLDER, CLEANED_FOLDER, "cleaned_df.csv")
|
|
5540
|
+
if os.path.exists(cleaned_path):
|
|
5541
|
+
df = pd.read_csv(cleaned_path, low_memory=False)
|
|
5542
|
+
else:
|
|
5543
|
+
from syntaxmatrix.dataset_preprocessing import ensure_cleaned_df
|
|
5544
|
+
df = ensure_cleaned_df(DATA_FOLDER, CLEANED_FOLDER, df) # writes cleaned_df.csv
|
|
5545
|
+
|
|
5546
|
+
# Build lightweight context
|
|
5547
|
+
columns_summary = ", ".join(df.columns.tolist())
|
|
5548
|
+
dataset_context = f"columns: {columns_summary}"
|
|
5549
|
+
dataset_profile = f"modality: tabular; columns: {columns_summary}"
|
|
5550
|
+
|
|
5551
|
+
refined_question = refine_question_agent(askai_question, dataset_context)
|
|
5552
|
+
tags = classify_ml_job_agent(refined_question, dataset_profile)
|
|
5553
|
+
|
|
5554
|
+
ai_code = smx.ai_generate_code(refined_question, tags, df)
|
|
5555
|
+
llm_usage = smx.get_last_llm_usage()
|
|
5556
|
+
ai_code = auto_inject_template(ai_code, tags, df)
|
|
5557
|
+
|
|
5558
|
+
# --- 1) Strip dotenv ASAP (kill imports, %magics, !pip) ---
|
|
5559
|
+
ctx = {
|
|
5560
|
+
"question": refined_question,
|
|
5561
|
+
"df_columns": list(df.columns),
|
|
5562
|
+
}
|
|
5563
|
+
ai_code = ToolRunner(EARLY_SANITIZERS).run(ai_code, ctx) # dotenv first
|
|
5496
5564
|
|
|
5497
|
-
|
|
5498
|
-
intent = classify_ml_job(refined_question)
|
|
5499
|
-
ai_code = smx.ai_generate_code(refined_question, intent, df)
|
|
5500
|
-
ai_code = auto_inject_template(ai_code, intent, df)
|
|
5565
|
+
# --- 2) Domain/Plotting patches ---
|
|
5501
5566
|
ai_code = fix_scatter_and_summary(ai_code)
|
|
5502
5567
|
ai_code = fix_importance_groupby(ai_code)
|
|
5503
5568
|
ai_code = inject_auto_preprocessing(ai_code)
|
|
5504
5569
|
ai_code = patch_plot_code(ai_code, df, refined_question)
|
|
5570
|
+
ai_code = ensure_matplotlib_title(ai_code)
|
|
5571
|
+
ai_code = patch_pie_chart(ai_code, df, refined_question)
|
|
5505
5572
|
ai_code = patch_pairplot(ai_code, df)
|
|
5573
|
+
ai_code = fix_seaborn_boxplot_nameerror(ai_code)
|
|
5574
|
+
ai_code = fix_seaborn_barplot_nameerror(ai_code)
|
|
5506
5575
|
ai_code = get_plotting_imports(ai_code)
|
|
5507
|
-
ai_code =
|
|
5508
|
-
ai_code =
|
|
5576
|
+
ai_code = patch_prefix_seaborn_calls(ai_code)
|
|
5577
|
+
ai_code = patch_fix_sentinel_plot_calls(ai_code)
|
|
5578
|
+
ai_code = patch_ensure_seaborn_import(ai_code)
|
|
5579
|
+
ai_code = patch_rmse_calls(ai_code)
|
|
5580
|
+
ai_code = patch_fix_seaborn_palette_calls(ai_code)
|
|
5581
|
+
ai_code = patch_quiet_specific_warnings(ai_code)
|
|
5582
|
+
ai_code = clean_llm_code(ai_code)
|
|
5583
|
+
ai_code = ensure_image_output(ai_code)
|
|
5509
5584
|
ai_code = ensure_accuracy_block(ai_code)
|
|
5510
5585
|
ai_code = ensure_output(ai_code)
|
|
5511
5586
|
ai_code = fix_plain_prints(ai_code)
|
|
5512
|
-
ai_code = fix_print_html(ai_code)
|
|
5587
|
+
ai_code = fix_print_html(ai_code)
|
|
5513
5588
|
ai_code = fix_to_datetime_errors(ai_code)
|
|
5589
|
+
|
|
5590
|
+
# --- 3-4) Global syntax/data fixers (must run AFTER patches, BEFORE final repair) ---
|
|
5591
|
+
ai_code = ToolRunner(SYNTAX_AND_REPAIR).run(ai_code, ctx)
|
|
5592
|
+
|
|
5593
|
+
# # --- 4) Final catch-all repair (run LAST) ---
|
|
5514
5594
|
ai_code = _smx_repair_python_cell(ai_code)
|
|
5595
|
+
ai_code = harden_ai_code(ai_code)
|
|
5596
|
+
ai_code = drop_bad_classification_metrics(ai_code, df)
|
|
5597
|
+
ai_code = patch_fix_sentinel_plot_calls(ai_code)
|
|
5515
5598
|
|
|
5516
|
-
|
|
5599
|
+
# Always make sure 'df' is in the kernel before running user code
|
|
5517
5600
|
df_init_code = (
|
|
5518
5601
|
f"import pandas as pd\n"
|
|
5519
|
-
f"df = pd.read_csv(r'''{os.path.join(
|
|
5602
|
+
f"df = pd.read_csv(r'''{os.path.join(cleaned_path)}''')"
|
|
5520
5603
|
)
|
|
5604
|
+
|
|
5521
5605
|
execute_code_in_kernel(kc, df_init_code)
|
|
5522
5606
|
|
|
5523
5607
|
outputs, errors = execute_code_in_kernel(kc, ai_code)
|
|
@@ -5530,7 +5614,6 @@ def setup_routes(smx):
|
|
|
5530
5614
|
build_display_summary, phrase_commentary_vision, wrap_html
|
|
5531
5615
|
)
|
|
5532
5616
|
|
|
5533
|
-
|
|
5534
5617
|
# Probe axes/labels/legend
|
|
5535
5618
|
probe1_out, probe1_err = execute_code_in_kernel(kc, MPL_PROBE_SNIPPET)
|
|
5536
5619
|
axes_info = parse_mpl_probe_output([str(x) for x in (probe1_out + probe1_err)])
|
|
@@ -5547,7 +5630,7 @@ def setup_routes(smx):
|
|
|
5547
5630
|
################################################################
|
|
5548
5631
|
|
|
5549
5632
|
# ----- Build a single HTML with Result + Commentary + AI Code ----------
|
|
5550
|
-
_buf_out, _buf_err =
|
|
5633
|
+
_buf_out, _buf_err = _std_io.StringIO(), _std_io.StringIO()
|
|
5551
5634
|
with contextlib.redirect_stdout(_buf_out), contextlib.redirect_stderr(_buf_err):
|
|
5552
5635
|
# Exact result blocks (already cleaned by kernel_manager)
|
|
5553
5636
|
result_html = rendered_html if rendered_html.strip() else "<pre>No output.</pre>"
|
|
@@ -5594,79 +5677,797 @@ def setup_routes(smx):
|
|
|
5594
5677
|
"""
|
|
5595
5678
|
ai_outputs.append(Markup(dl_html))
|
|
5596
5679
|
|
|
5597
|
-
################################################################
|
|
5598
|
-
|
|
5599
|
-
|
|
5600
5680
|
# --- EDA/static cells ---
|
|
5681
|
+
# Display helper: coerce integer-like float columns to Int64 just for rendering
|
|
5682
|
+
def _coerce_intlike_for_display(df_in: pd.DataFrame, per_cell: bool = False, eps: float = 1e-9) -> pd.DataFrame:
|
|
5683
|
+
import numpy as np
|
|
5684
|
+
out = df_in.copy()
|
|
5685
|
+
if per_cell:
|
|
5686
|
+
def _maybe(v):
|
|
5687
|
+
try:
|
|
5688
|
+
fv = float(v)
|
|
5689
|
+
except Exception:
|
|
5690
|
+
return v
|
|
5691
|
+
if pd.notnull(v) and np.isfinite(fv) and abs(fv - round(fv)) <= eps:
|
|
5692
|
+
return int(round(fv))
|
|
5693
|
+
return v
|
|
5694
|
+
return out.applymap(_maybe)
|
|
5695
|
+
# column-wise mode (original behaviour for previews)
|
|
5696
|
+
for c in out.columns:
|
|
5697
|
+
s = out[c]
|
|
5698
|
+
if pd.api.types.is_float_dtype(s):
|
|
5699
|
+
vals = s.dropna().to_numpy()
|
|
5700
|
+
if vals.size and np.isfinite(vals).all() and np.allclose(vals, np.round(vals), rtol=0, atol=eps):
|
|
5701
|
+
out[c] = s.round().astype("Int64")
|
|
5702
|
+
return out
|
|
5703
|
+
|
|
5601
5704
|
data_cells = []
|
|
5705
|
+
max_rows = 5000
|
|
5706
|
+
max_cols = 80
|
|
5602
5707
|
if df is not None:
|
|
5603
|
-
|
|
5604
|
-
ds = selected_dataset.replace("_"," ").replace(".csv","").capitalize()
|
|
5708
|
+
df = eda_df
|
|
5709
|
+
ds = (selected_dataset or "").replace("_", " ").replace(".csv", "").capitalize()
|
|
5710
|
+
|
|
5711
|
+
# 1) Dataset Overview (stat cards)
|
|
5712
|
+
rows, cols = df.shape
|
|
5713
|
+
mem_bytes = int(df.memory_usage(deep=True).sum())
|
|
5714
|
+
mem_mb = round(mem_bytes / (1024 * 1024), 2)
|
|
5715
|
+
dup_rows = int(df.duplicated().sum())
|
|
5716
|
+
nunique_all = df.nunique(dropna=False)
|
|
5717
|
+
|
|
5718
|
+
n = max(rows, 1)
|
|
5719
|
+
dtypes = df.dtypes.astype(str)
|
|
5720
|
+
nonnull = df.notnull().sum()
|
|
5721
|
+
miss_pct = (df.isnull().mean() * 100).round(1)
|
|
5722
|
+
uniques = df.nunique(dropna=True)
|
|
5723
|
+
uniq_ratio = (uniques / n).fillna(0.0)
|
|
5724
|
+
|
|
5725
|
+
id_like, hi_card, consts, flags_col = [], [], [], []
|
|
5726
|
+
for c in df.columns:
|
|
5727
|
+
flags = []
|
|
5728
|
+
if uniques.get(c, 0) <= 1:
|
|
5729
|
+
flags.append("constant"); consts.append(c)
|
|
5730
|
+
if uniq_ratio.get(c, 0) >= 0.95 and "datetime" not in dtypes[c].lower():
|
|
5731
|
+
flags.append("id-like"); id_like.append(c)
|
|
5732
|
+
if dtypes[c].startswith("object") and uniq_ratio.get(c, 0) > 0.5 and c not in id_like:
|
|
5733
|
+
flags.append("high-card"); hi_card.append(c)
|
|
5734
|
+
flags_col.append(", ".join(flags))
|
|
5735
|
+
|
|
5736
|
+
_stats_code = (
|
|
5737
|
+
"rows, cols = df.shape\n"
|
|
5738
|
+
"mem_bytes = int(df.memory_usage(deep=True).sum())\n"
|
|
5739
|
+
"mem_mb = round(mem_bytes / (1024*1024), 2)\n"
|
|
5740
|
+
)
|
|
5741
|
+
|
|
5742
|
+
_stats_html = f"""
|
|
5743
|
+
<style>
|
|
5744
|
+
.smx-statwrap{{display:grid;grid-template-columns:repeat(auto-fit,minmax(140px,1fr));gap:10px}}
|
|
5745
|
+
.smx-stat{{background:#fff;border:1px solid #e5e7eb;border-radius:10px;padding:10px 12px;text-align:center}}
|
|
5746
|
+
.smx-stat h4{{margin:0 0 4px;font-size:.9rem}}
|
|
5747
|
+
.smx-stat div{{font-weight:700;font-size:1.05rem}}
|
|
5748
|
+
</style>
|
|
5749
|
+
<div class="smx-statwrap">
|
|
5750
|
+
<div class="smx-stat"><h4>Rows</h4><div>{rows:,}</div></div>
|
|
5751
|
+
<div class="smx-stat"><h4>Columns</h4><div>{cols:,}</div></div>
|
|
5752
|
+
<div class="smx-stat"><h4>Memory (MB)</h4><div>{mem_mb}</div></div>
|
|
5753
|
+
</div>
|
|
5754
|
+
"""
|
|
5605
5755
|
data_cells.append({
|
|
5606
|
-
"title": f"{ds}
|
|
5607
|
-
"output":
|
|
5608
|
-
"code":
|
|
5756
|
+
"title": f"{ds} Overview",
|
|
5757
|
+
"output": Markup(_stats_html),
|
|
5758
|
+
"code": _stats_code,
|
|
5759
|
+
"span":"eda-col-8"
|
|
5609
5760
|
})
|
|
5761
|
+
|
|
5762
|
+
# 2) Integrity Notes — with "Show all" toggle
|
|
5763
|
+
notes = []
|
|
5764
|
+
if id_like:
|
|
5765
|
+
notes.append(f"ID-like columns: {', '.join(map(str, id_like[:6]))}{'…' if len(id_like)>6 else ''}")
|
|
5766
|
+
if hi_card:
|
|
5767
|
+
notes.append(f"High-cardinality categoricals: {', '.join(map(str, hi_card[:6]))}{'…' if len(hi_card)>6 else ''}")
|
|
5768
|
+
if consts:
|
|
5769
|
+
notes.append(f"Constant columns: {', '.join(map(str, consts[:6]))}{'…' if len(consts)>6 else ''}")
|
|
5770
|
+
|
|
5771
|
+
# Build full flagged table
|
|
5772
|
+
flag_rows = []
|
|
5773
|
+
for c in df.columns:
|
|
5774
|
+
f = []
|
|
5775
|
+
if c in id_like: f.append("id-like")
|
|
5776
|
+
if c in hi_card: f.append("high-card")
|
|
5777
|
+
if c in consts: f.append("constant")
|
|
5778
|
+
if f:
|
|
5779
|
+
flag_rows.append({
|
|
5780
|
+
"Column": c,
|
|
5781
|
+
"Flags": ", ".join(f),
|
|
5782
|
+
"Type": dtypes[c],
|
|
5783
|
+
"Unique Values": int(uniques.get(c, 0)),
|
|
5784
|
+
"Unique Ratio": float(uniq_ratio.get(c, 0)),
|
|
5785
|
+
"Missing (%)": float(miss_pct.get(c, 0)),
|
|
5786
|
+
})
|
|
5787
|
+
flagged_df = pd.DataFrame(flag_rows)
|
|
5788
|
+
flagged_df = flagged_df.sort_values(["Flags","Column"]) if not flagged_df.empty else flagged_df
|
|
5789
|
+
|
|
5790
|
+
# Render notes + toggle
|
|
5791
|
+
notes_html = (
|
|
5792
|
+
"<ul style='margin:0;padding-left:18px;'>" +
|
|
5793
|
+
"".join([f"<li>{n}</li>" for n in notes]) +
|
|
5794
|
+
"</ul>"
|
|
5795
|
+
) if notes else "<em>No obvious integrity flags.</em>"
|
|
5796
|
+
|
|
5797
|
+
if not flagged_df.empty:
|
|
5798
|
+
table_html = datatable_box(flagged_df)
|
|
5799
|
+
body_html = (
|
|
5800
|
+
notes_html +
|
|
5801
|
+
f"<details style='margin-top:8px;'><summary>Show all flagged columns ({len(flagged_df)})</summary>"
|
|
5802
|
+
f"<div style='margin-top:8px;'>{table_html}</div></details>"
|
|
5803
|
+
)
|
|
5804
|
+
else:
|
|
5805
|
+
body_html = notes_html
|
|
5806
|
+
|
|
5807
|
+
data_cells.append({
|
|
5808
|
+
"title": "Integrity Notes",
|
|
5809
|
+
"output": Markup(body_html),
|
|
5810
|
+
"code": (
|
|
5811
|
+
"# Build Integrity Notes lists and full flagged table\n"
|
|
5812
|
+
"flag_rows = []\n"
|
|
5813
|
+
"for c in df.columns:\n"
|
|
5814
|
+
" f = []\n"
|
|
5815
|
+
" if c in id_like: f.append('id-like')\n"
|
|
5816
|
+
" if c in hi_card: f.append('high-card')\n"
|
|
5817
|
+
" if c in consts: f.append('constant')\n"
|
|
5818
|
+
" if f:\n"
|
|
5819
|
+
" flag_rows.append({\n"
|
|
5820
|
+
" 'Column': c,\n"
|
|
5821
|
+
" 'Flags': ', '.join(f),\n"
|
|
5822
|
+
" 'Type': dtypes[c],\n"
|
|
5823
|
+
" 'Unique Values': int(uniques.get(c,0)),\n"
|
|
5824
|
+
" 'Unique Ratio': float(uniq_ratio.get(c,0)),\n"
|
|
5825
|
+
" 'Missing (%)': float(miss_pct.get(c,0))\n"
|
|
5826
|
+
" })\n"
|
|
5827
|
+
"flagged_df = pd.DataFrame(flag_rows)\n"
|
|
5828
|
+
"flagged_df"
|
|
5829
|
+
),
|
|
5830
|
+
"span":"eda-col-4"
|
|
5831
|
+
})
|
|
5832
|
+
|
|
5833
|
+
# 3) Data Preview
|
|
5610
5834
|
preview_cols = df.columns
|
|
5835
|
+
preview_df = _coerce_intlike_for_display(df[preview_cols].head(8))
|
|
5611
5836
|
data_cells.append({
|
|
5612
5837
|
"title": "Data Preview",
|
|
5613
|
-
"output": Markup(datatable_box(
|
|
5614
|
-
"code": f"df[{list(preview_cols)}].head(8)"
|
|
5838
|
+
"output": Markup(datatable_box(preview_df)),
|
|
5839
|
+
"code": f"df[{list(preview_cols)}].head(8)",
|
|
5840
|
+
"span": "eda-col-6"
|
|
5615
5841
|
})
|
|
5842
|
+
|
|
5843
|
+
# 4) Summary Statistics
|
|
5844
|
+
summary_cols = df.columns
|
|
5845
|
+
summary_df = _coerce_intlike_for_display(df[summary_cols].describe())
|
|
5616
5846
|
data_cells.append({
|
|
5617
5847
|
"title": "Summary Statistics",
|
|
5618
|
-
"output": Markup(datatable_box(
|
|
5619
|
-
"code": "df.describe()"
|
|
5848
|
+
"output": Markup(datatable_box(summary_df)),
|
|
5849
|
+
"code": f"df[{list(summary_cols)}].describe()",
|
|
5850
|
+
"span": "eda-col-6"
|
|
5851
|
+
})
|
|
5852
|
+
|
|
5853
|
+
# 5) Column Profile
|
|
5854
|
+
def _sample_vals(s, k=3):
|
|
5855
|
+
try:
|
|
5856
|
+
vals = pd.unique(s.dropna().astype(str))[:k]
|
|
5857
|
+
return ", ".join(map(str, vals))
|
|
5858
|
+
except Exception:
|
|
5859
|
+
return ""
|
|
5860
|
+
|
|
5861
|
+
profile_df = pd.DataFrame({
|
|
5862
|
+
"Column": df.columns,
|
|
5863
|
+
"Type": dtypes.values,
|
|
5864
|
+
"Non-Null Count": nonnull.values,
|
|
5865
|
+
"Missing (%)": miss_pct.values,
|
|
5866
|
+
"Unique Values": uniques.values,
|
|
5867
|
+
"Sample Values": [ _sample_vals(df[c]) for c in df.columns ],
|
|
5868
|
+
"Flags": flags_col
|
|
5869
|
+
})
|
|
5870
|
+
data_cells.append({
|
|
5871
|
+
"title": "Column Profile",
|
|
5872
|
+
"output": Markup(datatable_box(profile_df)),
|
|
5873
|
+
"code": (
|
|
5874
|
+
"dtypes = df.dtypes.astype(str)\n"
|
|
5875
|
+
"nonnull = df.notnull().sum()\n"
|
|
5876
|
+
"miss_pct = (df.isnull().mean()*100).round(1)\n"
|
|
5877
|
+
"uniques = df.nunique(dropna=True)\n"
|
|
5878
|
+
"n = max(len(df), 1)\n"
|
|
5879
|
+
"uniq_ratio = (uniques / n).fillna(0.0)\n"
|
|
5880
|
+
"def _sample_vals(s, k=3):\n"
|
|
5881
|
+
" vals = pd.unique(s.dropna().astype(str))[:k]\n"
|
|
5882
|
+
" return ', '.join(map(str, vals)) if len(vals) else ''\n"
|
|
5883
|
+
"flags_col = []\n"
|
|
5884
|
+
"for c in df.columns:\n"
|
|
5885
|
+
" flags=[]\n"
|
|
5886
|
+
" if uniques.get(c,0) <= 1: flags.append('constant')\n"
|
|
5887
|
+
" if uniq_ratio.get(c,0) >= 0.95 and 'datetime' not in dtypes[c].lower(): flags.append('id-like')\n"
|
|
5888
|
+
" if dtypes[c].startswith('object') and uniq_ratio.get(c,0) > 0.5 and 'id-like' not in flags: flags.append('high-card')\n"
|
|
5889
|
+
" flags_col.append(', '.join(flags))\n"
|
|
5890
|
+
"profile_df = pd.DataFrame({\n"
|
|
5891
|
+
" 'Column': df.columns,\n"
|
|
5892
|
+
" 'Type': dtypes.values,\n"
|
|
5893
|
+
" 'Non-Null Count': nonnull.values,\n"
|
|
5894
|
+
" 'Missing (%)': miss_pct.values,\n"
|
|
5895
|
+
" 'Unique Values': uniques.values,\n"
|
|
5896
|
+
" 'Sample Values': [ _sample_vals(df[c]) for c in df.columns ],\n"
|
|
5897
|
+
" 'Flags': flags_col\n"
|
|
5898
|
+
"})\n"
|
|
5899
|
+
"profile_df"
|
|
5900
|
+
),
|
|
5901
|
+
"span":"eda-col-6"
|
|
5620
5902
|
})
|
|
5903
|
+
|
|
5904
|
+
# 6) Column Types
|
|
5905
|
+
dtype_df = pd.DataFrame({
|
|
5906
|
+
"Column": df.columns,
|
|
5907
|
+
"Type": df.dtypes.astype(str).values,
|
|
5908
|
+
"Non-Null Count": df.notnull().sum().values,
|
|
5909
|
+
"Unique Values": df.nunique().values
|
|
5910
|
+
})
|
|
5911
|
+
data_cells.append({
|
|
5912
|
+
"title": "Column Types",
|
|
5913
|
+
"output": Markup(datatable_box(dtype_df)),
|
|
5914
|
+
"code": (
|
|
5915
|
+
"pd.DataFrame({\n"
|
|
5916
|
+
" 'Column': df.columns,\n"
|
|
5917
|
+
" 'Type': df.dtypes.astype(str).values,\n"
|
|
5918
|
+
" 'Non-Null Count': df.notnull().sum().values,\n"
|
|
5919
|
+
" 'Unique Values': df.nunique().values\n"
|
|
5920
|
+
"})"
|
|
5921
|
+
),
|
|
5922
|
+
"span":"eda-col-6"
|
|
5923
|
+
})
|
|
5924
|
+
|
|
5925
|
+
# 7) Outliers — Top 3 records (robust MAD score, capped 5k×80)
|
|
5926
|
+
try:
|
|
5927
|
+
import numpy as np
|
|
5928
|
+
|
|
5929
|
+
num_cols_all = df.select_dtypes(include="number").columns.tolist()
|
|
5930
|
+
if len(num_cols_all) >= 1:
|
|
5931
|
+
num_cols = num_cols_all[:max_cols] # use your cap (80)
|
|
5932
|
+
df_num = df[num_cols].copy()
|
|
5933
|
+
|
|
5934
|
+
# cap rows for speed (5k)
|
|
5935
|
+
if len(df_num) > max_rows:
|
|
5936
|
+
df_num = df_num.sample(max_rows, random_state=0)
|
|
5937
|
+
|
|
5938
|
+
# robust z: 0.6745 * (x - median) / MAD (MAD==0 → NaN)
|
|
5939
|
+
med = df_num.median(numeric_only=True)
|
|
5940
|
+
mad = (df_num - med).abs().median(numeric_only=True)
|
|
5941
|
+
rz = 0.6745 * (df_num - med) / mad.replace(0, np.nan)
|
|
5942
|
+
|
|
5943
|
+
abs_rz = rz.abs()
|
|
5944
|
+
row_score = abs_rz.max(axis=1, skipna=True) # strongest dev across features
|
|
5945
|
+
top_idx = row_score.nlargest(3).index.tolist()
|
|
5946
|
+
|
|
5947
|
+
# Build compact, mobile-friendly cards for the top 3 rows
|
|
5948
|
+
cards_html = []
|
|
5949
|
+
for ridx in top_idx:
|
|
5950
|
+
# top contributing columns for this row
|
|
5951
|
+
contrib = abs_rz.loc[ridx].dropna().sort_values(ascending=False).head(5)
|
|
5952
|
+
maxv = float(contrib.iloc[0]) if len(contrib) else 0.0
|
|
5953
|
+
|
|
5954
|
+
bars = []
|
|
5955
|
+
for c, v in contrib.items():
|
|
5956
|
+
pct = 0.0 if maxv <= 0 else min(100.0, float(v) / maxv * 100.0)
|
|
5957
|
+
bars.append(f"""
|
|
5958
|
+
<div class="barrow">
|
|
5959
|
+
<span class="cname">{html.escape(str(c))}</span>
|
|
5960
|
+
<div class="bar"><div class="fill" style="width:{pct:.1f}%"></div></div>
|
|
5961
|
+
<span class="score">{v:.2f}</span>
|
|
5962
|
+
</div>
|
|
5963
|
+
""")
|
|
5964
|
+
|
|
5965
|
+
bars_html = "".join(bars) if bars else "<em>No strong single-column contributors.</em>"
|
|
5966
|
+
|
|
5967
|
+
# show the full record (all columns) with horizontal scroll
|
|
5968
|
+
row_vals = df.loc[ridx, :].to_dict()
|
|
5969
|
+
row_tbl = datatable_box(pd.DataFrame([row_vals]))
|
|
5970
|
+
|
|
5971
|
+
score_val = float(row_score.loc[ridx]) if pd.notnull(row_score.loc[ridx]) else 0.0
|
|
5972
|
+
title_idx = int(ridx) if isinstance(ridx, (int, np.integer)) else html.escape(str(ridx))
|
|
5973
|
+
|
|
5974
|
+
cards_html.append(f"""
|
|
5975
|
+
<div class="mad-card">
|
|
5976
|
+
<div class="mad-title">Row index: {title_idx} · score: {score_val:.2f}</div>
|
|
5977
|
+
<div class="mad-bars">{bars_html}</div>
|
|
5978
|
+
<div class="mad-row">{row_tbl}</div>
|
|
5979
|
+
</div>
|
|
5980
|
+
""")
|
|
5981
|
+
|
|
5982
|
+
grid_html = f"""
|
|
5983
|
+
<style>
|
|
5984
|
+
.mad-grid{{display:grid;grid-template-columns:repeat(3,minmax(0,1fr));gap:10px}}
|
|
5985
|
+
@media(max-width:1024px){{.mad-grid{{grid-template-columns:repeat(2,minmax(0,1fr))}}}}
|
|
5986
|
+
@media(max-width:640px){{.mad-grid{{grid-template-columns:repeat(1,minmax(0,1fr))}}}}
|
|
5987
|
+
.mad-card{{background:#fff;border:1px solid #e5e7eb;border-radius:10px;padding:8px 10px}}
|
|
5988
|
+
.mad-title{{font-weight:600;margin-bottom:6px}}
|
|
5989
|
+
.mad-bars .barrow{{display:grid;grid-template-columns:140px 1fr 46px;gap:6px;align-items:center;margin:4px 0}}
|
|
5990
|
+
.mad-bars .bar{{background:#eef2f7;border-radius:6px;height:8px;overflow:hidden}}
|
|
5991
|
+
.mad-bars .fill{{background:#0b8ae5;height:8px}}
|
|
5992
|
+
.mad-bars .cname{{font-size:12px;color:#444;white-space:nowrap;overflow:hidden;text-overflow:ellipsis}}
|
|
5993
|
+
.mad-bars .score{{font-size:12px;color:#333;text-align:right}}
|
|
5994
|
+
.mad-row .smx-table{{font-size:12px}}
|
|
5995
|
+
</style>
|
|
5996
|
+
<div class="mad-grid">{''.join(cards_html)}</div>
|
|
5997
|
+
"""
|
|
5998
|
+
|
|
5999
|
+
data_cells.append({
|
|
6000
|
+
"title": "Outliers — Top 3 records",
|
|
6001
|
+
"output": Markup(grid_html),
|
|
6002
|
+
"code": (
|
|
6003
|
+
"num_cols = df.select_dtypes(include='number').columns.tolist()[:max_cols]\n"
|
|
6004
|
+
"df_num = df[num_cols]\n"
|
|
6005
|
+
"df_num = df_num.sample(max_rows, random_state=0) if len(df_num) > max_rows else df_num\n"
|
|
6006
|
+
"med = df_num.median(); mad = (df_num - med).abs().median()\n"
|
|
6007
|
+
"rz = 0.6745 * (df_num - med) / mad.replace(0, np.nan)\n"
|
|
6008
|
+
"row_score = rz.abs().max(axis=1)\n"
|
|
6009
|
+
"top3 = row_score.nlargest(3)\n"
|
|
6010
|
+
),
|
|
6011
|
+
"span": "eda-col-12"
|
|
6012
|
+
})
|
|
6013
|
+
else:
|
|
6014
|
+
data_cells.append({
|
|
6015
|
+
"title": "Outliers — Top 3 records (robust MAD score)",
|
|
6016
|
+
"output": "<em>No numeric columns available.</em>",
|
|
6017
|
+
"code": "# no numeric columns",
|
|
6018
|
+
"span": "eda-col-6"
|
|
6019
|
+
})
|
|
6020
|
+
except Exception as _e:
|
|
6021
|
+
data_cells.append({
|
|
6022
|
+
"title": "Outliers — Top 3 records (robust MAD score)",
|
|
6023
|
+
"output": f"<em>Could not compute robust outliers: {html.escape(str(_e))}</em>",
|
|
6024
|
+
"code": "# error during robust outlier computation",
|
|
6025
|
+
"span": "eda-col-6"
|
|
6026
|
+
})
|
|
6027
|
+
|
|
6028
|
+
# 8) Outliers — Violin + Box (Top 3 numerics by IQR outliers, capped 5k×80)
|
|
6029
|
+
try:
|
|
6030
|
+
num_outliers = 3
|
|
6031
|
+
num_cols_all = df.select_dtypes(include="number").columns.tolist()
|
|
6032
|
+
if len(num_cols_all) >= 1:
|
|
6033
|
+
num_cols = num_cols_all[:max_cols]
|
|
6034
|
+
dfn = df[num_cols].copy()
|
|
6035
|
+
|
|
6036
|
+
# cap rows for speed (5k)
|
|
6037
|
+
if len(dfn) > max_rows:
|
|
6038
|
+
dfn = dfn.sample(max_rows, random_state=0)
|
|
6039
|
+
|
|
6040
|
+
# rank columns by number of Tukey outliers (1.5*IQR)
|
|
6041
|
+
ranks = []
|
|
6042
|
+
for c in dfn.columns:
|
|
6043
|
+
s = pd.to_numeric(dfn[c], errors="coerce").dropna()
|
|
6044
|
+
if s.empty:
|
|
6045
|
+
ranks.append((c, 0, 0.0))
|
|
6046
|
+
continue
|
|
6047
|
+
q1 = s.quantile(0.25); q3 = s.quantile(0.75)
|
|
6048
|
+
iqr = float(q3 - q1)
|
|
6049
|
+
if iqr <= 0:
|
|
6050
|
+
ranks.append((c, 0, 0.0))
|
|
6051
|
+
continue
|
|
6052
|
+
lower = q1 - 1.5 * iqr
|
|
6053
|
+
upper = q3 + 1.5 * iqr
|
|
6054
|
+
out_count = int(((s < lower) | (s > upper)).sum())
|
|
6055
|
+
ranks.append((c, out_count, float(iqr)))
|
|
6056
|
+
|
|
6057
|
+
# choose the top `num_outliers` columns (ties broken by larger IQR spread)
|
|
6058
|
+
sel_cols = [c for c, _, _ in sorted(ranks, key=lambda x: (-x[1], -x[2]))[:num_outliers]]
|
|
6059
|
+
if not sel_cols:
|
|
6060
|
+
raise ValueError("No numeric columns have spread for violin plots.")
|
|
6061
|
+
|
|
6062
|
+
# package data for JS (values only; thresholds for display)
|
|
6063
|
+
charts = []
|
|
6064
|
+
for c in sel_cols:
|
|
6065
|
+
s = pd.to_numeric(dfn[c], errors="coerce").dropna()
|
|
6066
|
+
if s.empty:
|
|
6067
|
+
continue
|
|
6068
|
+
q1 = s.quantile(0.25); q3 = s.quantile(0.75); iqr = q3 - q1
|
|
6069
|
+
lower = float(q1 - 1.5 * iqr); upper = float(q3 + 1.5 * iqr)
|
|
6070
|
+
out_count = int(((s < lower) | (s > upper)).sum())
|
|
6071
|
+
charts.append({
|
|
6072
|
+
"name": str(c),
|
|
6073
|
+
"values": [float(v) for v in s.tolist()],
|
|
6074
|
+
"lower": lower,
|
|
6075
|
+
"upper": upper,
|
|
6076
|
+
"n": int(s.size),
|
|
6077
|
+
"out": out_count
|
|
6078
|
+
})
|
|
6079
|
+
|
|
6080
|
+
container_id = f"violgrid_{uuid.uuid4().hex}"
|
|
6081
|
+
sub_divs = "\n".join([f'<div id="{container_id}_{i}" class="vplot"></div>' for i in range(len(charts))])
|
|
6082
|
+
|
|
6083
|
+
plot_html = f"""
|
|
6084
|
+
<style>
|
|
6085
|
+
/* mini-grid 3x2 → 2x? → 1x? */
|
|
6086
|
+
#{container_id}{{display:grid;grid-template-columns:repeat(3,minmax(0,1fr));gap:10px}}
|
|
6087
|
+
@media(max-width:1024px){{#{container_id}{{grid-template-columns:repeat(2,minmax(0,1fr))}}}}
|
|
6088
|
+
@media(max-width:640px){{#{container_id}{{grid-template-columns:repeat(1,minmax(0,1fr))}}}}
|
|
6089
|
+
/* each plot container – height set via JS for monotonic responsiveness */
|
|
6090
|
+
#{container_id} .vplot{{width:100%;}}
|
|
6091
|
+
</style>
|
|
6092
|
+
<div id="{container_id}">
|
|
6093
|
+
{sub_divs}
|
|
6094
|
+
</div>
|
|
6095
|
+
<script>
|
|
6096
|
+
(function(){{
|
|
6097
|
+
var charts = {json.dumps(charts)};
|
|
6098
|
+
|
|
6099
|
+
function calcHeight(el){{
|
|
6100
|
+
var w = (el && el.clientWidth) || (el && el.parentElement && el.parentElement.clientWidth) || 360;
|
|
6101
|
+
// smooth, monotone: ~0.55×width, clamped
|
|
6102
|
+
return Math.round(Math.max(220, Math.min(360, w * 0.55)));
|
|
6103
|
+
}}
|
|
6104
|
+
|
|
6105
|
+
function drawOne(target, data){{
|
|
6106
|
+
var el = document.getElementById(target);
|
|
6107
|
+
if(!el) return;
|
|
6108
|
+
var h = calcHeight(el);
|
|
6109
|
+
el.style.setProperty('height', h + 'px', 'important'); // defeat global height:auto
|
|
6110
|
+
|
|
6111
|
+
var trace = {{
|
|
6112
|
+
type: 'violin',
|
|
6113
|
+
y: data.values,
|
|
6114
|
+
name: data.name,
|
|
6115
|
+
box: {{ visible: true }},
|
|
6116
|
+
meanline: {{ visible: true }},
|
|
6117
|
+
points: 'suspectedoutliers',
|
|
6118
|
+
hovertemplate: '%{{y}}<extra></extra>',
|
|
6119
|
+
showlegend: false
|
|
6120
|
+
}};
|
|
6121
|
+
|
|
6122
|
+
var layout = {{
|
|
6123
|
+
margin: {{ l: 40, r: 10, t: 26, b: 28 }},
|
|
6124
|
+
title: {{ text: data.name + ' (n=' + data.n + ', out=' + data.out + ')', font: {{ size: 12 }} }},
|
|
6125
|
+
yaxis: {{ automargin: true }}
|
|
6126
|
+
}};
|
|
6127
|
+
|
|
6128
|
+
var config = {{ displayModeBar: true, responsive: true }};
|
|
6129
|
+
if(window.Plotly && Plotly.newPlot){{
|
|
6130
|
+
Plotly.newPlot(el, [trace], layout, config).then(function(){{
|
|
6131
|
+
if(Plotly.Plots && Plotly.Plots.resize) Plotly.Plots.resize(el);
|
|
6132
|
+
}});
|
|
6133
|
+
}} else {{
|
|
6134
|
+
var p=document.createElement('div'); p.style.color='crimson'; p.style.marginTop='8px';
|
|
6135
|
+
p.textContent='Plotly is not loaded.'; el.appendChild(p);
|
|
6136
|
+
}}
|
|
6137
|
+
}}
|
|
6138
|
+
|
|
6139
|
+
function drawAll(){{
|
|
6140
|
+
for(var i=0;i<charts.length;i++) drawOne("{container_id}_" + i, charts[i]);
|
|
6141
|
+
}}
|
|
6142
|
+
drawAll();
|
|
6143
|
+
window.addEventListener('resize', drawAll);
|
|
6144
|
+
}})();
|
|
6145
|
+
</script>
|
|
6146
|
+
"""
|
|
6147
|
+
|
|
6148
|
+
data_cells.append({
|
|
6149
|
+
"title": "Outliers — Violin + Box (Top 3 numerics by IQR outliers)",
|
|
6150
|
+
"output": Markup(plot_html),
|
|
6151
|
+
"code": (
|
|
6152
|
+
"dfn = df.select_dtypes(include='number').iloc[:, :max_cols]\n"
|
|
6153
|
+
"dfn = dfn.sample(max_rows, random_state=0) if len(dfn) > max_rows else dfn\n"
|
|
6154
|
+
"# rank columns by Tukey outliers (1.5*IQR) and plot violins with inner box"
|
|
6155
|
+
),
|
|
6156
|
+
"span": "eda-col-12"
|
|
6157
|
+
})
|
|
6158
|
+
|
|
6159
|
+
else:
|
|
6160
|
+
data_cells.append({
|
|
6161
|
+
"title": "Outliers — Violin + Box",
|
|
6162
|
+
"output": "<em>No numeric columns available.</em>",
|
|
6163
|
+
"code": "# no numeric columns",
|
|
6164
|
+
"span": "eda-col-6"
|
|
6165
|
+
})
|
|
6166
|
+
except Exception as _e:
|
|
6167
|
+
data_cells.append({
|
|
6168
|
+
"title": "Outliers — Violin + Box",
|
|
6169
|
+
"output": f"<em>Could not render violins: {html.escape(str(_e))}</em>",
|
|
6170
|
+
"code": "# error during violin rendering",
|
|
6171
|
+
"span": "eda-col-6"
|
|
6172
|
+
})
|
|
6173
|
+
|
|
6174
|
+
# 9) Missing Values table
|
|
5621
6175
|
nulls = df.isnull().sum()
|
|
5622
6176
|
nulls_pct = (df.isnull().mean() * 100).round(1)
|
|
5623
6177
|
missing_df = pd.DataFrame({
|
|
5624
|
-
"
|
|
5625
|
-
"Missing
|
|
6178
|
+
"Column": df.columns,
|
|
6179
|
+
"Missing Values": nulls.values,
|
|
6180
|
+
"Missing (%)": nulls_pct.values
|
|
5626
6181
|
})
|
|
5627
|
-
missing = missing_df[missing_df["Missing Values"] > 0]
|
|
6182
|
+
missing = missing_df[missing_df["Missing Values"] > 0]
|
|
5628
6183
|
data_cells.append({
|
|
5629
6184
|
"title": "Missing Values",
|
|
5630
6185
|
"output": Markup(datatable_box(missing)) if not missing.empty else "<em>No missing values detected.</em>",
|
|
5631
6186
|
"code": (
|
|
5632
6187
|
"nulls = df.isnull().sum()\n"
|
|
5633
6188
|
"nulls_pct = (df.isnull().mean() * 100).round(1)\n"
|
|
5634
|
-
"missing_df = pd.DataFrame({
|
|
6189
|
+
"missing_df = pd.DataFrame({\n"
|
|
6190
|
+
" 'Column': df.columns,\n"
|
|
6191
|
+
" 'Missing Values': nulls.values,\n"
|
|
6192
|
+
" 'Missing (%)': nulls_pct.values\n"
|
|
6193
|
+
"})\\n"
|
|
5635
6194
|
"missing_df[missing_df['Missing Values'] > 0]"
|
|
5636
|
-
)
|
|
6195
|
+
),
|
|
6196
|
+
"span":"eda-col-4"
|
|
5637
6197
|
})
|
|
5638
|
-
dtype_df = pd.DataFrame({
|
|
5639
|
-
"Type": df.dtypes.astype(str),
|
|
5640
|
-
"Non-Null Count": df.notnull().sum(),
|
|
5641
|
-
"Unique Values": df.nunique()
|
|
5642
|
-
})
|
|
5643
|
-
data_cells.append({
|
|
5644
|
-
"title": "Column Types",
|
|
5645
6198
|
|
|
5646
|
-
|
|
5647
|
-
|
|
5648
|
-
|
|
5649
|
-
"
|
|
5650
|
-
"
|
|
5651
|
-
"
|
|
5652
|
-
|
|
6199
|
+
# 10) Missingness (Top 20) – Plotly bar chart
|
|
6200
|
+
if not missing.empty:
|
|
6201
|
+
top_miss = (
|
|
6202
|
+
missing_df[missing_df["Missing Values"] > 0]
|
|
6203
|
+
.sort_values("Missing (%)", ascending=False)
|
|
6204
|
+
.loc[:, ["Column", "Missing (%)"]]
|
|
6205
|
+
.head(20)
|
|
6206
|
+
.reset_index(drop=True)
|
|
5653
6207
|
)
|
|
5654
|
-
|
|
6208
|
+
|
|
6209
|
+
container_id = f"miss_plot_{uuid.uuid4().hex}"
|
|
6210
|
+
x_vals = [html.escape(str(c)) for c in top_miss["Column"].tolist()]
|
|
6211
|
+
y_vals = [float(v) for v in top_miss["Missing (%)"].tolist()]
|
|
6212
|
+
|
|
6213
|
+
plot_html = f"""
|
|
6214
|
+
<div id="{container_id}" style="width:100%;height:340px;"></div>
|
|
6215
|
+
<script>
|
|
6216
|
+
(function(){{
|
|
6217
|
+
var x = {json.dumps(x_vals)};
|
|
6218
|
+
var y = {json.dumps(y_vals)};
|
|
6219
|
+
var data = [{{
|
|
6220
|
+
type: 'bar',
|
|
6221
|
+
x: x,
|
|
6222
|
+
y: y,
|
|
6223
|
+
hovertemplate: '%{{x}}<br>Missing: %{{y:.1f}}%<extra></extra>'
|
|
6224
|
+
}}];
|
|
6225
|
+
var layout = {{
|
|
6226
|
+
margin: {{l:50, r:20, t:10, b:100}},
|
|
6227
|
+
yaxis: {{ title: 'Missing (%)', rangemode: 'tozero' }},
|
|
6228
|
+
xaxis: {{ title: 'Column', tickangle: -45 }}
|
|
6229
|
+
}};
|
|
6230
|
+
if (window.Plotly && Plotly.newPlot) {{
|
|
6231
|
+
Plotly.newPlot("{container_id}", data, layout, {{displayModeBar:true, responsive:true}});
|
|
6232
|
+
}} else {{
|
|
6233
|
+
var p=document.createElement('div'); p.style.color='crimson'; p.style.marginTop='8px';
|
|
6234
|
+
p.textContent='Plotly is not loaded.'; document.getElementById("{container_id}").appendChild(p);
|
|
6235
|
+
}}
|
|
6236
|
+
}})();
|
|
6237
|
+
</script>
|
|
6238
|
+
"""
|
|
6239
|
+
data_cells.append({
|
|
6240
|
+
"title": "Missingness (Top 20)",
|
|
6241
|
+
"output": Markup(plot_html),
|
|
6242
|
+
"code": (
|
|
6243
|
+
"nulls = df.isnull().sum();\n"
|
|
6244
|
+
"nulls_pct = (\n"
|
|
6245
|
+
" df.isnull().mean()*100\n"
|
|
6246
|
+
").round(1)\n"
|
|
6247
|
+
"missing_df = pd.DataFrame({\n"
|
|
6248
|
+
" 'Column': df.columns,\n"
|
|
6249
|
+
" 'Missing Values': nulls.values,\n"
|
|
6250
|
+
" 'Missing (%)': nulls_pct.values\n"
|
|
6251
|
+
"})\n\n"
|
|
6252
|
+
"top_miss = (\n"
|
|
6253
|
+
" missing_df[missing_df['Missing Values'] > 0]\n"
|
|
6254
|
+
" .sort_values('Missing (%)', ascending=False)\n"
|
|
6255
|
+
" .loc[:, ['Column', 'Missing (%)']]\n"
|
|
6256
|
+
" .head(20)\n"
|
|
6257
|
+
" .reset_index(drop=True)\n"
|
|
6258
|
+
")\n"
|
|
6259
|
+
"top_miss"
|
|
6260
|
+
),
|
|
6261
|
+
"span":"eda-col-4"
|
|
6262
|
+
})
|
|
6263
|
+
|
|
6264
|
+
# 11) Category Distribution — 3D doughnut (dataset-agnostic, capped 5k)
|
|
6265
|
+
try:
|
|
6266
|
+
# 1) Column universe: object / category / bool (integers remain numeric)
|
|
6267
|
+
cat_cols_all = df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
|
|
6268
|
+
|
|
6269
|
+
# 2) Honour user pick if categorical; otherwise auto-pick a sensible default
|
|
6270
|
+
dist_param = (request.args.get("dist") or request.form.get("dist") or "").strip()
|
|
6271
|
+
if dist_param and dist_param in cat_cols_all:
|
|
6272
|
+
dist_col = dist_param
|
|
6273
|
+
else:
|
|
6274
|
+
# Auto-pick preference: 3–20 unique values excluding obvious ID-like;
|
|
6275
|
+
# else allow 2-level; else first categorical.
|
|
6276
|
+
n_total = len(df)
|
|
6277
|
+
uniques_loc = df.nunique(dropna=True)
|
|
6278
|
+
miss_pct_loc = (df.isnull().mean() * 100).round(1)
|
|
6279
|
+
id_like_loc = {c for c in cat_cols_all if n_total > 0 and (uniques_loc.get(c, 0) / n_total) >= 0.95}
|
|
6280
|
+
|
|
6281
|
+
multilevel = [c for c in cat_cols_all
|
|
6282
|
+
if (3 <= int(uniques_loc.get(c, df[c].nunique(dropna=True))) <= 20)
|
|
6283
|
+
and (c not in id_like_loc)]
|
|
6284
|
+
if multilevel:
|
|
6285
|
+
# score: prefer a level count close to 8 and lower missingness
|
|
6286
|
+
best, best_score = "", -1e9
|
|
6287
|
+
for c in multilevel:
|
|
6288
|
+
k = int(uniques_loc.get(c, df[c].nunique(dropna=True)))
|
|
6289
|
+
miss = float(miss_pct_loc.get(c, (df[c].isna().mean() * 100)))
|
|
6290
|
+
score = -abs(k - 8) - (miss / 10.0)
|
|
6291
|
+
if score > best_score:
|
|
6292
|
+
best, best_score = c, score
|
|
6293
|
+
dist_col = best
|
|
6294
|
+
else:
|
|
6295
|
+
twolevel = [c for c in cat_cols_all if int(uniques_loc.get(c, df[c].nunique(dropna=True))) == 2]
|
|
6296
|
+
dist_col = (twolevel[0] if twolevel else (cat_cols_all[0] if cat_cols_all else ""))
|
|
6297
|
+
|
|
6298
|
+
# 3) Build options AFTER dist_col is final (so selection sticks)
|
|
6299
|
+
opts = []
|
|
6300
|
+
for c in cat_cols_all:
|
|
6301
|
+
sel = " selected" if c == dist_col else ""
|
|
6302
|
+
opts.append(f'<option value="{html.escape(str(c))}"{sel}>{html.escape(str(c))}</option>')
|
|
6303
|
+
opts_html = "\n".join(opts)
|
|
6304
|
+
|
|
6305
|
+
form_html = f"""
|
|
6306
|
+
<a id="dist3d"></a>
|
|
6307
|
+
<form method="get" action="/dashboard#dist3d"
|
|
6308
|
+
style="display:flex; flex-wrap:wrap; gap:10px; align-items:center; margin-bottom:8px;">
|
|
6309
|
+
<input type="hidden" name="section" value="explore">
|
|
6310
|
+
<input type="hidden" name="dataset" value="{html.escape(str(selected_dataset or ''))}">
|
|
6311
|
+
<label><strong>Distribution column:</strong></label>
|
|
6312
|
+
<select name="dist" onchange="this.form.submit()" style="min-width:200px; height:28px;">
|
|
6313
|
+
{opts_html}
|
|
6314
|
+
</select>
|
|
6315
|
+
</form>
|
|
6316
|
+
"""
|
|
6317
|
+
|
|
6318
|
+
if dist_col:
|
|
6319
|
+
s = df[dist_col]
|
|
6320
|
+
# sample at most 5k rows so that counting stays cheap
|
|
6321
|
+
if len(s) > 5000:
|
|
6322
|
+
s = s.sample(5000, random_state=0)
|
|
6323
|
+
|
|
6324
|
+
# 4) Robust counting: treat NaN as "Missing", stringify labels for safety
|
|
6325
|
+
s = s.astype("object")
|
|
6326
|
+
s = s.where(~s.isna(), other="Missing")
|
|
6327
|
+
vc = s.value_counts(dropna=False)
|
|
6328
|
+
|
|
6329
|
+
if vc.empty:
|
|
6330
|
+
raise ValueError("No values to display for the selected column.")
|
|
6331
|
+
|
|
6332
|
+
# Top-8 + 'Other' (excluding 'Missing' which we keep separate)
|
|
6333
|
+
top_k = 8
|
|
6334
|
+
non_missing = vc.drop(index=["Missing"], errors="ignore") if "Missing" in vc.index else vc
|
|
6335
|
+
head = non_missing.sort_values(ascending=False).head(top_k)
|
|
6336
|
+
other = int(non_missing.iloc[top_k:].sum()) if len(non_missing) > top_k else 0
|
|
6337
|
+
miss = int(vc.get("Missing", 0))
|
|
6338
|
+
|
|
6339
|
+
labels = [str(x) for x in head.index.tolist()]
|
|
6340
|
+
values = [int(v) for v in head.values.tolist()]
|
|
6341
|
+
if other > 0:
|
|
6342
|
+
labels.append("Other"); values.append(other)
|
|
6343
|
+
if miss > 0:
|
|
6344
|
+
labels.append("Missing"); values.append(miss)
|
|
6345
|
+
|
|
6346
|
+
# colours for faux 3D (no external deps)
|
|
6347
|
+
k = len(labels)
|
|
6348
|
+
def _hsl(i, n, l=0.58, s=0.62):
|
|
6349
|
+
h = (i / max(1, n)) * 360.0
|
|
6350
|
+
return f"hsl({int(h)}, {int(s*100)}%, {int(l*100)}%)"
|
|
6351
|
+
top_colors = [_hsl(i, k, l=0.58) for i in range(k)]
|
|
6352
|
+
base_colors = [_hsl(i, k, l=0.40) for i in range(k)]
|
|
6353
|
+
|
|
6354
|
+
container_id = f"dist3d_{uuid.uuid4().hex}"
|
|
6355
|
+
total = int(sum(values))
|
|
6356
|
+
|
|
6357
|
+
plot_html = f"""
|
|
6358
|
+
<div id="{container_id}" class="dist3d-chart"></div>
|
|
6359
|
+
<script>
|
|
6360
|
+
(function(){{
|
|
6361
|
+
var el = document.getElementById("{container_id}");
|
|
6362
|
+
var labels = {json.dumps(labels)};
|
|
6363
|
+
var values = {json.dumps(values)};
|
|
6364
|
+
var total = {total};
|
|
6365
|
+
|
|
6366
|
+
var base = {{
|
|
6367
|
+
type: 'pie', labels: labels, values: values,
|
|
6368
|
+
hole: 0.64, sort: false, textinfo: 'none', hoverinfo: 'skip',
|
|
6369
|
+
marker: {{ colors: {json.dumps(base_colors)} }},
|
|
6370
|
+
showlegend: false
|
|
6371
|
+
}};
|
|
6372
|
+
var top = {{
|
|
6373
|
+
type: 'pie', labels: labels, values: values,
|
|
6374
|
+
hole: 0.52, sort: false,
|
|
6375
|
+
textinfo: 'percent', textposition: 'inside', insidetextorientation: 'radial',
|
|
6376
|
+
hovertemplate: '%{{label}}<br>%{{value}} of {total:,} (%{{percent}})<extra></extra>',
|
|
6377
|
+
marker: {{ colors: {json.dumps(top_colors)}, line: {{ width: 1, color: 'rgba(0,0,0,0.25)' }} }},
|
|
6378
|
+
showlegend: true, legendgroup: 'dist'
|
|
6379
|
+
}};
|
|
6380
|
+
|
|
6381
|
+
function parentWidth(){{
|
|
6382
|
+
return (el && el.parentElement ? el.parentElement.clientWidth : (window.innerWidth||360));
|
|
6383
|
+
}}
|
|
6384
|
+
|
|
6385
|
+
// Smooth, monotonic: height = 0.65 * width, clamped [220, 520].
|
|
6386
|
+
function chartHeight(){{
|
|
6387
|
+
var w = parentWidth();
|
|
6388
|
+
return Math.round(Math.max(220, Math.min(520, w * 0.65)));
|
|
6389
|
+
}}
|
|
6390
|
+
|
|
6391
|
+
function legendOrientation(){{
|
|
6392
|
+
return parentWidth() < 640 ? 'h' : 'v';
|
|
6393
|
+
}}
|
|
6394
|
+
|
|
6395
|
+
function makeLayout(){{
|
|
6396
|
+
return {{
|
|
6397
|
+
margin: {{ l:10, r:10, t:10, b:10 }},
|
|
6398
|
+
legend: {{ orientation: legendOrientation(), x:1, xanchor:'right', y:1 }},
|
|
6399
|
+
uniformtext: {{ mode: 'hide', minsize: 10 }}
|
|
6400
|
+
}};
|
|
6401
|
+
}}
|
|
6402
|
+
|
|
6403
|
+
function applySize(){{
|
|
6404
|
+
// Override global .plotly-graph-div {{ height:auto !important }}
|
|
6405
|
+
el.style.setProperty('height', chartHeight() + 'px', 'important');
|
|
6406
|
+
if (window.Plotly) {{
|
|
6407
|
+
Plotly.relayout(el, {{ 'legend.orientation': legendOrientation() }});
|
|
6408
|
+
Plotly.Plots.resize(el);
|
|
6409
|
+
}}
|
|
6410
|
+
}}
|
|
6411
|
+
|
|
6412
|
+
if (window.Plotly && Plotly.newPlot) {{
|
|
6413
|
+
// Initial explicit height before draw
|
|
6414
|
+
el.style.setProperty('height', chartHeight() + 'px', 'important');
|
|
6415
|
+
Plotly.newPlot(el, [base, top], makeLayout(), {{ displayModeBar:true, responsive:true }})
|
|
6416
|
+
.then(function(){{ applySize(); }});
|
|
6417
|
+
window.addEventListener('resize', applySize);
|
|
6418
|
+
}} else {{
|
|
6419
|
+
var p=document.createElement('div'); p.style.color='crimson'; p.style.marginTop='8px';
|
|
6420
|
+
p.textContent='Plotly is not loaded.'; el.appendChild(p);
|
|
6421
|
+
}}
|
|
6422
|
+
}})();
|
|
6423
|
+
</script>
|
|
6424
|
+
"""
|
|
6425
|
+
|
|
6426
|
+
data_cells.append({
|
|
6427
|
+
"title": f"Category Distribution — ({html.escape(dist_col)})",
|
|
6428
|
+
"output": Markup(form_html + plot_html),
|
|
6429
|
+
"code": (
|
|
6430
|
+
"dist_col = '<chosen categorical>'\n"
|
|
6431
|
+
"s = df[dist_col].astype('object').where(~df[dist_col].isna(), other='Missing')\n"
|
|
6432
|
+
"vc = s.value_counts(dropna=False)\n"
|
|
6433
|
+
"top_k = 8 # Top-8 + Other (+ Missing)\n"
|
|
6434
|
+
),
|
|
6435
|
+
"span": "eda-col-4"
|
|
6436
|
+
})
|
|
6437
|
+
else:
|
|
6438
|
+
data_cells.append({
|
|
6439
|
+
"title": "Category Distribution — 3D doughnut",
|
|
6440
|
+
"output": "<em>No categorical columns found.</em>",
|
|
6441
|
+
"code": "# no categorical columns",
|
|
6442
|
+
"span": "eda-col-4"
|
|
6443
|
+
})
|
|
6444
|
+
except Exception as _e:
|
|
6445
|
+
data_cells.append({
|
|
6446
|
+
"title": "Category Distribution — 3D doughnut",
|
|
6447
|
+
"output": f"<em>Could not render distribution: {html.escape(str(_e))}</em>",
|
|
6448
|
+
"code": "# error during distribution rendering",
|
|
6449
|
+
"span": "eda-col-4"
|
|
6450
|
+
})
|
|
6451
|
+
|
|
5655
6452
|
for cell in data_cells:
|
|
5656
|
-
|
|
6453
|
+
cell["highlighted_code"] = Markup(_pygmentize(cell["code"]))
|
|
6454
|
+
|
|
5657
6455
|
highlighted_ai_code = _pygmentize(ai_code)
|
|
6456
|
+
|
|
5658
6457
|
return render_template(
|
|
5659
6458
|
"dashboard.html",
|
|
5660
6459
|
section=section,
|
|
5661
6460
|
datasets=datasets,
|
|
5662
6461
|
selected_dataset=selected_dataset,
|
|
5663
6462
|
ai_outputs=ai_outputs,
|
|
5664
|
-
ai_code=ai_code,
|
|
6463
|
+
ai_code=ai_code,
|
|
5665
6464
|
highlighted_ai_code=highlighted_ai_code if ai_code else None,
|
|
5666
6465
|
askai_question=smx.sanitize_rough_to_markdown_task(askai_question),
|
|
5667
|
-
refined_question=refined_question,
|
|
6466
|
+
refined_question=refined_question,
|
|
6467
|
+
tasks=tags,
|
|
5668
6468
|
data_cells=data_cells,
|
|
5669
6469
|
session_id=session_id,
|
|
6470
|
+
llm_usage=llm_usage
|
|
5670
6471
|
)
|
|
5671
6472
|
|
|
5672
6473
|
|
|
@@ -5677,7 +6478,7 @@ def setup_routes(smx):
|
|
|
5677
6478
|
if not html_doc:
|
|
5678
6479
|
return ("No result available.", 404)
|
|
5679
6480
|
|
|
5680
|
-
buf =
|
|
6481
|
+
buf = _std_io.BytesIO(html_doc.encode("utf-8"))
|
|
5681
6482
|
buf.seek(0)
|
|
5682
6483
|
|
|
5683
6484
|
# keep a copy if you wish, or free it:
|
|
@@ -5749,7 +6550,7 @@ def setup_routes(smx):
|
|
|
5749
6550
|
text = re.sub(r"<[^>]+>", " ", text)
|
|
5750
6551
|
text = re.sub(r"\n{3,}", "\n\n", text)
|
|
5751
6552
|
text = html.unescape(text).strip()
|
|
5752
|
-
buf =
|
|
6553
|
+
buf = _std_io.BytesIO()
|
|
5753
6554
|
doc = SimpleDocTemplate(buf, pagesize=A4, leftMargin=16*mm, rightMargin=16*mm, topMargin=16*mm, bottomMargin=16*mm)
|
|
5754
6555
|
styles = getSampleStyleSheet()
|
|
5755
6556
|
flow = []
|