syntaxmatrix-2.3.5-py3-none-any.whl → syntaxmatrix-2.5.5.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- syntaxmatrix/agentic/__init__.py +0 -0
- syntaxmatrix/agentic/agent_tools.py +24 -0
- syntaxmatrix/agentic/agents.py +810 -0
- syntaxmatrix/agentic/code_tools_registry.py +37 -0
- syntaxmatrix/agentic/model_templates.py +1790 -0
- syntaxmatrix/commentary.py +134 -112
- syntaxmatrix/core.py +385 -245
- syntaxmatrix/dataset_preprocessing.py +218 -0
- syntaxmatrix/display.py +89 -37
- syntaxmatrix/gpt_models_latest.py +5 -4
- syntaxmatrix/profiles.py +19 -4
- syntaxmatrix/routes.py +947 -141
- syntaxmatrix/settings/model_map.py +38 -30
- syntaxmatrix/static/icons/hero_bg.jpg +0 -0
- syntaxmatrix/templates/dashboard.html +248 -54
- syntaxmatrix/utils.py +2254 -84
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.4.dist-info}/METADATA +16 -17
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.4.dist-info}/RECORD +21 -15
- syntaxmatrix/model_templates.py +0 -29
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.4.dist-info}/WHEEL +0 -0
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.4.dist-info}/licenses/LICENSE.txt +0 -0
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.4.dist-info}/top_level.txt +0 -0
syntaxmatrix/routes.py
CHANGED
@@ -1,18 +1,28 @@
-
-
-import
-
-import io
+import os, zipfile, time, uuid, werkzeug, queue, html, ast, re
+import threading, textwrap, json, pandas as pd
+import contextlib
+
+import io as _std_io
+
 from io import BytesIO
+from scipy import io
+from flask import Blueprint, Response, request, send_file, session
+from flask import render_template, render_template_string, url_for, redirect, g
+from flask import flash, jsonify, send_from_directory, get_flashed_messages, stream_with_context
+
+from flask_login import current_user
+
 from PyPDF2 import PdfReader
 from markupsafe import Markup
-from urllib.parse import quote
+from urllib.parse import quote
+from datetime import datetime
+from prompt_toolkit import HTML
+from PyPDF2.errors import EmptyFileError
+import numpy as np
 from .auth import register_user, authenticate, login_required, admin_required, superadmin_required
-from flask import Blueprint, Response, request, send_file, session, render_template, render_template_string, redirect, url_for, flash, jsonify, send_from_directory, get_flashed_messages, stream_with_context

 from syntaxmatrix.themes import DEFAULT_THEMES
 from syntaxmatrix import db
-from syntaxmatrix.utils import *
 from syntaxmatrix.vector_db import add_pdf_chunk
 from syntaxmatrix.file_processor import *
 from syntaxmatrix.vectorizer import embed_text
@@ -22,16 +32,13 @@ from syntaxmatrix.history_store import SQLHistoryStore, PersistentHistoryStore
 from syntaxmatrix.kernel_manager import SyntaxMatrixKernelManager, execute_code_in_kernel
 from syntaxmatrix.vector_db import *
 from syntaxmatrix.settings.string_navbar import string_navbar_items
-from syntaxmatrix.settings.model_map import PROVIDERS_MODELS, MODEL_DESCRIPTIONS, PURPOSE_TAGS, EMBEDDING_MODELS
-from .project_root import detect_project_root
-from
-from
-from . import profiles as _prof
-from . import auth as _auth
+from syntaxmatrix.settings.model_map import GPT_MODELS_LATEST, PROVIDERS_MODELS, MODEL_DESCRIPTIONS, PURPOSE_TAGS, EMBEDDING_MODELS
+from syntaxmatrix.project_root import detect_project_root
+from syntaxmatrix import generate_page as _genpage
+from syntaxmatrix import auth as _auth
 from syntaxmatrix import profiles as _prof
 from syntaxmatrix.gpt_models_latest import set_args, extract_output_text as _out
-from
-import contextlib
+from syntaxmatrix.agentic.agents import classify_ml_job_agent, refine_question_agent, text_formatter_agent

 try:
 from pygments import highlight as _hl
@@ -41,10 +48,16 @@ try:
 except Exception:
 _HAVE_PYGMENTS = False

-from
-from
+# from syntaxmatrix.utils import *
+from syntaxmatrix.utils import (
+auto_inject_template, drop_bad_classification_metrics, ensure_accuracy_block,
+ensure_image_output, ensure_output, fix_plain_prints, fix_print_html, patch_fix_sentinel_plot_calls,
+patch_pairplot, fix_to_datetime_errors, harden_ai_code, patch_ensure_seaborn_import, get_plotting_imports,
+patch_fix_seaborn_palette_calls, patch_quiet_specific_warnings, fix_seaborn_barplot_nameerror, fix_seaborn_boxplot_nameerror, ensure_matplotlib_title, patch_plot_code, patch_prefix_seaborn_calls, fix_scatter_and_summary, inject_auto_preprocessing, fix_importance_groupby, patch_pie_chart, patch_rmse_calls, clean_llm_code
+)

-
+from syntaxmatrix.agentic.agent_tools import ToolRunner
+from syntaxmatrix.agentic.code_tools_registry import EARLY_SANITIZERS, SYNTAX_AND_REPAIR

 _CLIENT_DIR = detect_project_root()
 _stream_q = queue.Queue()
@@ -192,7 +205,6 @@ def setup_routes(smx):
 )
 return resp

-
 def head_html():
 # Determine a contrasting mobile text color based on the sidebar background.
 mobile_text_color = smx.theme["nav_text"]
@@ -625,8 +637,8 @@ def setup_routes(smx):

 desktop_nav = f"""
 <div class="nav-left">
-<a class="logo" href="/"
-<a class="logo" href="/" style="text-decoration:none;
+<a class="logo" href="/">{smx.site_logo}</a>
+<a class="logo" href="/" style="text-decoration:none; margin:0 24px 0 0; padding:0px; vertical-align:middle;">{smx.site_title}</a>
 <div class="nav-links" style="margin-left:24px;">
 {nav_links}
 </div>
@@ -3769,10 +3781,13 @@ def setup_routes(smx):


 # if any live cached profile on smx matches this name, clear it
-
+db_profiles = prof.get_profiles()
+# for attr in ("_chat_profile", "_admin_profile", "_coding_profile", "_classification_profile", "_summarization_profile", "_vision2text_profile"):
+for attr in ([db_profiles]):
 prof = getattr(smx, attr, None)
 if isinstance(prof, dict) and prof.get("name") == name:
 setattr(smx, attr, {})
+prof.refresh_profiles_cache()

 elif action == "add_model":
 prov = request.form.get("catalog_provider","").strip()
@@ -3944,7 +3959,7 @@ def setup_routes(smx):
 <label for="catalog_model">Model</label>
 <select id="catalog_model" name="catalog_model" required></select>

-<label for="catalog_purpose">
+<label for="catalog_purpose">Agency</label>
 <select id="catalog_purpose" name="catalog_purpose" required></select>

 <label class="form-label mb-1" style="display:block; position:relative;">
@@ -4066,7 +4081,7 @@ def setup_routes(smx):

 models_catalog_list_card = f"""
 <div class="card span-4">
-<h4>Models
+<h4>Models Catalogue</h4>
 <ul class="catalog-list">
 {cat_items or "<li class='li-row'>No models yet.</li>"}
 </ul>
@@ -4080,15 +4095,15 @@ def setup_routes(smx):
 <div class='card span-4'>
 <h4>Setup Profiles</h4>
 <form method="post" style="margin-bottom:0.5rem;">
-<label for="profile_name" class="form-label mb-1">
-Confirm
+<label for="profile_name" class="form-label mb-1" style="margin-bottom:12px;">
+Confirm Agency
 <button id="name-help" type="button" class="info-btn btn-link p-0 text-muted"
 style="font-size:0.8rem; line-height:1; padding:2px; display:inline-block;"
 aria-haspopup="true" aria-expanded="false"
-title="Click to see
+title="Click to see agencies">ⓘ</button>
 </label>
 <input id="profile_name" name="profile_name" type="text" class="form-control"
-placeholder="
+placeholder="Agency" required>

 <div id="name-suggestions" role="tooltip"
 class="suggestion-popover card shadow-sm p-2"
@@ -4175,9 +4190,9 @@ def setup_routes(smx):

 manage_sys_files_card = f"""
 <div class='card span-6'>
-<h4>Manage
+<h4>Manage Company Files</h4>
 <ul class="catalog-list" style="list-style:none; padding-left:0; margin:0;">
-{sys_files_html or "<li>No
+{sys_files_html or "<li>No company file has been uploaded yet.</li>"}
 </ul>
 </div>
 """
@@ -5114,7 +5129,7 @@ def setup_routes(smx):
 rows = _auth.list_role_audit(limit=limit)

 import io, csv, datetime
-buf =
+buf = _std_io.StringIO()
 writer = csv.writer(buf)
 writer.writerow(["timestamp", "actor", "target", "from_role", "to_role"])
 for r in rows:
@@ -5375,25 +5390,28 @@ def setup_routes(smx):
 # ────────────────────────────────────────────────────────────────────────────────────────
 # DASHBOARD
 # ────────────────────────────────────────────────────────────────────────────────────────
-# ── DASHBOARD VIEW DETAILS -----------------------------
 @smx.app.route("/dashboard", methods=["GET", "POST"])
 # @login_required
 def dashboard():
 DATA_FOLDER = os.path.join(_CLIENT_DIR, "uploads", "data")
 os.makedirs(DATA_FOLDER, exist_ok=True)
-
-####################################################################

-
-
-
-- No top-level statements between if/elif/else branches.
-- Regression must use either sklearn with train_test_split (then X_test exists) and R^2/MAE/RMSE, or statsmodels OLS. No accuracy_score in regression.
-- Keep all plotting + savefig + BytesIO + display inside the branch that created the figure.
-Return ONLY the corrected cell.
-"""
-
+max_rows = 5000
+max_cols = 80
+
 def _smx_repair_python_cell(py_code: str) -> str:
+
+_CELL_REPAIR_RULES = """
+You are an experienced Python code reviewer
+Fix the Python cell to satisfy:
+- Single valid cell; imports at the top.
+- Do not import or invoke or use 'python-dotenv' or 'dotenv' because it's not needed.
+- No top-level statements between if/elif/else branches.
+- Regression must use either sklearn with train_test_split (then X_test exists) and R^2/MAE/RMSE,
+or statsmodels OLS. No accuracy_score in regression.
+- Keep all plotting + savefig + BytesIO + display inside the branch that created the figure.
+- Return ONLY the corrected cell.
+"""
 code = textwrap.dedent(py_code or "").strip()
 needs_fix = False
 if re.search(r"\baccuracy_score\b", code) and re.search(r"\bLinearRegression\b|\bOLS\b", code):
@@ -5406,59 +5424,84 @@ def setup_routes(smx):
 needs_fix = True
 if not needs_fix:
 return code
-
+
+_prompt = f"```python\n{code}\n```"
+
+prof = _prof.get_profile("classification") or _prof.get_profile("admin")
 if not prof:
 return code
-
-
-_client =
+
+prof["client"] = _prof.get_client(prof)
+_client = prof["client"]
 _model = prof["model"]
-
-
-
-
-
-)
-).strip()
-
-elif prof["provider"] == "openai" and _model in smx.gpt_models_latest():
-args = set_args(model=prof.get("model"), instructions=_CELL_REPAIR_RULES,
-input=_prompt, previous_id=None, store=False,
-reasoning_effort="minimal", verbosity="low")
-fixed = _out(_client.responses.create(**args)).strip()
-
-elif prof["provider"] == "anthropic":
-fixed = _out(_client.messages.create(
-model=_model,
-max_tokens=1024,
-system=_CELL_REPAIR_RULES,
-messages=[{"role": "user", "content":_prompt}]
-)).strip()
-
-else:
-fixed = _out(_client.chat.completions.create(
-model=_model,
-messages=[
-{"role": "system", "content":_CELL_REPAIR_RULES},
-{"role": "user", "content":_prompt},
-]
-)
-).strip()
+_provider = prof["provider"].lower()
+
+#1 Google
+if _provider == "google":
+from google.genai import types

+fixed = _client.models.generate_content(
+model=_model,
+contents=_prompt,
+config=types.GenerateContentConfig(
+system_instruction=_CELL_REPAIR_RULES,
+temperature=0.8,
+max_output_tokens=1024,
+),
+)
+
+#2 Openai
+elif _provider == "openai" and _model in GPT_MODELS_LATEST:
+
+args = set_args(
+model=_model,
+instructions=_CELL_REPAIR_RULES,
+input=[{"role": "user", "content": _prompt}],
+previous_id=None,
+store=False,
+reasoning_effort="medium",
+verbosity="medium",
+)
+fixed = _out(_client.responses.create(**args))
+
+# Anthropic
+elif _provider == "anthropic":
+
+fixed = _client.messages.create(
+model=_model,
+max_tokens=1024,
+system=_CELL_REPAIR_RULES,
+messages=[{"role": "user", "content":_prompt}],
+stream=False,
+)
+
+# OpenAI SDK
+else:
+fixed = _client.chat.completions.create(
+model=_model,
+messages=[
+{"role": "system", "content":_CELL_REPAIR_RULES},
+{"role": "user", "content":_prompt},
+],
+max_tokens=1024,
+)
+
+fixed_txt = clean_llm_code(fixed)
+
 try:
-
-
+# Only accept the repaired cell if it's valid Python
+ast.parse(fixed_txt)
+return fixed_txt
+except Exception:
+# If the repaired version is still broken, fall back to the original code
 return code
-
-
+

 section = request.args.get("section", "explore")
 datasets = [f for f in os.listdir(DATA_FOLDER) if f.lower().endswith(".csv")]
 selected_dataset = request.form.get("dataset") or request.args.get("dataset")
 if not selected_dataset and datasets:
 selected_dataset = datasets[0]
-# selected_dataset = selected_dataset or ""
-
 # Handle file upload
 if request.method == "POST" and "dataset_file" in request.files:
 f = request.files["dataset_file"]
@@ -5470,7 +5513,7 @@ def setup_routes(smx):

 # Load dataframe if available
 df = pd.read_csv(os.path.join(DATA_FOLDER, selected_dataset)) if selected_dataset else None
-
+
 # --- Jupyter kernel management ---
 session_id = session.get('smx_kernel_id')
 if not session_id:
@@ -5481,38 +5524,84 @@ def setup_routes(smx):

 # --- Handle Ask AI ---
 ai_outputs = []
+dl_html = ""
 askai_question = ""
-refined_question =
+refined_question = ""
+tags = []
 ai_code = None
+eda_df = df
+llm_usage = None

 if request.method == "POST" and "askai_question" in request.form:
 askai_question = request.form["askai_question"].strip()
-if df is not None:
+if df is not None:
+CLEANED_FOLDER = str(selected_dataset).split(".")[0] + "_preprocessed"
+cleaned_path = os.path.join(DATA_FOLDER, CLEANED_FOLDER, "cleaned_df.csv")
+if os.path.exists(cleaned_path):
+df = pd.read_csv(cleaned_path, low_memory=False)
+else:
+from syntaxmatrix.dataset_preprocessing import ensure_cleaned_df
+df = ensure_cleaned_df(DATA_FOLDER, CLEANED_FOLDER, df)  # writes cleaned_df.csv
+
+# Build lightweight context
+columns_summary = ", ".join(df.columns.tolist())
+dataset_context = f"columns: {columns_summary}"
+dataset_profile = f"modality: tabular; columns: {columns_summary}"
+
+refined_question = refine_question_agent(askai_question, dataset_context)
+tags = classify_ml_job_agent(refined_question, dataset_profile)
+
+ai_code = smx.ai_generate_code(refined_question, tags, df)
+llm_usage = smx.get_last_llm_usage()
+ai_code = auto_inject_template(ai_code, tags, df)
+
+# --- 1) Strip dotenv ASAP (kill imports, %magics, !pip) ---
+ctx = {
+"question": refined_question,
+"df_columns": list(df.columns),
+}
+ai_code = ToolRunner(EARLY_SANITIZERS).run(ai_code, ctx)  # dotenv first

-
-intent = classify(refined_question)
-ai_code = smx.ai_generate_code(refined_question, intent, df)
-ai_code = auto_inject_template(ai_code, intent, df)
+# --- 2) Domain/Plotting patches ---
 ai_code = fix_scatter_and_summary(ai_code)
 ai_code = fix_importance_groupby(ai_code)
 ai_code = inject_auto_preprocessing(ai_code)
 ai_code = patch_plot_code(ai_code, df, refined_question)
+ai_code = ensure_matplotlib_title(ai_code)
+ai_code = patch_pie_chart(ai_code, df, refined_question)
 ai_code = patch_pairplot(ai_code, df)
+ai_code = fix_seaborn_boxplot_nameerror(ai_code)
+ai_code = fix_seaborn_barplot_nameerror(ai_code)
 ai_code = get_plotting_imports(ai_code)
-ai_code =
-ai_code =
+ai_code = patch_prefix_seaborn_calls(ai_code)
+ai_code = patch_fix_sentinel_plot_calls(ai_code)
+ai_code = patch_ensure_seaborn_import(ai_code)
+ai_code = patch_rmse_calls(ai_code)
+ai_code = patch_fix_seaborn_palette_calls(ai_code)
+ai_code = patch_quiet_specific_warnings(ai_code)
+ai_code = clean_llm_code(ai_code)
+ai_code = ensure_image_output(ai_code)
 ai_code = ensure_accuracy_block(ai_code)
 ai_code = ensure_output(ai_code)
 ai_code = fix_plain_prints(ai_code)
-ai_code = fix_print_html(ai_code)
+ai_code = fix_print_html(ai_code)
 ai_code = fix_to_datetime_errors(ai_code)
+
+# --- 3-4) Global syntax/data fixers (must run AFTER patches, BEFORE final repair) ---
+ai_code = ToolRunner(SYNTAX_AND_REPAIR).run(ai_code, ctx)
+
+# # --- 4) Final catch-all repair (run LAST) ---
 ai_code = _smx_repair_python_cell(ai_code)
+ai_code = harden_ai_code(ai_code)
+ai_code = drop_bad_classification_metrics(ai_code, df)
+ai_code = patch_fix_sentinel_plot_calls(ai_code)

-
+# Always make sure 'df' is in the kernel before running user code
 df_init_code = (
 f"import pandas as pd\n"
-f"df = pd.read_csv(r'''{os.path.join(
+f"df = pd.read_csv(r'''{os.path.join(cleaned_path)}''')"
 )
+
 execute_code_in_kernel(kc, df_init_code)

 outputs, errors = execute_code_in_kernel(kc, ai_code)
@@ -5525,7 +5614,6 @@ def setup_routes(smx):
 build_display_summary, phrase_commentary_vision, wrap_html
 )

-
 # Probe axes/labels/legend
 probe1_out, probe1_err = execute_code_in_kernel(kc, MPL_PROBE_SNIPPET)
 axes_info = parse_mpl_probe_output([str(x) for x in (probe1_out + probe1_err)])
@@ -5542,17 +5630,17 @@ def setup_routes(smx):
 ################################################################

 # ----- Build a single HTML with Result + Commentary + AI Code ----------
-_buf_out, _buf_err =
+_buf_out, _buf_err = _std_io.StringIO(), _std_io.StringIO()
 with contextlib.redirect_stdout(_buf_out), contextlib.redirect_stderr(_buf_err):
-#
+# Exact result blocks (already cleaned by kernel_manager)
 result_html = rendered_html if rendered_html.strip() else "<pre>No output.</pre>"

-#
+# Commentary (we already have the raw HTML via wrap_html)
 commentary_html = wrap_html(commentary_text)

 code_html = _render_code_block("AI Generated Code", ai_code)

-full_body_html = "\n" + askai_question + "\n" + result_html + "\n" +
+full_body_html = "\n" + askai_question + "\n" + result_html + "\n" + code_html + "\n" + commentary_html

 html_doc = (
 "<!doctype html>"
@@ -5576,7 +5664,7 @@ def setup_routes(smx):

 _last_result_html[session_id] = html_doc

-#
+# Append a single download button (explicit click → fetch → download)
 download_url = url_for("download_result_html", session_id=session_id)
 dl_html = f"""
 <a href="{download_url}">
@@ -5589,79 +5677,797 @@ def setup_routes(smx):
 """
 ai_outputs.append(Markup(dl_html))

-################################################################
-
-
 # --- EDA/static cells ---
+# Display helper: coerce integer-like float columns to Int64 just for rendering
+def _coerce_intlike_for_display(df_in: pd.DataFrame, per_cell: bool = False, eps: float = 1e-9) -> pd.DataFrame:
+import numpy as np
+out = df_in.copy()
+if per_cell:
+def _maybe(v):
+try:
+fv = float(v)
+except Exception:
+return v
+if pd.notnull(v) and np.isfinite(fv) and abs(fv - round(fv)) <= eps:
+return int(round(fv))
+return v
+return out.applymap(_maybe)
+# column-wise mode (original behaviour for previews)
+for c in out.columns:
+s = out[c]
+if pd.api.types.is_float_dtype(s):
+vals = s.dropna().to_numpy()
+if vals.size and np.isfinite(vals).all() and np.allclose(vals, np.round(vals), rtol=0, atol=eps):
+out[c] = s.round().astype("Int64")
+return out
+
 data_cells = []
+max_rows = 5000
+max_cols = 80
 if df is not None:
-
-ds = selected_dataset.replace("_"," ").replace(".csv","").capitalize()
+df = eda_df
+ds = (selected_dataset or "").replace("_", " ").replace(".csv", "").capitalize()
+
+# 1) Dataset Overview (stat cards)
+rows, cols = df.shape
+mem_bytes = int(df.memory_usage(deep=True).sum())
+mem_mb = round(mem_bytes / (1024 * 1024), 2)
+dup_rows = int(df.duplicated().sum())
+nunique_all = df.nunique(dropna=False)
+
+n = max(rows, 1)
+dtypes = df.dtypes.astype(str)
+nonnull = df.notnull().sum()
+miss_pct = (df.isnull().mean() * 100).round(1)
+uniques = df.nunique(dropna=True)
+uniq_ratio = (uniques / n).fillna(0.0)
+
+id_like, hi_card, consts, flags_col = [], [], [], []
+for c in df.columns:
+flags = []
+if uniques.get(c, 0) <= 1:
+flags.append("constant"); consts.append(c)
+if uniq_ratio.get(c, 0) >= 0.95 and "datetime" not in dtypes[c].lower():
+flags.append("id-like"); id_like.append(c)
+if dtypes[c].startswith("object") and uniq_ratio.get(c, 0) > 0.5 and c not in id_like:
+flags.append("high-card"); hi_card.append(c)
+flags_col.append(", ".join(flags))
+
+_stats_code = (
+"rows, cols = df.shape\n"
+"mem_bytes = int(df.memory_usage(deep=True).sum())\n"
+"mem_mb = round(mem_bytes / (1024*1024), 2)\n"
+)
+
+_stats_html = f"""
+<style>
+.smx-statwrap{{display:grid;grid-template-columns:repeat(auto-fit,minmax(140px,1fr));gap:10px}}
+.smx-stat{{background:#fff;border:1px solid #e5e7eb;border-radius:10px;padding:10px 12px;text-align:center}}
+.smx-stat h4{{margin:0 0 4px;font-size:.9rem}}
+.smx-stat div{{font-weight:700;font-size:1.05rem}}
+</style>
+<div class="smx-statwrap">
+<div class="smx-stat"><h4>Rows</h4><div>{rows:,}</div></div>
+<div class="smx-stat"><h4>Columns</h4><div>{cols:,}</div></div>
+<div class="smx-stat"><h4>Memory (MB)</h4><div>{mem_mb}</div></div>
+</div>
+"""
+data_cells.append({
+"title": f"{ds} Overview",
+"output": Markup(_stats_html),
+"code": _stats_code,
+"span":"eda-col-8"
+})
+
+# 2) Integrity Notes — with "Show all" toggle
+notes = []
+if id_like:
+notes.append(f"ID-like columns: {', '.join(map(str, id_like[:6]))}{'…' if len(id_like)>6 else ''}")
+if hi_card:
+notes.append(f"High-cardinality categoricals: {', '.join(map(str, hi_card[:6]))}{'…' if len(hi_card)>6 else ''}")
+if consts:
+notes.append(f"Constant columns: {', '.join(map(str, consts[:6]))}{'…' if len(consts)>6 else ''}")
+
+# Build full flagged table
+flag_rows = []
+for c in df.columns:
+f = []
+if c in id_like: f.append("id-like")
+if c in hi_card: f.append("high-card")
+if c in consts: f.append("constant")
+if f:
+flag_rows.append({
+"Column": c,
+"Flags": ", ".join(f),
+"Type": dtypes[c],
+"Unique Values": int(uniques.get(c, 0)),
+"Unique Ratio": float(uniq_ratio.get(c, 0)),
+"Missing (%)": float(miss_pct.get(c, 0)),
+})
+flagged_df = pd.DataFrame(flag_rows)
+flagged_df = flagged_df.sort_values(["Flags","Column"]) if not flagged_df.empty else flagged_df
+
+# Render notes + toggle
+notes_html = (
+"<ul style='margin:0;padding-left:18px;'>" +
+"".join([f"<li>{n}</li>" for n in notes]) +
+"</ul>"
+) if notes else "<em>No obvious integrity flags.</em>"
+
+if not flagged_df.empty:
+table_html = datatable_box(flagged_df)
+body_html = (
+notes_html +
+f"<details style='margin-top:8px;'><summary>Show all flagged columns ({len(flagged_df)})</summary>"
+f"<div style='margin-top:8px;'>{table_html}</div></details>"
+)
+else:
+body_html = notes_html
+
 data_cells.append({
-"title":
-"output":
-"code":
+"title": "Integrity Notes",
+"output": Markup(body_html),
+"code": (
+"# Build Integrity Notes lists and full flagged table\n"
+"flag_rows = []\n"
+"for c in df.columns:\n"
+" f = []\n"
+" if c in id_like: f.append('id-like')\n"
+" if c in hi_card: f.append('high-card')\n"
+" if c in consts: f.append('constant')\n"
+" if f:\n"
+" flag_rows.append({\n"
+" 'Column': c,\n"
+" 'Flags': ', '.join(f),\n"
+" 'Type': dtypes[c],\n"
+" 'Unique Values': int(uniques.get(c,0)),\n"
+" 'Unique Ratio': float(uniq_ratio.get(c,0)),\n"
+" 'Missing (%)': float(miss_pct.get(c,0))\n"
+" })\n"
+"flagged_df = pd.DataFrame(flag_rows)\n"
+"flagged_df"
+),
+"span":"eda-col-4"
 })
-
+
+# 3) Data Preview
+preview_cols = df.columns
+preview_df = _coerce_intlike_for_display(df[preview_cols].head(8))
 data_cells.append({
 "title": "Data Preview",
-"output": Markup(datatable_box(
-"code": f"df[{list(preview_cols)}].head(8)"
+"output": Markup(datatable_box(preview_df)),
+"code": f"df[{list(preview_cols)}].head(8)",
+"span": "eda-col-6"
 })
+
+# 4) Summary Statistics
+summary_cols = df.columns
+summary_df = _coerce_intlike_for_display(df[summary_cols].describe())
 data_cells.append({
 "title": "Summary Statistics",
-"output": Markup(datatable_box(
-"code": "df.describe()"
+"output": Markup(datatable_box(summary_df)),
+"code": f"df[{list(summary_cols)}].describe()",
+"span": "eda-col-6"
+})
+
+# 5) Column Profile
+def _sample_vals(s, k=3):
+try:
+vals = pd.unique(s.dropna().astype(str))[:k]
+return ", ".join(map(str, vals))
+except Exception:
+return ""
+
+profile_df = pd.DataFrame({
+"Column": df.columns,
+"Type": dtypes.values,
+"Non-Null Count": nonnull.values,
+"Missing (%)": miss_pct.values,
+"Unique Values": uniques.values,
+"Sample Values": [ _sample_vals(df[c]) for c in df.columns ],
+"Flags": flags_col
+})
+data_cells.append({
+"title": "Column Profile",
+"output": Markup(datatable_box(profile_df)),
+"code": (
+"dtypes = df.dtypes.astype(str)\n"
+"nonnull = df.notnull().sum()\n"
+"miss_pct = (df.isnull().mean()*100).round(1)\n"
+"uniques = df.nunique(dropna=True)\n"
+"n = max(len(df), 1)\n"
+"uniq_ratio = (uniques / n).fillna(0.0)\n"
+"def _sample_vals(s, k=3):\n"
+" vals = pd.unique(s.dropna().astype(str))[:k]\n"
+" return ', '.join(map(str, vals)) if len(vals) else ''\n"
+"flags_col = []\n"
+"for c in df.columns:\n"
+" flags=[]\n"
+" if uniques.get(c,0) <= 1: flags.append('constant')\n"
+" if uniq_ratio.get(c,0) >= 0.95 and 'datetime' not in dtypes[c].lower(): flags.append('id-like')\n"
+" if dtypes[c].startswith('object') and uniq_ratio.get(c,0) > 0.5 and 'id-like' not in flags: flags.append('high-card')\n"
+" flags_col.append(', '.join(flags))\n"
+"profile_df = pd.DataFrame({\n"
+" 'Column': df.columns,\n"
+" 'Type': dtypes.values,\n"
+" 'Non-Null Count': nonnull.values,\n"
+" 'Missing (%)': miss_pct.values,\n"
+" 'Unique Values': uniques.values,\n"
+" 'Sample Values': [ _sample_vals(df[c]) for c in df.columns ],\n"
+" 'Flags': flags_col\n"
+"})\n"
+"profile_df"
+),
+"span":"eda-col-6"
 })
+
+# 6) Column Types
+dtype_df = pd.DataFrame({
+"Column": df.columns,
+"Type": df.dtypes.astype(str).values,
+"Non-Null Count": df.notnull().sum().values,
+"Unique Values": df.nunique().values
+})
+data_cells.append({
+"title": "Column Types",
+"output": Markup(datatable_box(dtype_df)),
+"code": (
+"pd.DataFrame({\n"
+" 'Column': df.columns,\n"
+" 'Type': df.dtypes.astype(str).values,\n"
+" 'Non-Null Count': df.notnull().sum().values,\n"
+" 'Unique Values': df.nunique().values\n"
+"})"
+),
+"span":"eda-col-6"
+})
+
+# 7) Outliers — Top 3 records (robust MAD score, capped 5k×80)
+try:
+import numpy as np
+
+num_cols_all = df.select_dtypes(include="number").columns.tolist()
+if len(num_cols_all) >= 1:
+num_cols = num_cols_all[:max_cols]  # use your cap (80)
+df_num = df[num_cols].copy()
+
+# cap rows for speed (5k)
+if len(df_num) > max_rows:
+df_num = df_num.sample(max_rows, random_state=0)
+
+# robust z: 0.6745 * (x - median) / MAD (MAD==0 → NaN)
+med = df_num.median(numeric_only=True)
+mad = (df_num - med).abs().median(numeric_only=True)
+rz = 0.6745 * (df_num - med) / mad.replace(0, np.nan)
+
+abs_rz = rz.abs()
+row_score = abs_rz.max(axis=1, skipna=True)  # strongest dev across features
+top_idx = row_score.nlargest(3).index.tolist()
+
+# Build compact, mobile-friendly cards for the top 3 rows
+cards_html = []
+for ridx in top_idx:
+# top contributing columns for this row
+contrib = abs_rz.loc[ridx].dropna().sort_values(ascending=False).head(5)
+maxv = float(contrib.iloc[0]) if len(contrib) else 0.0
+
+bars = []
+for c, v in contrib.items():
+pct = 0.0 if maxv <= 0 else min(100.0, float(v) / maxv * 100.0)
+bars.append(f"""
+<div class="barrow">
+<span class="cname">{html.escape(str(c))}</span>
+<div class="bar"><div class="fill" style="width:{pct:.1f}%"></div></div>
+<span class="score">{v:.2f}</span>
+</div>
+""")
+
+bars_html = "".join(bars) if bars else "<em>No strong single-column contributors.</em>"
+
+# show the full record (all columns) with horizontal scroll
+row_vals = df.loc[ridx, :].to_dict()
+row_tbl = datatable_box(pd.DataFrame([row_vals]))
+
+score_val = float(row_score.loc[ridx]) if pd.notnull(row_score.loc[ridx]) else 0.0
+title_idx = int(ridx) if isinstance(ridx, (int, np.integer)) else html.escape(str(ridx))
+
+cards_html.append(f"""
+<div class="mad-card">
+<div class="mad-title">Row index: {title_idx} · score: {score_val:.2f}</div>
+<div class="mad-bars">{bars_html}</div>
+<div class="mad-row">{row_tbl}</div>
+</div>
+""")
+
+grid_html = f"""
+<style>
+.mad-grid{{display:grid;grid-template-columns:repeat(3,minmax(0,1fr));gap:10px}}
+@media(max-width:1024px){{.mad-grid{{grid-template-columns:repeat(2,minmax(0,1fr))}}}}
+@media(max-width:640px){{.mad-grid{{grid-template-columns:repeat(1,minmax(0,1fr))}}}}
+.mad-card{{background:#fff;border:1px solid #e5e7eb;border-radius:10px;padding:8px 10px}}
+.mad-title{{font-weight:600;margin-bottom:6px}}
+.mad-bars .barrow{{display:grid;grid-template-columns:140px 1fr 46px;gap:6px;align-items:center;margin:4px 0}}
+.mad-bars .bar{{background:#eef2f7;border-radius:6px;height:8px;overflow:hidden}}
+.mad-bars .fill{{background:#0b8ae5;height:8px}}
+.mad-bars .cname{{font-size:12px;color:#444;white-space:nowrap;overflow:hidden;text-overflow:ellipsis}}
+.mad-bars .score{{font-size:12px;color:#333;text-align:right}}
+.mad-row .smx-table{{font-size:12px}}
+</style>
+<div class="mad-grid">{''.join(cards_html)}</div>
+"""
+
+data_cells.append({
+"title": "Outliers — Top 3 records",
+"output": Markup(grid_html),
+"code": (
+"num_cols = df.select_dtypes(include='number').columns.tolist()[:max_cols]\n"
+"df_num = df[num_cols]\n"
+"df_num = df_num.sample(max_rows, random_state=0) if len(df_num) > max_rows else df_num\n"
+"med = df_num.median(); mad = (df_num - med).abs().median()\n"
+"rz = 0.6745 * (df_num - med) / mad.replace(0, np.nan)\n"
+"row_score = rz.abs().max(axis=1)\n"
+"top3 = row_score.nlargest(3)\n"
+),
+"span": "eda-col-12"
+})
+else:
+data_cells.append({
+"title": "Outliers — Top 3 records (robust MAD score)",
+"output": "<em>No numeric columns available.</em>",
+"code": "# no numeric columns",
+"span": "eda-col-6"
+})
+except Exception as _e:
+data_cells.append({
+"title": "Outliers — Top 3 records (robust MAD score)",
+"output": f"<em>Could not compute robust outliers: {html.escape(str(_e))}</em>",
+"code": "# error during robust outlier computation",
+"span": "eda-col-6"
+})
+
+# 8) Outliers — Violin + Box (Top 3 numerics by IQR outliers, capped 5k×80)
+try:
+num_outliers = 3
+num_cols_all = df.select_dtypes(include="number").columns.tolist()
+if len(num_cols_all) >= 1:
+num_cols = num_cols_all[:max_cols]
+dfn = df[num_cols].copy()
+
+# cap rows for speed (5k)
+if len(dfn) > max_rows:
+dfn = dfn.sample(max_rows, random_state=0)
+
+# rank columns by number of Tukey outliers (1.5*IQR)
+ranks = []
+for c in dfn.columns:
+s = pd.to_numeric(dfn[c], errors="coerce").dropna()
+if s.empty:
+ranks.append((c, 0, 0.0))
+continue
+q1 = s.quantile(0.25); q3 = s.quantile(0.75)
+iqr = float(q3 - q1)
+if iqr <= 0:
+ranks.append((c, 0, 0.0))
+continue
+lower = q1 - 1.5 * iqr
+upper = q3 + 1.5 * iqr
+out_count = int(((s < lower) | (s > upper)).sum())
+ranks.append((c, out_count, float(iqr)))
+
+# choose top 6 (break ties by IQR spread)
+sel_cols = [c for c, _, _ in sorted(ranks, key=lambda x: (-x[1], -x[2]))[:num_outliers]]
+if not sel_cols:
+raise ValueError("No numeric columns have spread for violin plots.")
+
+# package data for JS (values only; thresholds for display)
+charts = []
+for c in sel_cols:
+s = pd.to_numeric(dfn[c], errors="coerce").dropna()
+if s.empty:
+continue
+q1 = s.quantile(0.25); q3 = s.quantile(0.75); iqr = q3 - q1
+lower = float(q1 - 1.5 * iqr); upper = float(q3 + 1.5 * iqr)
+out_count = int(((s < lower) | (s > upper)).sum())
+charts.append({
+"name": str(c),
+"values": [float(v) for v in s.tolist()],
+"lower": lower,
+"upper": upper,
+"n": int(s.size),
+"out": out_count
+})
+
+container_id = f"violgrid_{uuid.uuid4().hex}"
+sub_divs = "\n".join([f'<div id="{container_id}_{i}" class="vplot"></div>' for i in range(len(charts))])
+
+plot_html = f"""
+<style>
+/* mini-grid 3x2 → 2x? → 1x? */
+#{container_id}{{display:grid;grid-template-columns:repeat(3,minmax(0,1fr));gap:10px}}
+@media(max-width:1024px){{#{container_id}{{grid-template-columns:repeat(2,minmax(0,1fr))}}}}
+@media(max-width:640px){{#{container_id}{{grid-template-columns:repeat(1,minmax(0,1fr))}}}}
+/* each plot container – height set via JS for monotonic responsiveness */
+#{container_id} .vplot{{width:100%;}}
+</style>
+<div id="{container_id}">
+{sub_divs}
+</div>
+<script>
+(function(){{
+var charts = {json.dumps(charts)};
+
+function calcHeight(el){{
+var w = (el && el.clientWidth) || (el && el.parentElement && el.parentElement.clientWidth) || 360;
+// smooth, monotone: ~0.55×width, clamped
+return Math.round(Math.max(220, Math.min(360, w * 0.55)));
+}}
+
+function drawOne(target, data){{
+var el = document.getElementById(target);
+if(!el) return;
+var h = calcHeight(el);
+el.style.setProperty('height', h + 'px', 'important'); // defeat global height:auto
+
+var trace = {{
+type: 'violin',
+y: data.values,
+name: data.name,
+box: {{ visible: true }},
+meanline: {{ visible: true }},
+points: 'suspectedoutliers',
+hovertemplate: '%{{y}}<extra></extra>',
+showlegend: false
+}};
+
+var layout = {{
+margin: {{ l: 40, r: 10, t: 26, b: 28 }},
+title: {{ text: data.name + ' (n=' + data.n + ', out=' + data.out + ')', font: {{ size: 12 }} }},
+yaxis: {{ automargin: true }}
+}};
+
+var config = {{ displayModeBar: true, responsive: true }};
+if(window.Plotly && Plotly.newPlot){{
+Plotly.newPlot(el, [trace], layout, config).then(function(){{
+if(Plotly.Plots && Plotly.Plots.resize) Plotly.Plots.resize(el);
+}});
+}} else {{
+var p=document.createElement('div'); p.style.color='crimson'; p.style.marginTop='8px';
+p.textContent='Plotly is not loaded.'; el.appendChild(p);
+}}
+}}
+
+function drawAll(){{
+for(var i=0;i<charts.length;i++) drawOne("{container_id}_" + i, charts[i]);
+}}
+drawAll();
+window.addEventListener('resize', drawAll);
+}})();
+</script>
+"""
+
+data_cells.append({
+"title": "Outliers — Violin + Box (Top 3 numerics by IQR outliers)",
+"output": Markup(plot_html),
+"code": (
+"dfn = df.select_dtypes(include='number').iloc[:, :max_cols]\n"
+"dfn = dfn.sample(max_rows, random_state=0) if len(dfn) > max_rows else dfn\n"
+"# rank columns by Tukey outliers (1.5*IQR) and plot violins with inner box"
+),
+"span": "eda-col-12"
+})
+
+else:
+data_cells.append({
+"title": "Outliers — Violin + Box",
+"output": "<em>No numeric columns available.</em>",
+"code": "# no numeric columns",
+"span": "eda-col-6"
+})
+except Exception as _e:
+data_cells.append({
+"title": "Outliers — Violin + Box",
+"output": f"<em>Could not render violins: {html.escape(str(_e))}</em>",
+"code": "# error during violin rendering",
+"span": "eda-col-6"
+})
+
+# 9) Missing Values table
 nulls = df.isnull().sum()
 nulls_pct = (df.isnull().mean() * 100).round(1)
 missing_df = pd.DataFrame({
-"
-"Missing
+"Column": df.columns,
+"Missing Values": nulls.values,
+"Missing (%)": nulls_pct.values
 })
-missing = missing_df[missing_df["Missing Values"] > 0]
+missing = missing_df[missing_df["Missing Values"] > 0]
 data_cells.append({
 "title": "Missing Values",
 "output": Markup(datatable_box(missing)) if not missing.empty else "<em>No missing values detected.</em>",
 "code": (
 "nulls = df.isnull().sum()\n"
 "nulls_pct = (df.isnull().mean() * 100).round(1)\n"
-"missing_df = pd.DataFrame({
+"missing_df = pd.DataFrame({\n"
+" 'Column': df.columns,\n"
+" 'Missing Values': nulls.values,\n"
+" 'Missing (%)': nulls_pct.values\n"
+"})\\n"
 "missing_df[missing_df['Missing Values'] > 0]"
-)
+),
+"span":"eda-col-4"
 })
-dtype_df = pd.DataFrame({
-"Type": df.dtypes.astype(str),
-"Non-Null Count": df.notnull().sum(),
-"Unique Values": df.nunique()
-})
-data_cells.append({
-"title": "Column Types",

-
-
-
-"
-"
-"
-
+# 9) Missingness (Top 20) – Plotly bar chart
+if not missing.empty:
+top_miss = (
+missing_df[missing_df["Missing Values"] > 0]
+.sort_values("Missing (%)", ascending=False)
+.loc[:, ["Column", "Missing (%)"]]
+.head(20)
+.reset_index(drop=True)
 )
-
+
+container_id = f"miss_plot_{uuid.uuid4().hex}"
+x_vals = [html.escape(str(c)) for c in top_miss["Column"].tolist()]
+y_vals = [float(v) for v in top_miss["Missing (%)"].tolist()]
+
+plot_html = f"""
+<div id="{container_id}" style="width:100%;height:340px;"></div>
+<script>
+(function(){{
+var x = {json.dumps(x_vals)};
+var y = {json.dumps(y_vals)};
+var data = [{{
+type: 'bar',
+x: x,
+y: y,
+hovertemplate: '%{{x}}<br>Missing: %{{y:.1f}}%<extra></extra>'
+}}];
+var layout = {{
+margin: {{l:50, r:20, t:10, b:100}},
+yaxis: {{ title: 'Missing (%)', rangemode: 'tozero' }},
+xaxis: {{ title: 'Column', tickangle: -45 }}
+}};
+if (window.Plotly && Plotly.newPlot) {{
+Plotly.newPlot("{container_id}", data, layout, {{displayModeBar:true, responsive:true}});
+}} else {{
+var p=document.createElement('div'); p.style.color='crimson'; p.style.marginTop='8px';
+p.textContent='Plotly is not loaded.'; document.getElementById("{container_id}").appendChild(p);
+}}
+}})();
+</script>
+"""
+data_cells.append({
+"title": "Missingness (Top 20)",
+"output": Markup(plot_html),
+"code": (
+"nulls = df.isnull().sum();\n"
+"nulls_pct = (\n"
+" df.isnull().mean()*100\n"
+").round(1)\n"
+"missing_df = pd.DataFrame({\n"
+" 'Column': df.columns,\n"
+" 'Missing Values': nulls.values,\n"
+" 'Missing (%)': nulls_pct.values\n"
+"})\n\n"
+"top_miss = (\n"
+" missing_df[missing_df['Missing Values'] > 0]\n"
+" .sort_values('Missing (%)', ascending=False)\n"
+" .loc[:, ['Column', 'Missing (%)']]\n"
+" .head(20)\n"
+" .reset_index(drop=True)\n"
+")\n"
+"top_miss"
+),
+"span":"eda-col-4"
+})
+
+# 11 Category Distribution — 3D doughnut (dataset-agnostic, capped 5k)
+try:
+# 1) Column universe: object / category / bool (integers remain numeric)
+cat_cols_all = df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
+
+# 2) Honour user pick if categorical; otherwise auto-pick a sensible default
+dist_param = (request.args.get("dist") or request.form.get("dist") or "").strip()
+if dist_param and dist_param in cat_cols_all:
+dist_col = dist_param
+else:
+# Auto-pick preference: 3–20 unique values excluding obvious ID-like;
+# else allow 2-level; else first categorical.
+n_total = len(df)
+uniques_loc = df.nunique(dropna=True)
+miss_pct_loc = (df.isnull().mean() * 100).round(1)
+id_like_loc = {c for c in cat_cols_all if n_total > 0 and (uniques_loc.get(c, 0) / n_total) >= 0.95}
+
+multilevel = [c for c in cat_cols_all
+if (3 <= int(uniques_loc.get(c, df[c].nunique(dropna=True))) <= 20)
+and (c not in id_like_loc)]
+if multilevel:
+# score nearer 8 levels and lower missingness
+best, best_score = "", -1e9
+for c in multilevel:
+k = int(uniques_loc.get(c, df[c].nunique(dropna=True)))
+miss = float(miss_pct_loc.get(c, (df[c].isna().mean() * 100)))
+score = -abs(k - 8) - (miss / 10.0)
+if score > best_score:
+best, best_score = c, score
+dist_col = best
+else:
+twolevel = [c for c in cat_cols_all if int(uniques_loc.get(c, df[c].nunique(dropna=True))) == 2]
+dist_col = (twolevel[0] if twolevel else (cat_cols_all[0] if cat_cols_all else ""))
+
+# 3) Build options AFTER dist_col is final (so selection sticks)
+opts = []
+for c in cat_cols_all:
+sel = " selected" if c == dist_col else ""
+opts.append(f'<option value="{html.escape(str(c))}"{sel}>{html.escape(str(c))}</option>')
+opts_html = "\n".join(opts)
+
+form_html = f"""
+<a id="dist3d"></a>
+<form method="get" action="/dashboard#dist3d"
+style="display:flex; flex-wrap:wrap; gap:10px; align-items:center; margin-bottom:8px;">
+<input type="hidden" name="section" value="explore">
+<input type="hidden" name="dataset" value="{html.escape(str(selected_dataset or ''))}">
+<label><strong>Distribution column:</strong></label>
+<select name="dist" onchange="this.form.submit()" style="min-width:200px; height:28px;">
+{opts_html}
+</select>
+</form>
+"""
+
+if dist_col:
+s = df[dist_col]
+# cap cheap counting to 5k
+if len(s) > 5000:
+s = s.sample(5000, random_state=0)
+
+# 4) Robust counting: treat NaN as "Missing", stringify labels for safety
+s = s.astype("object")
+s = s.where(~s.isna(), other="Missing")
+vc = s.value_counts(dropna=False)
+
+if vc.empty:
+raise ValueError("No values to display for the selected column.")
+
+# Top-8 + 'Other' (excluding 'Missing' which we keep separate)
+top_k = 8
+non_missing = vc.drop(index=["Missing"], errors="ignore") if "Missing" in vc.index else vc
+head = non_missing.sort_values(ascending=False).head(top_k)
+other = int(non_missing.iloc[top_k:].sum()) if len(non_missing) > top_k else 0
+miss = int(vc.get("Missing", 0))
+
+labels = [str(x) for x in head.index.tolist()]
+values = [int(v) for v in head.values.tolist()]
+if other > 0:
+labels.append("Other"); values.append(other)
+if miss > 0:
+labels.append("Missing"); values.append(miss)
+
+# colours for faux 3D (no external deps)
+k = len(labels)
+def _hsl(i, n, l=0.58, s=0.62):
+h = (i / max(1, n)) * 360.0
+return f"hsl({int(h)}, {int(s*100)}%, {int(l*100)}%)"
+top_colors = [_hsl(i, k, l=0.58) for i in range(k)]
+base_colors = [_hsl(i, k, l=0.40) for i in range(k)]
+
+container_id = f"dist3d_{uuid.uuid4().hex}"
+total = int(sum(values))
+
+plot_html = f"""
+<div id="{container_id}" class="dist3d-chart"></div>
+<script>
+(function(){{
+var el = document.getElementById("{container_id}");
+var labels = {json.dumps(labels)};
+var values = {json.dumps(values)};
+var total = {total};
+
+var base = {{
+type: 'pie', labels: labels, values: values,
+hole: 0.64, sort: false, textinfo: 'none', hoverinfo: 'skip',
+marker: {{ colors: {json.dumps(base_colors)} }},
+showlegend: false
+}};
+var top = {{
+type: 'pie', labels: labels, values: values,
+hole: 0.52, sort: false,
+textinfo: 'percent', textposition: 'inside', insidetextorientation: 'radial',
+hovertemplate: '%{{label}}<br>%{{value}} of {total:,} (%{{percent}})<extra></extra>',
+marker: {{ colors: {json.dumps(top_colors)}, line: {{ width: 1, color: 'rgba(0,0,0,0.25)' }} }},
+showlegend: true, legendgroup: 'dist'
+}};
+
+function parentWidth(){{
+return (el && el.parentElement ? el.parentElement.clientWidth : (window.innerWidth||360));
+}}
+
+// Smooth, monotonic: height = 0.65 * width, clamped [220, 520].
+function chartHeight(){{
+var w = parentWidth();
+return Math.round(Math.max(220, Math.min(520, w * 0.65)));
+}}
+
+function legendOrientation(){{
+return parentWidth() < 640 ? 'h' : 'v';
+}}
+
+function makeLayout(){{
+return {{
+margin: {{ l:10, r:10, t:10, b:10 }},
+legend: {{ orientation: legendOrientation(), x:1, xanchor:'right', y:1 }},
+uniformtext: {{ mode: 'hide', minsize: 10 }}
+}};
+}}
+
+function applySize(){{
+// Override global .plotly-graph-div {{ height:auto !important }}
+el.style.setProperty('height', chartHeight() + 'px', 'important');
+if (window.Plotly) {{
+Plotly.relayout(el, {{ 'legend.orientation': legendOrientation() }});
+Plotly.Plots.resize(el);
+}}
+}}
+
+if (window.Plotly && Plotly.newPlot) {{
+// Initial explicit height before draw
+el.style.setProperty('height', chartHeight() + 'px', 'important');
+Plotly.newPlot(el, [base, top], makeLayout(), {{ displayModeBar:true, responsive:true }})
+.then(function(){{ applySize(); }});
+window.addEventListener('resize', applySize);
+}} else {{
+var p=document.createElement('div'); p.style.color='crimson'; p.style.marginTop='8px';
+p.textContent='Plotly is not loaded.'; el.appendChild(p);
+}}
+}})();
+</script>
+"""
+
+data_cells.append({
+"title": f"Category Distribution — ({html.escape(dist_col)})",
+"output": Markup(form_html + plot_html),
+"code": (
+"dist_col = '<chosen categorical>'\n"
+"s = df[dist_col].astype('object').where(~df[dist_col].isna(), other='Missing')\n"
+"vc = s.value_counts(dropna=False)\n"
+"top_k = 8  # Top-8 + Other (+ Missing)\n"
+),
+"span": "eda-col-4"
+})
+else:
+data_cells.append({
+"title": "Category Distribution — 3D doughnut",
+"output": "<em>No categorical columns found.</em>",
+"code": "# no categorical columns",
+"span": "eda-col-4"
+})
+except Exception as _e:
+data_cells.append({
+"title": "Category Distribution — 3D doughnut",
+"output": f"<em>Could not render distribution: {html.escape(str(_e))}</em>",
+"code": "# error during distribution rendering",
+"span": "eda-col-4"
+})
+
 for cell in data_cells:
-
+cell["highlighted_code"] = Markup(_pygmentize(cell["code"]))
+
 highlighted_ai_code = _pygmentize(ai_code)
+
 return render_template(
 "dashboard.html",
 section=section,
 datasets=datasets,
 selected_dataset=selected_dataset,
 ai_outputs=ai_outputs,
-ai_code=ai_code,
+ai_code=ai_code,
 highlighted_ai_code=highlighted_ai_code if ai_code else None,
 askai_question=smx.sanitize_rough_to_markdown_task(askai_question),
-refined_question=refined_question,
+refined_question=refined_question,
+tasks=tags,
 data_cells=data_cells,
 session_id=session_id,
+llm_usage=llm_usage
 )


@@ -5672,7 +6478,7 @@ def setup_routes(smx):
 if not html_doc:
 return ("No result available.", 404)

-buf =
+buf = _std_io.BytesIO(html_doc.encode("utf-8"))
 buf.seek(0)

 # keep a copy if you wish, or free it:
@@ -5744,7 +6550,7 @@ def setup_routes(smx):
 text = re.sub(r"<[^>]+>", " ", text)
 text = re.sub(r"\n{3,}", "\n\n", text)
 text = html.unescape(text).strip()
-buf =
+buf = _std_io.BytesIO()
 doc = SimpleDocTemplate(buf, pagesize=A4, leftMargin=16*mm, rightMargin=16*mm, topMargin=16*mm, bottomMargin=16*mm)
 styles = getSampleStyleSheet()
 flow = []