syntaxmatrix 2.5.1__py3-none-any.whl → 2.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- syntaxmatrix/__init__.py +3 -3
- syntaxmatrix/commentary.py +134 -112
- syntaxmatrix/core.py +449 -338
- syntaxmatrix/dataset_preprocessing.py +218 -0
- syntaxmatrix/display.py +89 -37
- syntaxmatrix/gpt_models_latest.py +5 -4
- syntaxmatrix/profiles.py +19 -4
- syntaxmatrix/routes.py +932 -131
- syntaxmatrix/settings/model_map.py +38 -30
- syntaxmatrix/static/icons/hero_bg.jpg +0 -0
- syntaxmatrix/templates/dashboard.html +256 -55
- syntaxmatrix/utils.py +2254 -84
- {syntaxmatrix-2.5.1.dist-info → syntaxmatrix-2.5.3.dist-info}/METADATA +3 -1
- {syntaxmatrix-2.5.1.dist-info → syntaxmatrix-2.5.3.dist-info}/RECORD +17 -18
- syntaxmatrix/model_templates.py +0 -29
- syntaxmatrix/smx_task_runner.py +0 -12
- syntaxmatrix/smx_usage_example.py +0 -4
- {syntaxmatrix-2.5.1.dist-info → syntaxmatrix-2.5.3.dist-info}/WHEEL +0 -0
- {syntaxmatrix-2.5.1.dist-info → syntaxmatrix-2.5.3.dist-info}/licenses/LICENSE.txt +0 -0
- {syntaxmatrix-2.5.1.dist-info → syntaxmatrix-2.5.3.dist-info}/top_level.txt +0 -0
syntaxmatrix/core.py
CHANGED
@@ -1,7 +1,10 @@
 from __future__ import annotations
+import ast
+import textwrap
 import os, webbrowser, uuid, secrets, re
 
 from flask import Flask, Response, session, request, has_request_context
+from syntaxmatrix.agentic.agents import mlearning_agent
 from syntaxmatrix.history_store import SQLHistoryStore as Store, PersistentHistoryStore as _Store
 from collections import OrderedDict
 from syntaxmatrix.llm_store import save_embed_model, load_embed_model, delete_embed_key
@@ -17,7 +20,6 @@ from syntaxmatrix.settings.prompts import SMXAI_CHAT_ID, SMXAI_CHAT_INSTRUCTIONS
 from typing import List, Generator
 from .auth import init_auth_db
 from . import profiles as _prof
-from syntaxmatrix.utils import strip_describe_slice, drop_bad_classification_metrics
 from syntaxmatrix.smiv import SMIV
 from .project_root import detect_project_root
 from syntaxmatrix.gpt_models_latest import extract_output_text as _out, set_args
@@ -25,6 +27,8 @@ from dotenv import load_dotenv
 from html import unescape
 from .plottings import render_plotly, pyplot, describe_plotly, describe_matplotlib
 from threading import RLock
+from syntaxmatrix.settings.model_map import GPT_MODELS_LATEST
+
 
 # ──────── framework‐local storage paths ────────
 # this ensures the key & data always live under the package dir,
@@ -46,7 +50,7 @@ EDA_OUTPUT = {}  # global buffer for EDA output by session
 
 class SyntaxMUI:
     def __init__(self,
-
+                 host="127.0.0.1",
                  port="5080",
                  user_icon="👩🏿🦲",
                  bot_icon="<img src='/static/icons/favicon.png' width=20' alt='bot'/>",
@@ -76,7 +80,7 @@ class SyntaxMUI:
         self.website_description = SMXAI_WEBSITE_DESCRIPTION
         self._eda_output = {}  # {chat_id: html}
         self._eda_lock = RLock()
-
+
         db.init_db()
         self.page = ""
         self.pages = db.get_pages()
@@ -88,14 +92,15 @@ class SyntaxMUI:
         self.app_token = str(uuid.uuid4())  # NEW: Unique token for each app launch.
         self.admin_pdf_chunks = {}  # In-memory store for admin PDF chunks
         self.user_file_chunks = {}  # In-memory store of user‑uploaded chunks, scoped per chat session
-        routes.setup_routes(self)
 
+        self._last_llm_usage = None
+        routes.setup_routes(self)
+
         self._admin_profile = {}
         self._chat_profile = {}
         self._coding_profile = {}
         self._classification_profile = {}
         self._summarization_profile = {}
-        self.vision2text_profile = {}
 
         self._gpt_models_latest_prev_resp_ids = {}
         self.is_streaming = False
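
Note: `__init__` now seeds `self._last_llm_usage = None` before `routes.setup_routes(self)` runs, so route handlers can read the attribute from the very first request. A minimal sketch of the intended read-back (the `get_last_llm_usage` accessor is added further down in this diff; construction arguments are omitted and assumed to default):

    from syntaxmatrix.core import SyntaxMUI

    smx = SyntaxMUI()
    assert smx.get_last_llm_usage() is None  # nothing recorded until ai_generate_code runs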
@@ -282,12 +287,14 @@ class SyntaxMUI:
 
     @staticmethod
     def get_ui_modes():
-        return list(UI_MODES.keys())
+        return list(UI_MODES.keys())
+        # return "default", "card", "bubble", "smx"
 
     @staticmethod
     def get_themes():
         return list(DEFAULT_THEMES.keys())
 
+
     def set_theme(self, theme_name, theme=None):
         if theme_name in DEFAULT_THEMES:
             self.theme = DEFAULT_THEMES[theme_name]
@@ -319,8 +326,8 @@ class SyntaxMUI:
     def set_project_name(self, project_name):
         self.project_name = project_name
 
-    def set_favicon(self, icon):
-
+    # def set_favicon(self, icon):
+    #     self.favicon = icon
 
     def set_site_logo(self, logo):
         self.site_logo = logo
@@ -453,7 +460,7 @@ class SyntaxMUI:
         except Exception as e:
             self.error(f"Plotly rendering failed: {e}")
 
-
+
     def write(self, content):
         self.bot_message(content)
 
@@ -465,15 +472,19 @@ class SyntaxMUI:
         if end:  # final flush → history
             self.bot_message(chunk)  # persists the final message
 
+
     def error(self, content):
         self.bot_message(f'<div style="color:red; font-weight:bold;">{content}</div>')
 
+
     def warning(self, content):
         self.bot_message(f'<div style="color:orange; font-weight:bold;">{content}</div>')
 
+
     def success(self, content):
         self.bot_message(f'<div style="color:green; font-weight:bold;">{content}</div>')
 
+
     def info(self, content):
         self.bot_message(f'<div style="color:blue;">{content}</div>')
 
@@ -503,15 +514,18 @@ class SyntaxMUI:
     # ──────────────────────────────────────────────────────────────
     # *********** LLM CLIENT HELPERS **********************
     # ──────────────────────────────────────────────────────────────
-    def
-        self.
+    def set_prompt_profile(self, profile):
+        self.ai_chat_id = profile
 
-
-
+
+    def set_prompt_instructions(self, instructions):
+        self.ai_chat_instructions = instructions
+
 
     def set_website_description(self, desc):
         self.website_description = desc
 
+
     def embed_query(self, q):
         return embed_text(q)
 
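
Note: the two setters that appear truncated on the removed side are completed here as `set_prompt_profile` and `set_prompt_instructions`, which simply assign `ai_chat_id` and `ai_chat_instructions`. A hedged usage sketch (the argument values are illustrative, not taken from the package):

    smx = SyntaxMUI()
    smx.set_prompt_profile("smxai-chat")              # stored on smx.ai_chat_id
    smx.set_prompt_instructions("Answer concisely.")  # stored on smx.ai_chat_instructions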
@@ -553,16 +567,8 @@ class SyntaxMUI:
     def delete_embed_key(self):
         return delete_embed_key()
 
-
-
-
-    def get_stream_args(self):
-        return self.stream_args
-    def stream(self):
-        return self.is_streaming
-
-    def gpt_models_latest(self):
-        from syntaxmatrix.settings.model_map import GPT_MODELS_LATEST
+
+    def get_gpt_models_latest(self):
         return GPT_MODELS_LATEST
 
     def get_text_input_value(self, key, default=""):
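
Note: `gpt_models_latest()` is renamed to `get_gpt_models_latest()` and now returns the `GPT_MODELS_LATEST` constant imported once at module top, rather than re-importing it on every call. Reduced to a standalone sketch:

    from syntaxmatrix.settings.model_map import GPT_MODELS_LATEST

    def get_gpt_models_latest():
        # plain getter over the module-level constant; no per-call import
        return GPT_MODELS_LATEST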
@@ -575,13 +581,23 @@ class SyntaxMUI:
             return q, None
         return q, intent
 
+    def enable_stream(self):
+        self.is_streaming = True
+
+    def stream(self):
+        return self.is_streaming
+
+    def get_stream_args(self):
+        return self.stream_args
+
+
     def classify_query_intent(self, query: str) -> str:
-
+        from syntaxmatrix.gpt_models_latest import extract_output_text as _out, set_args
+
         if not self._classification_profile:
-            classification_profile = _prof.get_profile('classification') or _prof.get_profile('admin')
+            classification_profile = _prof.get_profile('classification') or _prof.get_profile('chat') or _prof.get_profile('admin')
             if not classification_profile:
-
-                return None
+                return {"Error": "Set a profile for Classification"}
             self._classification_profile = classification_profile
             self._classification_profile['client'] = _prof.get_client(classification_profile)
 
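
Note: the streaming accessors relocated here now sit together: `enable_stream()` flips `is_streaming` (initialised to False in `__init__`), `stream()` reads it back, and `get_stream_args()` returns the dict that `stream_process_query(...)` fills. A hypothetical round trip (driver code assumed, including that `stream_args` is initialised elsewhere as a dict):

    smx = SyntaxMUI()
    assert smx.stream() is False
    smx.enable_stream()
    assert smx.stream() is True
    smx.stream_process_query("q", context="", conversations=[], sources=[])
    args = smx.get_stream_args()  # {'query': 'q', 'context': '', ...}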
@@ -590,13 +606,13 @@ class SyntaxMUI:
         _model = self._classification_profile['model']
 
         # New instruction format with hybrid option
-        _intent_profile = "You are an intent classifier. Respond ONLY with the intent name.
+        _intent_profile = "You are an intent classifier. Respond ONLY with the intent name."
         _instructions = f"""
         Classify the given query into ONE of these intents You must return ONLY the intent name with no comment or any preamble:
         - "none": Casual chat/greetings
         - "user_docs": Requires user-uploaded documents
-        - "system_docs": Requires company
-        - "hybrid": Requires BOTH
+        - "system_docs": Requires company knowledge/docs
+        - "hybrid": Requires BOTH user docs AND company docs
 
         Examples:
         Query: "Hi there!" → none
@@ -606,11 +622,15 @@ class SyntaxMUI:
         Query: "What is the weather today?" → none
         Query: "Cross-reference the customer feedback from my uploaded survey results with our product's feature list in the official documentation." → hybrid
 
-        Now classify
+        Now classify:
         Query: "{query}"
         Intent:
         """
-
+        openai_sdk_messages = [
+            {"role": "system", "content": _intent_profile},
+            {"role": "user", "content": _instructions}
+        ]
+
         def google_classify_query():
             response = _client.models.generate_content(
                 model=_model,
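
Note: the classifier prompt is now assembled once as `openai_sdk_messages` and reused by the OpenAI-SDK path further down (previously the list literal was inlined inside the call). The two-message shape, isolated as a sketch with an abridged instruction string:

    _intent_profile = "You are an intent classifier. Respond ONLY with the intent name."
    _instructions = 'Classify the given query ... Query: "Hi there!" Intent:'  # abridged
    openai_sdk_messages = [
        {"role": "system", "content": _intent_profile},
        {"role": "user", "content": _instructions},
    ]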
@@ -638,8 +658,8 @@ class SyntaxMUI:
             try:
                 response = _client.messages.create(
                     model=_model,
-                    max_tokens=
-                    system
+                    max_tokens=1024,
+                    system=_intent_profile,
                     messages=[{"role": "user", "content":_instructions}],
                     stream=False,
                 )
@@ -651,11 +671,8 @@ class SyntaxMUI:
         def openai_sdk_classify_query():
             try:
                 response = _client.chat.completions.create(
-                    model
-                    messages
-                        {"role": "system", "content": _intent_profile},
-                        {"role": "user", "content": _instructions}
-                    ],
+                    model=_model,
+                    messages=openai_sdk_messages,
                     temperature=0,
                     max_tokens=100
                 )
@@ -665,21 +682,25 @@ class SyntaxMUI:
                 return f"Error!"
 
         if _provider == "google":
-
-
-
+            intent = google_classify_query()
+            return intent
+        if _model in self.get_gpt_models_latest():
+            intent = gpt_models_latest_classify_query()
+            return intent
         if _provider == "anthropic":
-
+            intent = anthropic_classify_query()
+            return intent
         else:
-
-
+            intent = openai_sdk_classify_query()
+            return intent
+
+
     def generate_contextual_title(self, chat_history):
 
         if not self._summarization_profile:
-            summarization_profile = _prof.get_profile('summarization') or _prof.get_profile('admin')
+            summarization_profile = _prof.get_profile('summarization') or _prof.get_profile('chat') or _prof.get_profile('admin')
             if not summarization_profile:
-
-                return None
+                return {"Error": "Chat profile not set yet."}
 
             self._summarization_profile = summarization_profile
             self._summarization_profile['client'] = _prof.get_client(summarization_profile)
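
Note: the classify dispatch body is filled in here: Google first, then any model listed in `get_gpt_models_latest()`, then Anthropic, with the generic OpenAI-SDK path as the fallback. The equivalent control flow, extracted as a standalone sketch:

    def pick_classifier(provider: str, model: str, latest: list) -> str:
        # mirrors the branch order introduced in this hunk
        if provider == "google":
            return "google"
        if model in latest:
            return "gpt_models_latest"
        if provider == "anthropic":
            return "anthropic"
        return "openai_sdk"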
@@ -706,14 +727,14 @@ class SyntaxMUI:
             except Exception as e:
                 return f"Summary agent error!"
 
-        def gpt_models_latest_generated_title(
+        def gpt_models_latest_generated_title():
             try:
                 args = set_args(
                     model=_model,
                     instructions=_title_profile,
                     input=_instructions,
-                    reasoning_effort=reasoning_effort,
-                    verbosity=verbosity,
+                    # reasoning_effort=reasoning_effort,
+                    # verbosity=verbosity,
                 )
 
                 resp = _client.responses.create(**args)
@@ -725,7 +746,7 @@ class SyntaxMUI:
             try:
                 response = _client.messages.create(
                     model=_model,
-                    max_tokens=
+                    max_tokens=50,
                     system=_title_profile,
                     messages=[{"role": "user", "content":_instructions}],
                     stream=False,
@@ -739,12 +760,11 @@ class SyntaxMUI:
                 { "role": "system", "content": _title_profile },
                 { "role": "user", "content": _instructions },
             ]
-
             try:
                 response = _client.chat.completions.create(
                     model=_model,
                     messages=prompt,
-                    temperature=0,
+                    temperature=0.3,
                     max_tokens=50
                 )
                 title = response.choices[0].message.content.strip().lower()
@@ -754,7 +774,7 @@ class SyntaxMUI:
 
         if _provider == "google":
             title = google_generated_title()
-        elif _model in self.
+        elif _model in self.get_gpt_models_latest():
             title = gpt_models_latest_generated_title()
         elif _provider == "anthropic":
             title = anthropic_generated_title()
@@ -762,26 +782,22 @@ class SyntaxMUI:
             title = openai_sdk_generated_title()
         return title
 
+
     def stream_process_query(self, query, context, conversations, sources):
         self.stream_args['query'] = query
         self.stream_args['context'] = context
         self.stream_args['conversations'] = conversations
         self.stream_args['sources'] = sources
 
+
     def process_query_stream(self, query: str, context: str, history: list, stream=True) -> Generator[str, None, None]:
 
         if not self._chat_profile:
             chat_profile = _prof.get_profile("chat") or _prof.get_profile("admin")
             if not chat_profile:
-                yield
-
-
-                <p style='color:red;'>Error: Chat profile is not configured. Add a chat profile inside the admin panel or contact your administrator.
-                </p>
-                </div>
-                """
-                )
-                return
+                yield """<p style='color:red;'>Error: Chat profile is not configured. Add a chat profile inside the admin panel or contact your administrator.</p>
+                """
+                return None
             self._chat_profile = chat_profile
             self._chat_profile['client'] = _prof.get_client(chat_profile)
 
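
Note: when no chat profile exists, `process_query_stream` now yields a single HTML error chunk and returns, replacing the old malformed multi-line `yield` block, so callers can keep one consumption loop for both the happy and error paths. A hypothetical consumer:

    for chunk in smx.process_query_stream("Hello", context="", history=[]):
        print(chunk, end="", flush=True)  # error text arrives as an ordinary chunk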
@@ -798,23 +814,21 @@ class SyntaxMUI:
         """
 
         try:
-            if _provider == "google":
-
-                    types.Content(
-                        role="user",
-                        parts=[
-                            types.Part.from_text(text=f"{self.smxai_identity}\n\n{_contents}"),
-                        ],
-                    ),
-                ]
-
+            if _provider == "google":  # Google, non openai skd series
+
                 for chunk in _client.models.generate_content_stream(
                     model=_model,
-                    contents=
+                    contents=_contents,
+                    config=types.GenerateContentConfig(
+                        system_instruction=self.smxai_identity,
+                        temperature=0.3,
+                        max_output_tokens=1024,
+                    ),
                 ):
+
                     yield chunk.text
 
-            elif _model in self.
+            elif _provider == "openai" and _model in self.get_gpt_models_latest():  # GPt 5 series
                 input_prompt = (
                     f"{self.smxai_instructions}\n\n"
                     f"Generate a response to this query:\n{query}\n"
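
Note: the streaming Google path drops the hand-built `types.Content(...)` payload and instead passes `contents=_contents` plus a `types.GenerateContentConfig` carrying the system instruction, which matches the google-genai SDK's config parameter. The call shape, isolated as a sketch (client setup, model id, and strings are assumptions for illustration):

    from google import genai
    from google.genai import types

    client = genai.Client()  # assumes GOOGLE_API_KEY is set in the environment
    for chunk in client.models.generate_content_stream(
        model="gemini-2.0-flash",  # illustrative model id
        contents="user question here",
        config=types.GenerateContentConfig(
            system_instruction="system identity text",
            temperature=0.3,
            max_output_tokens=1024,
        ),
    ):
        print(chunk.text, end="")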
@@ -866,18 +880,15 @@ class SyntaxMUI:
         if not self._chat_profile:
             chat_profile = _prof.get_profile("chat") or _prof.get_profile("admin")
             if not chat_profile:
-                return
-
-
-                <p style='color:red;'>Error: Chat profile is not configured. Add a chat profile inside the admin panel or contact your administrator.
-                </p>
-                </div>
-                """
-                )
+                return """<p style='color:red;'>Error: Chat profile is not configured. Add a chat profile inside the admin panel or contact your administrator.</p>
+                """
+                return
 
             self._chat_profile = chat_profile
             self._chat_profile['client'] = _prof.get_client(chat_profile)
-
+        _provider = self._chat_profile['provider']
+        _client = self._chat_profile['client']
+        _model = self._chat_profile['model']
         _contents = f"""
         {self.smxai_instructions}\n\n
         Question: {query}\n
@@ -885,19 +896,31 @@ class SyntaxMUI:
         History: {history}\n\n
         Use conversation continuity if available.
         """
-
-
-
-
+
+        openai_sdk_prompt = [
+            {"role": "system", "content": self.smxai_identity},
+            {"role": "user", "content": f"""{self.smxai_instructions}\n\n
+            Generate response to this query: {query}\n
+            based on this context:\n{context}\n
+            and history:\n{history}\n\n
+            Use conversation continuity if available.)
+            """
+            },
+        ]
 
         def google_process_query():
             try:
                 response = _client.models.generate_content(
                     model=_model,
-                    contents=
+                    contents=_contents,
+                    config=types.GenerateContentConfig(
+                        system_instruction=self.smxai_identity,
+                        temperature=0.3,
+                        max_output_tokens=1024,
+                    ),
                 )
                 answer = response.text
-
+
                 # answer = strip_html(answer)
                 return answer
             except Exception as e:
@@ -945,11 +968,12 @@ class SyntaxMUI:
                 response = _client.messages.create(
                     model=_model,
                     max_tokens=1024,
-                    system=self.smxai_identity,
+                    system=self.self.smxai_identity,
                     messages=[{"role": "user", "content":_contents}],
                     stream=False,
                 )
-                return response.content[0].text.strip()
+                return response.content[0].text.strip()
+
             except Exception as e:
                 return f"Error: {str(e)}"
 
@@ -958,16 +982,7 @@ class SyntaxMUI:
             try:
                 response = _client.chat.completions.create(
                     model=_model,
-                    messages
-                        {"role": "system", "content": self.smxai_identity},
-                        {"role": "user", "content": f"""{self.smxai_instructions}\n\n
-                        Generate response to this query: {query}\n
-                        based on this context:\n{context}\n
-                        and history:\n{history}\n\n
-                        Use conversation continuity if available.
-                        """
-                        },
-                    ],
+                    messages=openai_sdk_prompt,
                     stream=False,
                 )
 
@@ -979,278 +994,374 @@ class SyntaxMUI:
 
         if _provider == "google":
             return google_process_query()
-        if _provider == "openai" and _model in self.
+        if _provider == "openai" and _model in self.get_gpt_models_latest():
             return gpt_models_latest_process_query(self._gpt_models_latest_prev_resp_ids.get(self.get_session_id()))
         if _provider == "anthropic":
             return anthropic_process_query()
         return openai_sdk_process_query()
 
-
+
+    def repair_python_cell(self, py_code: str) -> str:
+
+        _CELL_REPAIR_RULES = """
+        Fix the Python cell to satisfy:
+        - Single valid cell; imports at the top.
+        - Do not import or invoke or use 'python-dotenv' or 'dotenv' because it's not needed.
+        - No top-level statements between if/elif/else branches.
+        - Regression must use either sklearn with train_test_split (then X_test exists) and R^2/MAE/RMSE,
+          or statsmodels OLS. No accuracy_score in regression.
+        - Keep all plotting + savefig + BytesIO + display inside the branch that created the figure.
+        - Return ONLY the corrected cell.
+        """
+        code = textwrap.dedent(py_code or "").strip()
+        needs_fix = False
+        if re.search(r"\baccuracy_score\b", code) and re.search(r"\bLinearRegression\b|\bOLS\b", code):
+            needs_fix = True
+        if re.search(r"\bX_test\b", code) and not re.search(r"\bX_test\s*=", code):
+            needs_fix = True
+        try:
+            ast.parse(code)
+        except SyntaxError:
+            needs_fix = True
+        if not needs_fix:
+            return code
+        _prompt = f"```python\n{code}\n```"
+
+        repair_profile = _prof.get_profile("vision2text") or _prof.get_profile("admin")
+        if not repair_profile:
+            return (
+                '<div class="smx-alert smx-alert-warn">'
+                'No LLM profile configured for <code>coding</code> (or <code>admin</code>). <br>'
+                'Please, add the LLM profile inside the admin panel or contact your Administrator.'
+                '</div>'
+            )
+
+        _client = _prof.get_client(repair_profile)
+        _provider = repair_profile['provider'].lower()
+        _model = repair_profile['model']
+
+        #1 Google
+        if _provider == "google":
+            from google.genai import types
+
+            fixed = _client.models.generate_content(
+                model=_model,
+                contents=_prompt,
+                config=types.GenerateContentConfig(
+                    system_instruction=_CELL_REPAIR_RULES,
+                    temperature=0.8,
+                    max_output_tokens=1024,
+                ),
+            )
+
+        #2 Openai
+        elif _provider == "openai" and _model in GPT_MODELS_LATEST:
+
+            args = set_args(
+                model=_model,
+                instructions=_CELL_REPAIR_RULES,
+                input=[{"role": "user", "content": _prompt}],
+                previous_id=None,
+                store=False,
+                reasoning_effort="medium",
+                verbosity="medium",
+            )
+            fixed = _out(_client.responses.create(**args))
 
-
-
-        if not coding_profile:
-            # tell the user exactly what to configure
-            return (
-                '<div class="smx-alert smx-alert-warn">'
-                'No LLM profile configured for <code>coding</code> (or <code>admin</code>). '
-                'Please, contact your Administrator.'
-                '</div>'
-            )
+        # Anthropic
+        elif _provider == "anthropic":
 
-
-
+            fixed = _client.messages.create(
+                model=_model,
+                max_tokens=1024,
+                system=_CELL_REPAIR_RULES,
+                messages=[{"role": "user", "content":_prompt}],
+                stream=False,
+            )
+
+        # OpenAI SDK
+        else:
+            fixed = _client.chat.completions.create(
+                model=_model,
+                messages=[
+                    {"role": "system", "content":_CELL_REPAIR_RULES},
+                    {"role": "user", "content":_prompt},
+                ],
+                max_tokens=1024,
+            )
+
+        try:
+            ast.parse(fixed);
+            return fixed
+        except Exception:
+            return code
 
-
-
-        _model = self._coding_profile['model']
+    def get_last_llm_usage(self):
+        return getattr(self, "_last_llm_usage", None)
 
-
-        ALLOWED_COLUMNS = list(df.columns)
+    def ai_generate_code(self, refined_question, tasks, df):
 
-
-
-
-
+        def normalise_llm_code(s: str) -> str:
+            s = s.replace("\t", " ")
+            s = textwrap.dedent(s)
+            lines = s.splitlines()
 
-
-
-
-
-
-
-
-
-
-
-
-        2) Import everything you use explicitly. Assume: pandas≥2, numpy≥1.25, matplotlib≥3.8, seaborn≥0.13, scikit-learn≥1.4 are available.
-        3) **Avoid deprecated / removed APIs**, e.g.:
-           - pandas: do not use `.append`, `.ix`, `.as_matrix`, `DataFrame.select_dtypes(include='category')` is OK, but prefer current patterns.
-           - seaborn: do not use `distplot`, `pairplot` on very large data without sampling; prefer `histplot`, `displot`, `regplot`, or FacetGrid with `.map_dataframe`.
-           - scikit-learn: import from `sklearn.model_selection` (not `sklearn.cross_validation`); for confusion matrices use `ConfusionMatrixDisplay.from_estimator`; set `random_state=42` where relevant.
-        4) Be **defensive**:
-           - Verify required columns exist; if any are missing, raise `ValueError("Missing columns: ...")` early.
-           - Handle missing values sensibly (e.g., drop rows for simple EDA; use `ColumnTransformer` + `SimpleImputer` for modeling).
-           - For categorical features in ML, use `OneHotEncoder(handle_unknown="ignore")` inside a `Pipeline`/`ColumnTransformer` (no `LabelEncoder` on features).
-        5) Keep it **fast** (kernel timeout ~8s):
-           - For plots on large frames (>20k rows), downsample to ~1,000 rows (`df.sample(1000, random_state=42)`) unless aggregation is more appropriate.
-           - Prefer vectorized ops; avoid O(n²) Python loops.
-        6) Always **produce at least one visible result** at the end:
-           - If plotting with matplotlib/seaborn: call `plt.tight_layout(); plt.show()`.
-           - If producing a table or metrics: from `syntaxmatrix.display import show` then `show(object_or_dataframe)`.
-        7) Follow task type conventions:
-           - **EDA/Stats**: compute the requested stat, then show a relevant table (e.g., summary/crosstab) or plot.
-           - **Classification**: train/valid split (`train_test_split`), build a pipeline with scaling/encoding as needed, fit, show accuracy **and** a confusion matrix via `ConfusionMatrixDisplay.from_estimator(...); plt.show()`. Also show `classification_report` as a dataframe if short.
-           - **Regression**: train/valid split, pipeline as needed, fit, show R² and MAE; plot predicted vs actual scatter.
-           - **Correlation/Chi-square/ANOVA**: compute the statistic + p-value and show a concise result table (with `show(...)`) and, when sensible, a small plot (heatmap/bar).
-        8) Don't mutate or recreate target columns if they already exist (e.g., if asked to “predict TARGET”, use `y = df['TARGET']` as-is).
-        9) Keep variable names short and clear; prefer `num_cols` / `cat_cols` discovery by dtype.
-        10) You MUST NOT reference any column outside Allowed columns: {ALLOWED_COLUMNS}\n.
-        11) If asked to predict/classify, choose the target by matching the task text to Allowed columns: {ALLOWED_COLUMNS}\n and never invent a new name (e.g., 'whether', 'the').
-        </Hard requirements>
-
-        <Output>
-        Return **only runnable Python** that:
-        - Imports what it needs,
-        - Validates columns,
-        - Solves: {question},
-        - And ends with at least one visible output (`show(...)` and/or `plt.show()`).
-        </Output>
-        """
+            # drop leading blank lines
+            while lines and not lines[0].strip():
+                lines.pop(0)
+
+            # if everything is still indented >=4 spaces, shift left
+            indents = [len(l) - len(l.lstrip(" ")) for l in lines if l.strip()]
+            if indents and min(indents) >= 4:
+                m = min(indents)
+                lines = [l[m:] if len(l) >= m else l for l in lines]
+
+            return "\n".join(lines)
 
-
-
-        # Combine system prompt and instructions for Gemini
-
-        # Gemini expects a simple generate_content call with the model and contents
-        response = _client.models.generate_content(
-            model=_model,
-            contents=f"{ai_profile}\n\n{instructions}"
-        )
-
-        # Extract text from response
-        if hasattr(response, 'text'):
-            return response.text
-        elif hasattr(response, 'candidates') and response.candidates:
-            candidate = response.candidates[0]
-            if hasattr(candidate.content, 'parts'):
-                return ''.join(part.text for part in candidate.content.parts if hasattr(part, 'text'))
-        return str(response)
-        except Exception as e:
-            return f"Error!"
-
-        # except Exception as e:
-        # return """
-        # import pandas as pd
-        # import matplotlib.pyplot as plt
-        # import seaborn as sns
-        # import numpy as np
-        # import io
-        # import base64
-        # from syntaxmatrix.display import show
-
-        # print("Basic DataFrame Info:")
-        # print(f"Shape: {df.shape}")
-        # print("\\nColumns and dtypes:")
-        # print(df.dtypes)
-        # print("\\nBasic statistics:")
-        # show(df.describe())
-
-        # print("\\nFirst few rows:")
-        # show(df.head())
-
-        # # Generate a simple visualization based on available columns
-        # plt.figure(figsize=(10, 6))
-
-        # if len(df.columns) >= 2:
-        #     # Try to find numeric columns for scatter plot
-        #     numeric_cols = df.select_dtypes(include=['number']).columns
-        #     if len(numeric_cols) >= 2:
-        #         sns.scatterplot(data=df, x=numeric_cols[0], y=numeric_cols[1])
-        #         plt.title(f"Scatter plot: {numeric_cols[0]} vs {numeric_cols[1]}")
-        #         plt.tight_layout()
-        #         plt.show()
-        #     else:
-        #         # Use first column for bar plot
-        #         top_values = df[df.columns[0]].value_counts().head(10)
-        #         top_values.plot(kind='bar')
-        #         plt.title(f"Top 10 values in {df.columns[0]}")
-        #         plt.tight_layout()
-        #         plt.show()
-        # else:
-        #     # Single column analysis
-        #     if len(df.columns) == 1:
-        #         col_name = df.columns[0]
-        #         if df[col_name].dtype in ['object', 'category']:
-        #             df[col_name].value_counts().head(10).plot(kind='bar')
-        #             plt.title(f"Value counts for {col_name}")
-        #         else:
-        #             df[col_name].hist(bins=20)
-        #             plt.title(f"Distribution of {col_name}")
-        #         plt.tight_layout()
-        #         plt.show()
-        #     else:
-        #         print("Insufficient columns for detailed analysis")
-        #         show(df)
-        # """
-
-        def gpt_models_latest_generate_code(reasoning_effort = "medium", verbosity = "medium"):
-            # verbosities = ["low", "medium", "high"] # default is "low"
-            # reasoning_efforts = ["minimal", "low", "medium", "high"] # default is "medium"
-
-            if _model == "gpt-5-mini":
-                reasoning_effort = "high"
-            elif _model == "gpt-5-high":
-                reasoning_effort = "high"
-                verbosity = "high"
-            try:
-                args = set_args(
-                    model=_model,
-                    instructions=ai_profile,
-                    input=instructions,
-                    reasoning_effort=reasoning_effort,
-                    verbosity=verbosity,
-                )
-
-                resp = _client.responses.create(**args)
-                code = _out(resp)
-                return code
-            except Exception as e:
-                return f"Error!"
+        CONTEXT = f"Columns: {list(df.columns)}\n\nDtypes: {df.dtypes.astype(str).to_dict()}\n\n"
+        AVAILABLE_COLUMNS = list(df.columns)
 
-
+        # --- SMX: normalise tasks coming from intent agent ---
+        if isinstance(tasks, str):
+            import json, ast, re
             try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                tasks_parsed = json.loads(tasks)
+            except Exception:
+                try:
+                    tasks_parsed = ast.literal_eval(tasks)
+                except Exception:
+                    tasks_parsed = re.findall(r"[A-Za-z_]+", tasks)
+            tasks = tasks_parsed
+        if not isinstance(tasks, list):
+            tasks = [str(tasks)]
+        tasks = [str(t).strip().lower() for t in tasks if str(t).strip()]
+
+        ai_profile = """
+        - You are a Python expert specializing in data science and machine learning.
+        - Your task is to generate a single, complete, production-quality, executable Python script for a Jupyter-like Python kernel, based on the given instructions.
+        - The dataset is already loaded as a pandas DataFrame named `df` (no file I/O or file uploads).
+        - Make a copy of `df` and name it `df_copy`. Make sure `df_copy` is preprocessed and cleaned, named `df_cleaned`, if not already done so. Then use `df_cleaned` to perform the ML tasks described in the given context.
+        - Select your features and target, from `df_cleaned`, with care and name it `required_cols`
+        - Create your 'df_filtered by doing: df_filtered = df_cleaned[required_cols].
+        - Use the {TEMPLATE_CATALOGUE} below to educate yourself on which visualizations you will implement in the code.
+        - The final output MUST be the complete, executable Python code only, enclosed in a single markdown code block (```python ... ```), which is required to fulfill the user's request. See the {tasks} below.
+        - Do not include any explanatory text or markdown outside the code block.
+        """
+
+        TEMPLATE_CATALOGUE = """
+        ### Available SyntaxMatrix templates (use these instead of inventing new helpers)
+
+        Visualisation templates (dataset-agnostic):
+        - viz_pie(df, category_col=None, top_k=8): pie/donut shares within a category.
+        - viz_stacked_bar(df, x=None, hue=None, normalise=True): composition across groups.
+        - viz_count_bar(df, category_col=None, top_k=12): counts/denominators by category.
+        - viz_box(df, x=None, y=None): spread/outliers of numeric by category.
+        - viz_scatter(df, x=None, y=None, hue=None): relationship between two numeric vars.
+        - viz_distribution(df, col=None): histogram-style distribution for numeric.
+        - viz_kde(df, col=None): density curve for numeric.
+        - viz_area(df, time_col=None, y_col=None): area/trend over time.
+        - viz_line(df, x=None, y=None, hue=None): line/trend plot.
+
+        ML/stat templates:
+        - classification(df): standard classification pipeline + metrics + plots.
+        - regression(df): standard regression pipeline + metrics + plots.
+        - clustering(df): clustering workflow + cluster plots.
+        - anomaly_detection(df)
+        - ts_anomaly_detection(df)
+        - time_series_forecasting(df)
+        - time_series_classification(df, entity_col, time_col, target_col)
+        - dimensionality_reduction(df)
+        - feature_selection(df)
+        - eda_overview(df)
+        - eda_correlation(df)
+        - multilabel_classification(df, label_cols)
+        - recommendation(df)
+        - topic_modelling(df)
+        """
+
+        instructions = (
+            "### Context"
+            f"- DataFrame - (`df`): {df}"
+            f"- Schema (names → dtypes): {CONTEXT}"
+            f"- Row count: {len(df)}"
+            f"- Task description: {refined_question}"
+            f"- Tasks: {tasks}"
+            f"- Available columns: {AVAILABLE_COLUMNS}"
+            f"- Template catalogue: {TEMPLATE_CATALOGUE}"
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-
+        """
+        ### Template rules
+        - You MAY call a template if it matches the task.
+        - Do NOT invent template names.
+        - If no template fits, write minimal direct pandas/sklearn/seaborn code instead.
+        - Keep the solution short: avoid writing wrappers/utilities already handled by SyntaxMatrix hardener.
+
+        #### Template selection hint examples:
+        - If the task asks for pie/donut/composition shares → use viz_pie.
+        - If it asks for denominators/counts per category → viz_count_bar.
+        - If it asks for spread/outliers/comparison across groups → viz_box.
+        - If it asks for relationship / “X vs Y” → viz_scatter.
+        - If it asks for trend over time → viz_line or viz_area.
+
+        ### Hard requirements
+        1) Code only. No markdown, no comments, no explanations.
+        2) Import everything you use explicitly.
+           - Use pandas/numpy/matplotlib by default.
+           - Seaborn may be unavailable at runtime; **do not import seaborn inside your code**.
+           - If you call sns.*, assume sns is already defined by the framework.
+        3) Avoid deprecated / removed APIs**, e.g.:
+           - pandas: do not use `.append`, `.ix`, `.as_matrix`; prefer current patterns.
+           - seaborn: do not use `distplot`; avoid `pairplot` on very large data unless sampling.
+           - scikit-learn: import from `sklearn.model_selection` (not `sklearn.cross_validation`);
+             set `random_state=42` where relevant.
+        4) Be defensive, but avoid hard-failing on optional fields:
+           - If the primary column, needed to answer the question, is missing, review your copy of the `df` again.
+             Make sure that you selected the proper column.
+             Never use a column/variable which isn't available or defined.
+           - If a secondary/extra column is missing, show a warning with `show(...)` and continue using available fields.
+           - Handle missing values sensibly (drop rows for simple EDA; use `ColumnTransformer` + `SimpleImputer` for modelling).
+           - For categorical features in ML, use `OneHotEncoder(handle_unknown="ignore")`
+             inside a `Pipeline`/`ColumnTransformer` (no `LabelEncoder` on features).
+        5) Keep it fast (kernel timeout ~8s):
+           - For plots on large frames (>20k rows), downsample to ~1,000 rows
+             (`df.sample(1000, random_state=42)`) unless aggregation is more appropriate.
+           - Prefer vectorised ops; avoid O(n²) Python loops.
+        6) Keep the solution compact:
+           - Do not define large helper libraries or long “required column” sets.
+           - Aim for ≤120 lines excluding imports.
+        7) Always produce at least one visible result at the end:
+           - If plotting with matplotlib/seaborn: call `plt.tight_layout(); plt.show()`.
+           - If producing a table or metrics:
+             `from syntaxmatrix.display import show` then `show(object_or_dataframe)`.
+        8) Follow task type conventions:
+           - **EDA/Stats**: compute the requested stat, then show a relevant table
+             (e.g., summary/crosstab) or plot.
+           - **Classification**: train/valid split (`train_test_split`), pipeline with scaling/encoding,
+             fit, show accuracy and a confusion matrix via
+             `ConfusionMatrixDisplay.from_estimator(...); plt.show()`.
+             Also show `classification_report` as a dataframe if short.
+           - **Regression**: train/valid split, pipeline as needed, fit, show R² and MAE;
+             plot predicted vs actual scatter.
+           - **Correlation/Chi-square/ANOVA**: compute the statistic + p-value and show a concise
+             result table (with `show(...)`) and, when sensible, a small plot (heatmap/bar).
+        9) Don't mutate or recreate target columns if they already exist.
+        10) Keep variable names short and clear; prefer `num_cols` / `cat_cols` discovery by dtype.
+        11) You MUST NOT reference any column outside Available columns: {AVAILABLE_COLUMNS}.
+        12) If asked to predict/classify, choose the target by matching the task text to Allowed columns
+            and never invent a new name.
 
+        #### Cohort rules
+        When you generate plots for cohorts or categories, you MUST obey these rules:
+        1) ALWAYS guard cohort masks:
+           - After you define something like:
+             _mask_a = (df['BMI'] < 18.5) & df['BMI'].notna()
+             _mask_b = ~(df['BMI'] < 18.5) & df['BMI'].notna()
+             compute their sizes:
+             n_a = int(_mask_a.sum())
+             n_b = int(_mask_b.sum())
+           - If a mask has no rows (or almost none), do NOT draw an empty plot.
+             Instead call:
+             show(f"Skipping cohort '{label}': no rows after filtering.")
+             and return.
+
+        2) Before any groupby / crosstab for a plot:
+           - Fill missing categories so groupby does not drop everything:
+             df[col] = df[col].fillna("Unknown")
+           - After building the table:
+             tab = tmp.groupby([...]).size().unstack(...).fillna(0)
+             ALWAYS check:
+             if tab.empty:
+                 show(f"Skipping plot for {col}: no data after grouping.")
+                 continue
+             Only call .plot(...) if the table is non-empty.
+
+        3) For value_counts-based plots:
+           - If the Series is empty after filtering (len(s) == 0),
+             do NOT draw a figure. Just call:
+             show(f"No data available to plot for {col} in this cohort.")
+             and skip.
+
+        4) Never try to “hide” an error with a blank plot.
+           A blank chart is treated as a bug. If there is no data, explain it
+           clearly using show(...), and avoid calling matplotlib/Seaborn.
+
+        5) Never use print(...). All user-visible diagnostics go through show(...).
 
-
-
-
-
-
-
-
-
-
-
+
+        ### Output
+        Return only runnable Python that:
+        - Imports what it needs,
+        - Validates columns,
+        - Visualize tables, charts, and graphs, each with appropriate caption.
+        - Solution: {tasks} to solve {refined_question},
+        - And ends with at least 3 visible output (`show(...)` and/or `plt.show()`).
+        """)
+
+        if not self._coding_profile:
+            coding_profile = _prof.get_profile("coding") or _prof.get_profile("admin")
+            if not coding_profile:
+                return (
+                    '<div class="smx-alert smx-alert-warn">'
+                    'No LLM profile configured for <code>coding</code> (or <code>admin</code>). <br>'
+                    'Please, add the LLM profile inside the admin panel or contact your Administrator.'
+                    '</div>'
                 )
-            return response.choices[0].message.content
-        except Exception as e:
-            return "Error!"
 
-
-
-
-
-
-
-
-
-
+            self._coding_profile = coding_profile
+            self._coding_profile['client'] = _prof.get_client(coding_profile)
+
+        # code = mlearning_agent(instructions, ai_profile, self._coding_profile)
+        code, usage = mlearning_agent(instructions, ai_profile, self._coding_profile)
+        self._last_llm_usage = usage
+
+
         if code:
+            import re
+            code = normalise_llm_code(code)
+
             m = re.search(r"```(?:python)?\s*(.*?)\s*```", code, re.DOTALL | re.IGNORECASE)
             if m:
                 code = m.group(1).strip()
-            code = drop_bad_classification_metrics(code, df)
 
             if "import io" not in code and "io.BytesIO" in code:
                 lines = code.split('\n')
                 import_lines = []
                 other_lines = []
-
+
                 for line in lines:
                     if line.strip().startswith('import ') or line.strip().startswith('from '):
                         import_lines.append(line)
                     else:
                         other_lines.append(line)
-
-                # Add missing io import
+
                 if "import io" not in '\n'.join(import_lines):
                     import_lines.append('import io')
-
+
                 code = '\n'.join(import_lines + [''] + other_lines)
-
+
+            TEMPLATE_NAMES = [
+                "viz_pie","viz_stacked_bar","viz_count_bar","viz_box","viz_scatter",
+                "viz_distribution","viz_kde","viz_area","viz_line",
+                "classification","regression","clustering","anomaly_detection",
+                "ts_anomaly_detection","time_series_forecasting","time_series_classification",
+                "dimensionality_reduction","feature_selection","eda_overview","eda_correlation",
+                "multilabel_classification","recommendation","topic_modelling"
+            ]
+
+            used = [t for t in TEMPLATE_NAMES if re.search(rf"\\b{t}\\s*\\(", code)]
+            if used:
+                import_line = (
+                    "from syntaxmatrix.agentic.model_templates import " +
+                    ", ".join(sorted(set(used)))
+                )
+                if import_line not in code:
+                    code = import_line + "\n" + code
+
             return code.strip()
-
+
+        return "Error: AI code generation failed."
+
+
     def sanitize_rough_to_markdown_task(self, rough: str) -> str:
         """
        Return only the Task text (no tags).
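
Note: the new `repair_python_cell` only calls an LLM when a cheap static check trips: a classification metric paired with a regression estimator, a used-but-never-assigned `X_test`, or a cell that fails `ast.parse`. The gate, isolated as a standalone helper that mirrors the added code:

    import ast, re, textwrap

    def needs_repair(py_code: str) -> bool:
        code = textwrap.dedent(py_code or "").strip()
        if re.search(r"\baccuracy_score\b", code) and re.search(r"\bLinearRegression\b|\bOLS\b", code):
            return True  # classification metric applied to a regression model
        if re.search(r"\bX_test\b", code) and not re.search(r"\bX_test\s*=", code):
            return True  # X_test referenced but never assigned
        try:
            ast.parse(code)
        except SyntaxError:
            return True  # not a syntactically valid cell
        return False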
|