syntaxmatrix-2.3.5-py3-none-any.whl → syntaxmatrix-2.5.5.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- syntaxmatrix/agentic/__init__.py +0 -0
- syntaxmatrix/agentic/agent_tools.py +24 -0
- syntaxmatrix/agentic/agents.py +810 -0
- syntaxmatrix/agentic/code_tools_registry.py +37 -0
- syntaxmatrix/agentic/model_templates.py +1790 -0
- syntaxmatrix/commentary.py +134 -112
- syntaxmatrix/core.py +385 -245
- syntaxmatrix/dataset_preprocessing.py +218 -0
- syntaxmatrix/display.py +89 -37
- syntaxmatrix/gpt_models_latest.py +5 -4
- syntaxmatrix/profiles.py +19 -4
- syntaxmatrix/routes.py +947 -141
- syntaxmatrix/settings/model_map.py +38 -30
- syntaxmatrix/static/icons/hero_bg.jpg +0 -0
- syntaxmatrix/templates/dashboard.html +248 -54
- syntaxmatrix/utils.py +2254 -84
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.5.dist-info}/METADATA +16 -17
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.5.dist-info}/RECORD +21 -15
- syntaxmatrix/model_templates.py +0 -29
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.5.dist-info}/WHEEL +0 -0
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.5.dist-info}/licenses/LICENSE.txt +0 -0
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.5.dist-info}/top_level.txt +0 -0
syntaxmatrix/core.py
CHANGED
```diff
@@ -1,7 +1,10 @@
 from __future__ import annotations
+import ast
+import textwrap
 import os, webbrowser, uuid, secrets, re
 
 from flask import Flask, Response, session, request, has_request_context
+from syntaxmatrix.agentic.agents import mlearning_agent
 from syntaxmatrix.history_store import SQLHistoryStore as Store, PersistentHistoryStore as _Store
 from collections import OrderedDict
 from syntaxmatrix.llm_store import save_embed_model, load_embed_model, delete_embed_key
```
```diff
@@ -17,7 +20,6 @@ from syntaxmatrix.settings.prompts import SMXAI_CHAT_ID, SMXAI_CHAT_INSTRUCTIONS
 from typing import List, Generator
 from .auth import init_auth_db
 from . import profiles as _prof
-from syntaxmatrix.utils import strip_describe_slice, drop_bad_classification_metrics
 from syntaxmatrix.smiv import SMIV
 from .project_root import detect_project_root
 from syntaxmatrix.gpt_models_latest import extract_output_text as _out, set_args
```
```diff
@@ -25,6 +27,8 @@ from dotenv import load_dotenv
 from html import unescape
 from .plottings import render_plotly, pyplot, describe_plotly, describe_matplotlib
 from threading import RLock
+from syntaxmatrix.settings.model_map import GPT_MODELS_LATEST
+
 
 # ──────── framework‐local storage paths ────────
 # this ensures the key & data always live under the package dir,
```
```diff
@@ -46,10 +50,10 @@ EDA_OUTPUT = {}  # global buffer for EDA output by session
 
 class SyntaxMUI:
     def __init__(self,
-
+                 host="127.0.0.1",
                  port="5080",
                  user_icon="👩🏿🦲",
-                 bot_icon="<img src='/static/icons/
+                 bot_icon="<img src='/static/icons/favicon.png' width=20' alt='bot'/>",
                  favicon="/static/icons/favicon.png",
                  site_logo="<img src='/static/icons/logo.png' width='30' alt='logo'/>",
                  site_title="SyntaxMatrix",
```
```diff
@@ -71,12 +75,12 @@ class SyntaxMUI:
         self.ui_mode = ui_mode
         self.theme_toggle_enabled = False
         self.user_files_enabled = False
-        self.
-        self.
+        self.smxai_identity = SMXAI_CHAT_ID
+        self.smxai_instructions = SMXAI_CHAT_INSTRUCTIONS
         self.website_description = SMXAI_WEBSITE_DESCRIPTION
         self._eda_output = {}  # {chat_id: html}
         self._eda_lock = RLock()
-
+
         db.init_db()
         self.page = ""
         self.pages = db.get_pages()
```
```diff
@@ -88,8 +92,10 @@ class SyntaxMUI:
         self.app_token = str(uuid.uuid4())  # NEW: Unique token for each app launch.
         self.admin_pdf_chunks = {}  # In-memory store for admin PDF chunks
         self.user_file_chunks = {}  # In-memory store of user‑uploaded chunks, scoped per chat session
-        routes.setup_routes(self)
 
+        self._last_llm_usage = None
+        routes.setup_routes(self)
+
         self._admin_profile = {}
         self._chat_profile = {}
         self._coding_profile = {}
```
```diff
@@ -561,8 +567,8 @@ class SyntaxMUI:
     def delete_embed_key(self):
         return delete_embed_key()
 
-
-
+
+    def get_gpt_models_latest(self):
         return GPT_MODELS_LATEST
 
     def get_text_input_value(self, key, default=""):
```
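The new accessor wraps the `GPT_MODELS_LATEST` table imported from `syntaxmatrix.settings.model_map`, and the hunks below route on membership in it. A minimal sketch of that routing pattern — the model names here are assumptions; the real table lives in `model_map.py`, which this diff does not show:

```python
# Hypothetical stand-in for syntaxmatrix.settings.model_map.GPT_MODELS_LATEST;
# the real contents are defined in the package, not here.
GPT_MODELS_LATEST = {"gpt-5", "gpt-5-mini"}

def pick_api_path(model: str) -> str:
    # Newer models take the Responses API branch; everything else falls
    # through to the classic chat-completions branch, mirroring the hunks below.
    return "responses" if model in GPT_MODELS_LATEST else "chat.completions"

print(pick_api_path("gpt-4o"))  # chat.completions
```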
```diff
@@ -678,7 +684,7 @@ class SyntaxMUI:
         if _provider == "google":
             intent = google_classify_query()
             return intent
-        if _model in self.
+        if _model in self.get_gpt_models_latest():
             intent = gpt_models_latest_classify_query()
             return intent
         if _provider == "anthropic":
```
```diff
@@ -721,14 +727,14 @@ class SyntaxMUI:
         except Exception as e:
             return f"Summary agent error!"
 
-        def gpt_models_latest_generated_title(
+        def gpt_models_latest_generated_title():
             try:
                 args = set_args(
                     model=_model,
                     instructions=_title_profile,
                     input=_instructions,
-                    reasoning_effort=reasoning_effort,
-                    verbosity=verbosity,
+                    # reasoning_effort=reasoning_effort,
+                    # verbosity=verbosity,
                 )
 
                 resp = _client.responses.create(**args)
```
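`set_args` is the package's own helper from `syntaxmatrix.gpt_models_latest`; the hunk above now omits `reasoning_effort` and `verbosity` when building title-generation arguments. A hedged sketch of what an argument builder of this shape does — the real helper may differ, and only the keyword names visible in this diff are assumed:

```python
# Sketch only: assembles kwargs for client.responses.create(), dropping
# anything left unset, so the commented-out keywords simply disappear
# from the request. The reasoning/text mappings are assumptions.
def set_args(model, instructions=None, input=None, previous_id=None,
             store=None, reasoning_effort=None, verbosity=None):
    args = {"model": model, "instructions": instructions, "input": input,
            "previous_response_id": previous_id, "store": store}
    if reasoning_effort is not None:
        args["reasoning"] = {"effort": reasoning_effort}   # assumed mapping
    if verbosity is not None:
        args["text"] = {"verbosity": verbosity}            # assumed mapping
    return {k: v for k, v in args.items() if v is not None}

print(set_args(model="gpt-5", instructions="Title the chat.", input="..."))
```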
```diff
@@ -740,7 +746,7 @@ class SyntaxMUI:
             try:
                 response = _client.messages.create(
                     model=_model,
-                    max_tokens=
+                    max_tokens=50,
                     system=_title_profile,
                     messages=[{"role": "user", "content":_instructions}],
                     stream=False,
```
```diff
@@ -754,12 +760,11 @@ class SyntaxMUI:
                 { "role": "system", "content": _title_profile },
                 { "role": "user", "content": _instructions },
             ]
-
             try:
                 response = _client.chat.completions.create(
                     model=_model,
                     messages=prompt,
-                    temperature=0,
+                    temperature=0.3,
                     max_tokens=50
                 )
                 title = response.choices[0].message.content.strip().lower()
```
```diff
@@ -769,7 +774,7 @@ class SyntaxMUI:
 
         if _provider == "google":
             title = google_generated_title()
-        elif _model in self.
+        elif _model in self.get_gpt_models_latest():
             title = gpt_models_latest_generated_title()
         elif _provider == "anthropic":
             title = anthropic_generated_title()
```
```diff
@@ -792,7 +797,7 @@ class SyntaxMUI:
         if not chat_profile:
             yield """<p style='color:red;'>Error: Chat profile is not configured. Add a chat profile inside the admin panel or contact your administrator.</p>
             """
-            return
+            return None
         self._chat_profile = chat_profile
         self._chat_profile['client'] = _prof.get_client(chat_profile)
 
```
```diff
@@ -801,7 +806,7 @@ class SyntaxMUI:
         _model = self._chat_profile['model']
 
         _contents = f"""
-        {self.
+        {self.smxai_instructions}\n\n
         Question: {query}\n
         Context: {context}\n\n
         History: {history}\n\n
```
```diff
@@ -809,32 +814,30 @@ class SyntaxMUI:
         """
 
         try:
-            if _provider == "google":  # Google, non openai skd series
-
-                types.Content(
-                    role="user",
-                    parts=[
-                        types.Part.from_text(text=f"{self.ai_chat_id}\n\n{_contents}"),
-                    ],
-                ),
-            ]
-
+            if _provider == "google":  # Google, non openai skd series
+
                 for chunk in _client.models.generate_content_stream(
                     model=_model,
-                    contents=
+                    contents=_contents,
+                    config=types.GenerateContentConfig(
+                        system_instruction=self.smxai_identity,
+                        temperature=0.3,
+                        max_output_tokens=1024,
+                    ),
                 ):
+
                     yield chunk.text
 
-            elif _provider == "openai" and _model in self.
+            elif _provider == "openai" and _model in self.get_gpt_models_latest():  # GPt 5 series
                 input_prompt = (
-                    f"{self.
+                    f"{self.smxai_instructions}\n\n"
                     f"Generate a response to this query:\n{query}\n"
                     f"based on this given context:\n{context}\n\n"
                     f"(Use conversation continuity if available.)"
                 )
                 sid = self.get_session_id()
                 prev_id = self._gpt_models_latest_prev_resp_ids.get(sid)
-                args = set_args(model=_model, instructions=self.
+                args = set_args(model=_model, instructions=self.smxai_identity, input=input_prompt, previous_id=prev_id, store=True)
 
                 with _client.responses.stream(**args) as s:
                     for event in s:
```
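The streaming branch drops the hand-built `types.Content` list and instead passes the identity prompt through `GenerateContentConfig`. A self-contained sketch of the same google-genai call shape — client setup and model name are assumptions:

```python
# Sketch of the new streaming shape against the google-genai SDK; the
# system identity now travels in the config instead of the user content.
from google import genai
from google.genai import types

client = genai.Client(api_key="YOUR_KEY")          # assumed setup
for chunk in client.models.generate_content_stream(
    model="gemini-2.0-flash",                      # assumed model name
    contents="Question: ...\nContext: ...",
    config=types.GenerateContentConfig(
        system_instruction="You are SMX AI.",      # plays the smxai_identity role
        temperature=0.3,
        max_output_tokens=1024,
    ),
):
    print(chunk.text, end="")
```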
```diff
@@ -849,7 +852,7 @@ class SyntaxMUI:
             elif _provider == "anthropic":
                 with _client.messages.stream(
                     max_tokens=1024,
-                    messages=[{"role": "user", "content":f"{self.
+                    messages=[{"role": "user", "content":f"{self.smxai_identity}\n\n {_contents}"},],
                     model=_model,
                 ) as stream:
                     for text in stream.text_stream:
```
```diff
@@ -857,8 +860,8 @@ class SyntaxMUI:
 
             else:  # Assumes standard openai_sdk
                 openai_sdk_prompt = [
-                    {"role": "system", "content": self.
-                    {"role": "user", "content": f"{self.
+                    {"role": "system", "content": self.smxai_identity},
+                    {"role": "user", "content": f"{self.smxai_instructions}\n\nGenerate response to this query: {query}\nbased on this context:\n{context}\nand history:\n{history}\n\nUse conversation continuity if available.)"},
                 ]
                 response = _client.chat.completions.create(
                     model=_model,
```
```diff
@@ -883,9 +886,11 @@ class SyntaxMUI:
 
         self._chat_profile = chat_profile
         self._chat_profile['client'] = _prof.get_client(chat_profile)
-
+        _provider = self._chat_profile['provider']
+        _client = self._chat_profile['client']
+        _model = self._chat_profile['model']
         _contents = f"""
-        {self.
+        {self.smxai_instructions}\n\n
         Question: {query}\n
         Context: {context}\n\n
         History: {history}\n\n
```
```diff
@@ -893,8 +898,8 @@ class SyntaxMUI:
         """
 
         openai_sdk_prompt = [
-            {"role": "system", "content": self.
-            {"role": "user", "content": f"""{self.
+            {"role": "system", "content": self.smxai_identity},
+            {"role": "user", "content": f"""{self.smxai_instructions}\n\n
             Generate response to this query: {query}\n
             based on this context:\n{context}\n
             and history:\n{history}\n\n
```
```diff
@@ -903,18 +908,19 @@ class SyntaxMUI:
             },
         ]
 
-        _provider = self._chat_profile['provider']
-        _client = self._chat_profile['client']
-        _model = self._chat_profile['model']
-
         def google_process_query():
             try:
                 response = _client.models.generate_content(
                     model=_model,
-                    contents=
+                    contents=_contents,
+                    config=types.GenerateContentConfig(
+                        system_instruction=self.smxai_identity,
+                        temperature=0.3,
+                        max_output_tokens=1024,
+                    ),
                 )
                 answer = response.text
-
+
                 # answer = strip_html(answer)
                 return answer
             except Exception as e:
```
```diff
@@ -926,7 +932,7 @@ class SyntaxMUI:
             """
             # Prepare the prompt with conversation history and context
             input = (
-                f"{self.
+                f"{self.smxai_instructions}\n\n"
                 f"Generate a response to this query:\n{query}\n"
                 f"based on this given context:\n{context}\n\n"
                 f"(Use conversation continuity if available.)"
```
```diff
@@ -937,7 +943,7 @@ class SyntaxMUI:
 
             args = set_args(
                 model=_model,
-                instructions=self.
+                instructions=self.smxai_identity,
                 input=input,
                 previous_id=prev_id,
                 store=True,
```
```diff
@@ -962,7 +968,7 @@ class SyntaxMUI:
             response = _client.messages.create(
                 model=_model,
                 max_tokens=1024,
-                system=self.
+                system=self.self.smxai_identity,
                 messages=[{"role": "user", "content":_contents}],
                 stream=False,
             )
```
```diff
@@ -977,7 +983,7 @@ class SyntaxMUI:
             response = _client.chat.completions.create(
                 model=_model,
                 messages=openai_sdk_prompt,
-
+                stream=False,
             )
 
             # -------- one-shot buffered --------
```
```diff
@@ -988,239 +994,373 @@ class SyntaxMUI:
 
         if _provider == "google":
             return google_process_query()
-        if _provider == "openai" and _model in self.
+        if _provider == "openai" and _model in self.get_gpt_models_latest():
             return gpt_models_latest_process_query(self._gpt_models_latest_prev_resp_ids.get(self.get_session_id()))
         if _provider == "anthropic":
             return anthropic_process_query()
         return openai_sdk_process_query()
 
 
-    def
```
```diff
+    def repair_python_cell(self, py_code: str) -> str:
+
+        _CELL_REPAIR_RULES = """
+        Fix the Python cell to satisfy:
+        - Single valid cell; imports at the top.
+        - Do not import or invoke or use 'python-dotenv' or 'dotenv' because it's not needed.
+        - No top-level statements between if/elif/else branches.
+        - Regression must use either sklearn with train_test_split (then X_test exists) and R^2/MAE/RMSE,
+          or statsmodels OLS. No accuracy_score in regression.
+        - Keep all plotting + savefig + BytesIO + display inside the branch that created the figure.
+        - Return ONLY the corrected cell.
+        """
+        code = textwrap.dedent(py_code or "").strip()
+        needs_fix = False
+        if re.search(r"\baccuracy_score\b", code) and re.search(r"\bLinearRegression\b|\bOLS\b", code):
+            needs_fix = True
+        if re.search(r"\bX_test\b", code) and not re.search(r"\bX_test\s*=", code):
+            needs_fix = True
+        try:
+            ast.parse(code)
+        except SyntaxError:
+            needs_fix = True
+        if not needs_fix:
+            return code
+        _prompt = f"```python\n{code}\n```"
+
+        repair_profile = _prof.get_profile("vision2text") or _prof.get_profile("admin")
+        if not repair_profile:
+            return (
+                '<div class="smx-alert smx-alert-warn">'
+                'No LLM profile configured for <code>coding</code> (or <code>admin</code>). <br>'
+                'Please, add the LLM profile inside the admin panel or contact your Administrator.'
+                '</div>'
+            )
+
+        _client = _prof.get_client(repair_profile)
+        _provider = repair_profile['provider'].lower()
+        _model = repair_profile['model']
+
+        #1 Google
+        if _provider == "google":
+            from google.genai import types
+
+            fixed = _client.models.generate_content(
+                model=_model,
+                contents=_prompt,
+                config=types.GenerateContentConfig(
+                    system_instruction=_CELL_REPAIR_RULES,
+                    temperature=0.8,
+                    max_output_tokens=1024,
+                ),
+            )
+
+        #2 Openai
+        elif _provider == "openai" and _model in GPT_MODELS_LATEST:
+
+            args = set_args(
+                model=_model,
+                instructions=_CELL_REPAIR_RULES,
+                input=[{"role": "user", "content": _prompt}],
+                previous_id=None,
+                store=False,
+                reasoning_effort="medium",
+                verbosity="medium",
+            )
+            fixed = _out(_client.responses.create(**args))
 
-
-
-        if not coding_profile:
-            # tell the user exactly what to configure
-            return (
-                '<div class="smx-alert smx-alert-warn">'
-                'No LLM profile configured for <code>coding</code> (or <code>admin</code>). '
-                'Please, contact your Administrator.'
-                '</div>'
-            )
+        # Anthropic
+        elif _provider == "anthropic":
 
-
-
+            fixed = _client.messages.create(
+                model=_model,
+                max_tokens=1024,
+                system=_CELL_REPAIR_RULES,
+                messages=[{"role": "user", "content":_prompt}],
+                stream=False,
+            )
+
+        # OpenAI SDK
+        else:
+            fixed = _client.chat.completions.create(
+                model=_model,
+                messages=[
+                    {"role": "system", "content":_CELL_REPAIR_RULES},
+                    {"role": "user", "content":_prompt},
+                ],
+                max_tokens=1024,
+            )
+
+        try:
+            ast.parse(fixed);
+            return fixed
+        except Exception:
+            return code
 
-
-
-        _model = self._coding_profile['model']
```
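`repair_python_cell` only spends an LLM call when a static check trips; the three heuristics above can be reproduced standalone:

```python
# Minimal reproduction of the repair gate: regression code scored with a
# classification metric, an unbound X_test, or a syntax error all trip it.
import ast, re, textwrap

def needs_repair(cell: str) -> bool:
    code = textwrap.dedent(cell or "").strip()
    if re.search(r"\baccuracy_score\b", code) and re.search(r"\bLinearRegression\b|\bOLS\b", code):
        return True   # classification metric applied to a regression model
    if re.search(r"\bX_test\b", code) and not re.search(r"\bX_test\s*=", code):
        return True   # X_test is read but never assigned
    try:
        ast.parse(code)
    except SyntaxError:
        return True   # not a valid Python cell
    return False

print(needs_repair("X_test.head()"))        # True
print(needs_repair("x = 1\nprint(x)"))      # False
```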
```diff
+    def get_last_llm_usage(self):
+        return getattr(self, "_last_llm_usage", None)
 
-
-        ALLOWED_COLUMNS = list(df.columns)
+    def ai_generate_code(self, refined_question, tasks, df):
 
-
-
-
```
```diff
+        def normalise_llm_code(s: str) -> str:
+            s = s.replace("\t", " ")
+            s = textwrap.dedent(s)
+            lines = s.splitlines()
 
-
-
-
-
-
-
-
-
-
-
-
-        3) **Avoid deprecated / removed APIs**, e.g.:
-           - pandas: do not use `.append`, `.ix`, `.as_matrix`, `DataFrame.select_dtypes(include='category')` is OK, but prefer current patterns.
-           - seaborn: do not use `distplot`, `pairplot` on very large data without sampling; prefer `histplot`, `displot`, `regplot`, or FacetGrid with `.map_dataframe`.
-           - scikit-learn: import from `sklearn.model_selection` (not `sklearn.cross_validation`); for confusion matrices use `ConfusionMatrixDisplay.from_estimator`; set `random_state=42` where relevant.
-        4) Be **defensive**:
-           - Verify required columns exist; if any are missing, raise `ValueError("Missing columns: ...")` early.
-           - Handle missing values sensibly (e.g., drop rows for simple EDA; use `ColumnTransformer` + `SimpleImputer` for modeling).
-           - For categorical features in ML, use `OneHotEncoder(handle_unknown="ignore")` inside a `Pipeline`/`ColumnTransformer` (no `LabelEncoder` on features).
-        5) Keep it **fast** (kernel timeout ~8s):
-           - For plots on large frames (>20k rows), downsample to ~1,000 rows (`df.sample(1000, random_state=42)`) unless aggregation is more appropriate.
-           - Prefer vectorized ops; avoid O(n²) Python loops.
-        6) Always **produce at least one visible result** at the end:
-           - If plotting with matplotlib/seaborn: call `plt.tight_layout(); plt.show()`.
-           - If producing a table or metrics: from `syntaxmatrix.display import show` then `show(object_or_dataframe)`.
-        7) Follow task type conventions:
-           - **EDA/Stats**: compute the requested stat, then show a relevant table (e.g., summary/crosstab) or plot.
-           - **Classification**: train/valid split (`train_test_split`), build a pipeline with scaling/encoding as needed, fit, show accuracy **and** a confusion matrix via `ConfusionMatrixDisplay.from_estimator(...); plt.show()`. Also show `classification_report` as a dataframe if short.
-           - **Regression**: train/valid split, pipeline as needed, fit, show R² and MAE; plot predicted vs actual scatter.
-           - **Correlation/Chi-square/ANOVA**: compute the statistic + p-value and show a concise result table (with `show(...)`) and, when sensible, a small plot (heatmap/bar).
-        8) Don't mutate or recreate target columns if they already exist (e.g., if asked to “predict TARGET”, use `y = df['TARGET']` as-is).
-        9) Keep variable names short and clear; prefer `num_cols` / `cat_cols` discovery by dtype.
-        10) You MUST NOT reference any column outside Allowed columns: {ALLOWED_COLUMNS}\n.
-        11) If asked to predict/classify, choose the target by matching the task text to Allowed columns: {ALLOWED_COLUMNS}\n and never invent a new name (e.g., 'whether', 'the').
-
-        ### Output
-        Return **only runnable Python** that:
-        - Imports what it needs,
-        - Validates columns,
-        - Solves: {question},
-        - And ends with at least one visible output (`show(...)` and/or `plt.show()`).
-        """
+            # drop leading blank lines
+            while lines and not lines[0].strip():
+                lines.pop(0)
+
+            # if everything is still indented >=4 spaces, shift left
+            indents = [len(l) - len(l.lstrip(" ")) for l in lines if l.strip()]
+            if indents and min(indents) >= 4:
+                m = min(indents)
+                lines = [l[m:] if len(l) >= m else l for l in lines]
+
+            return "\n".join(lines)
 
-
```
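`normalise_llm_code` exists because models often return a cell indented as if it lived inside a function; the dedent-then-shift steps can be checked in isolation:

```python
# Standalone check of the normalisation steps: a uniformly indented cell
# comes back flush-left and therefore parseable as a top-level script.
import ast, textwrap

raw = "    import pandas as pd\n    df2 = pd.DataFrame({'x': [1]})\n    print(df2)"
s = textwrap.dedent(raw.replace("\t", "    "))
lines = s.splitlines()
while lines and not lines[0].strip():          # drop leading blank lines
    lines.pop(0)
indents = [len(l) - len(l.lstrip(" ")) for l in lines if l.strip()]
if indents and min(indents) >= 4:              # shift left if still indented
    m = min(indents)
    lines = [l[m:] if len(l) >= m else l for l in lines]
flat = "\n".join(lines)
ast.parse(flat)   # raw would raise IndentationError; flat parses cleanly
```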
```diff
+        CONTEXT = f"Columns: {list(df.columns)}\n\nDtypes: {df.dtypes.astype(str).to_dict()}\n\n"
+        AVAILABLE_COLUMNS = list(df.columns)
+
+        # --- SMX: normalise tasks coming from intent agent ---
+        if isinstance(tasks, str):
+            import json, ast, re
             try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            import pandas as pd
-            import matplotlib.pyplot as plt
-            import seaborn as sns
-            import numpy as np
-            import io
-            import base64
-            from syntaxmatrix.display import show
-
-            print("Basic DataFrame Info:")
-            print(f"Shape: {df.shape}")
-            print("\\nColumns and dtypes:")
-            print(df.dtypes)
-            print("\\nBasic statistics:")
-            show(df.describe())
-
-            print("\\nFirst few rows:")
-            show(df.head())
-
-            # Generate a simple visualization based on available columns
-            plt.figure(figsize=(10, 6))
-
-            if len(df.columns) >= 2:
-                # Try to find numeric columns for scatter plot
-                numeric_cols = df.select_dtypes(include=['number']).columns
-                if len(numeric_cols) >= 2:
-                    sns.scatterplot(data=df, x=numeric_cols[0], y=numeric_cols[1])
-                    plt.title(f"Scatter plot: {numeric_cols[0]} vs {numeric_cols[1]}")
-                    plt.tight_layout()
-                    plt.show()
-                else:
-                    # Use first column for bar plot
-                    top_values = df[df.columns[0]].value_counts().head(10)
-                    top_values.plot(kind='bar')
-                    plt.title(f"Top 10 values in {df.columns[0]}")
-                    plt.tight_layout()
-                    plt.show()
-            else:
-                # Single column analysis
-                if len(df.columns) == 1:
-                    col_name = df.columns[0]
-                    if df[col_name].dtype in ['object', 'category']:
-                        df[col_name].value_counts().head(10).plot(kind='bar')
-                        plt.title(f"Value counts for {col_name}")
-                    else:
-                        df[col_name].hist(bins=20)
-                        plt.title(f"Distribution of {col_name}")
-                    plt.tight_layout()
-                    plt.show()
-            else:
-                print("Insufficient columns for detailed analysis")
-                show(df)
-            """
+                tasks_parsed = json.loads(tasks)
+            except Exception:
+                try:
+                    tasks_parsed = ast.literal_eval(tasks)
+                except Exception:
+                    tasks_parsed = re.findall(r"[A-Za-z_]+", tasks)
+            tasks = tasks_parsed
+        if not isinstance(tasks, list):
+            tasks = [str(tasks)]
+        tasks = [str(t).strip().lower() for t in tasks if str(t).strip()]
+
```
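The intent agent may return `tasks` as JSON, as a Python literal, or as loose prose, so the new normalisation tries each decoder in turn before falling back to a word scan:

```python
# The same three-stage fallback, exercised standalone.
import json, ast, re

def normalise_tasks(tasks):
    if isinstance(tasks, str):
        try:
            tasks = json.loads(tasks)             # '["eda", "regression"]'
        except Exception:
            try:
                tasks = ast.literal_eval(tasks)   # "['eda', 'regression']"
            except Exception:
                tasks = re.findall(r"[A-Za-z_]+", tasks)  # "eda regression"
    if not isinstance(tasks, list):
        tasks = [str(tasks)]
    return [str(t).strip().lower() for t in tasks if str(t).strip()]

print(normalise_tasks('["EDA", "Regression"]'))   # ['eda', 'regression']
print(normalise_tasks("eda regression"))          # ['eda', 'regression']
```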
```diff
+        ai_profile = """
+        - You are a Python expert specializing in data science and machine learning.
+        - Your task is to generate a single, complete, production-quality, executable Python script for a Jupyter-like Python kernel, based on the given instructions.
+        - The dataset is already loaded as a pandas DataFrame named `df` (no file I/O or file uploads).
+        - Make a copy of `df` and name it `df_copy`. Make sure `df_copy` is preprocessed and cleaned, named `df_cleaned`, if not already done so. Then use `df_cleaned` to perform the ML tasks described in the given context.
+        - Select your features and target, from `df_cleaned`, with care and name it `required_cols`
+        - Create your 'df_filtered by doing: df_filtered = df_cleaned[required_cols].
+        - Use the {TEMPLATE_CATALOGUE} below to educate yourself on which visualizations you will implement in the code.
+        - The final output MUST be the complete, executable Python code only, enclosed in a single markdown code block (```python ... ```), which is required to fulfill the user's request. See the {tasks} below.
+        - Do not include any explanatory text or markdown outside the code block.
+        """
 
-
-
-
-
-
-
-
-
-
```
```diff
+        TEMPLATE_CATALOGUE = """
+        ### Available SyntaxMatrix templates (use these instead of inventing new helpers)
+
+        Visualisation templates (dataset-agnostic):
+        - viz_pie(df, category_col=None, top_k=8): pie/donut shares within a category.
+        - viz_stacked_bar(df, x=None, hue=None, normalise=True): composition across groups.
+        - viz_count_bar(df, category_col=None, top_k=12): counts/denominators by category.
+        - viz_box(df, x=None, y=None): spread/outliers of numeric by category.
+        - viz_scatter(df, x=None, y=None, hue=None): relationship between two numeric vars.
+        - viz_distribution(df, col=None): histogram-style distribution for numeric.
+        - viz_kde(df, col=None): density curve for numeric.
+        - viz_area(df, time_col=None, y_col=None): area/trend over time.
+        - viz_line(df, x=None, y=None, hue=None): line/trend plot.
+
+        ML/stat templates:
+        - classification(df): standard classification pipeline + metrics + plots.
+        - regression(df): standard regression pipeline + metrics + plots.
+        - clustering(df): clustering workflow + cluster plots.
+        - anomaly_detection(df)
+        - ts_anomaly_detection(df)
+        - time_series_forecasting(df)
+        - time_series_classification(df, entity_col, time_col, target_col)
+        - dimensionality_reduction(df)
+        - feature_selection(df)
+        - eda_overview(df)
+        - eda_correlation(df)
+        - multilabel_classification(df, label_cols)
+        - recommendation(df)
+        - topic_modelling(df)
+        """
+
```
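The catalogue names correspond to functions in the new `syntaxmatrix/agentic/model_templates.py` (added in this release, +1790 lines). A hedged usage sketch following the signatures listed above; anything beyond those signatures is assumed:

```python
# Calling two of the catalogue templates directly, per their listed signatures.
import pandas as pd
from syntaxmatrix.agentic.model_templates import viz_pie, viz_box

df = pd.DataFrame({"species": ["a", "a", "b", "b", "b"],
                   "mass":    [1.2, 1.4, 3.1, 2.9, 3.3]})
viz_pie(df, category_col="species", top_k=8)   # pie/donut shares within a category
viz_box(df, x="species", y="mass")             # spread/outliers of numeric by category
```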
```diff
+        instructions = (
+            "### Context"
+            f"- DataFrame - (`df`): {df}"
+            f"- Schema (names → dtypes): {CONTEXT}"
+            f"- Row count: {len(df)}"
+            f"- Task description: {refined_question}"
+            f"- Tasks: {tasks}"
+            f"- Available columns: {AVAILABLE_COLUMNS}"
+            f"- Template catalogue: {TEMPLATE_CATALOGUE}"
 
-
-
-
-
-
```
"""
|
|
1193
|
+
### Template rules
|
|
1194
|
+
- You MAY call a template if it matches the task.
|
|
1195
|
+
- Do NOT invent template names.
|
|
1196
|
+
- If no template fits, write minimal direct pandas/sklearn/seaborn code instead.
|
|
1197
|
+
- Keep the solution short: avoid writing wrappers/utilities already handled by SyntaxMatrix hardener.
|
|
1198
|
+
|
|
1199
|
+
#### Template selection hint examples:
|
|
1200
|
+
- If the task asks for pie/donut/composition shares → use viz_pie.
|
|
1201
|
+
- If it asks for denominators/counts per category → viz_count_bar.
|
|
1202
|
+
- If it asks for spread/outliers/comparison across groups → viz_box.
|
|
1203
|
+
- If it asks for relationship / “X vs Y” → viz_scatter.
|
|
1204
|
+
- If it asks for trend over time → viz_line or viz_area.
|
|
1205
|
+
|
|
1206
|
+
### Hard requirements
|
|
1207
|
+
1) Code only. No markdown, no comments, no explanations.
|
|
1208
|
+
2) Import everything you use explicitly.
|
|
1209
|
+
- Use pandas/numpy/matplotlib by default.
|
|
1210
|
+
- Seaborn may be unavailable at runtime; **do not import seaborn inside your code**.
|
|
1211
|
+
- If you call sns.*, assume sns is already defined by the framework.
|
|
1212
|
+
3) Avoid deprecated / removed APIs**, e.g.:
|
|
1213
|
+
- pandas: do not use `.append`, `.ix`, `.as_matrix`; prefer current patterns.
|
|
1214
|
+
- seaborn: do not use `distplot`; avoid `pairplot` on very large data unless sampling.
|
|
1215
|
+
- scikit-learn: import from `sklearn.model_selection` (not `sklearn.cross_validation`);
|
|
1216
|
+
set `random_state=42` where relevant.
|
|
1217
|
+
4) Be defensive, but avoid hard-failing on optional fields:
|
|
1218
|
+
- If the primary column, needed to answer the question, is missing, review your copy of the `df` again.
|
|
1219
|
+
Make sure that you selected the proper column.
|
|
1220
|
+
Never use a column/variable which isn't available or defined.
|
|
1221
|
+
- If a secondary/extra column is missing, show a warning with `show(...)` and continue using available fields.
|
|
1222
|
+
- Handle missing values sensibly (drop rows for simple EDA; use `ColumnTransformer` + `SimpleImputer` for modelling).
|
|
1223
|
+
- For categorical features in ML, use `OneHotEncoder(handle_unknown="ignore")`
|
|
1224
|
+
inside a `Pipeline`/`ColumnTransformer` (no `LabelEncoder` on features).
|
|
1225
|
+
5) Keep it fast (kernel timeout ~8s):
|
|
1226
|
+
- For plots on large frames (>20k rows), downsample to ~1,000 rows
|
|
1227
|
+
(`df.sample(1000, random_state=42)`) unless aggregation is more appropriate.
|
|
1228
|
+
- Prefer vectorised ops; avoid O(n²) Python loops.
|
|
1229
|
+
6) Keep the solution compact:
|
|
1230
|
+
- Do not define large helper libraries or long “required column” sets.
|
|
1231
|
+
- Aim for ≤120 lines excluding imports.
|
|
1232
|
+
7) Always produce at least one visible result at the end:
|
|
1233
|
+
- If plotting with matplotlib/seaborn: call `plt.tight_layout(); plt.show()`.
|
|
1234
|
+
- If producing a table or metrics:
|
|
1235
|
+
`from syntaxmatrix.display import show` then `show(object_or_dataframe)`.
|
|
1236
|
+
8) Follow task type conventions:
|
|
1237
|
+
- **EDA/Stats**: compute the requested stat, then show a relevant table
|
|
1238
|
+
(e.g., summary/crosstab) or plot.
|
|
1239
|
+
- **Classification**: train/valid split (`train_test_split`), pipeline with scaling/encoding,
|
|
1240
|
+
fit, show accuracy and a confusion matrix via
|
|
1241
|
+
`ConfusionMatrixDisplay.from_estimator(...); plt.show()`.
|
|
1242
|
+
Also show `classification_report` as a dataframe if short.
|
|
1243
|
+
- **Regression**: train/valid split, pipeline as needed, fit, show R² and MAE;
|
|
1244
|
+
plot predicted vs actual scatter.
|
|
1245
|
+
- **Correlation/Chi-square/ANOVA**: compute the statistic + p-value and show a concise
|
|
1246
|
+
result table (with `show(...)`) and, when sensible, a small plot (heatmap/bar).
|
|
1247
|
+
9) Don't mutate or recreate target columns if they already exist.
|
|
1248
|
+
10) Keep variable names short and clear; prefer `num_cols` / `cat_cols` discovery by dtype.
|
|
1249
|
+
11) You MUST NOT reference any column outside Available columns: {AVAILABLE_COLUMNS}.
|
|
1250
|
+
12) If asked to predict/classify, choose the target by matching the task text to Allowed columns
|
|
1251
|
+
and never invent a new name.
|
|
1252
|
+
|
|
1253
|
+
#### Cohort rules
|
|
1254
|
+
When you generate plots for cohorts or categories, you MUST obey these rules:
|
|
1255
|
+
1) ALWAYS guard cohort masks:
|
|
1256
|
+
- After you define something like:
|
|
1257
|
+
_mask_a = (df['BMI'] < 18.5) & df['BMI'].notna()
|
|
1258
|
+
_mask_b = ~(df['BMI'] < 18.5) & df['BMI'].notna()
|
|
1259
|
+
compute their sizes:
|
|
1260
|
+
n_a = int(_mask_a.sum())
|
|
1261
|
+
n_b = int(_mask_b.sum())
|
|
1262
|
+
- If a mask has no rows (or almost none), do NOT draw an empty plot.
|
|
1263
|
+
Instead call:
|
|
1264
|
+
show(f"Skipping cohort '{label}': no rows after filtering.")
|
|
1265
|
+
and return.
|
|
1266
|
+
|
|
1267
|
+
2) Before any groupby / crosstab for a plot:
|
|
1268
|
+
- Fill missing categories so groupby does not drop everything:
|
|
1269
|
+
df[col] = df[col].fillna("Unknown")
|
|
1270
|
+
- After building the table:
|
|
1271
|
+
tab = tmp.groupby([...]).size().unstack(...).fillna(0)
|
|
1272
|
+
ALWAYS check:
|
|
1273
|
+
if tab.empty:
|
|
1274
|
+
show(f"Skipping plot for {col}: no data after grouping.")
|
|
1275
|
+
continue
|
|
1276
|
+
Only call .plot(...) if the table is non-empty.
|
|
1277
|
+
|
|
1278
|
+
3) For value_counts-based plots:
|
|
1279
|
+
- If the Series is empty after filtering (len(s) == 0),
|
|
1280
|
+
do NOT draw a figure. Just call:
|
|
1281
|
+
show(f"No data available to plot for {col} in this cohort.")
|
|
1282
|
+
and skip.
|
|
1283
|
+
|
|
1284
|
+
4) Never try to “hide” an error with a blank plot.
|
|
1285
|
+
A blank chart is treated as a bug. If there is no data, explain it
|
|
1286
|
+
clearly using show(...), and avoid calling matplotlib/Seaborn.
|
|
1287
|
+
|
|
1288
|
+
5) Never use print(...). All user-visible diagnostics go through show(...).
|
|
1160
1289
|
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1290
|
+
|
|
1291
|
+
### Output
|
|
1292
|
+
Return only runnable Python that:
|
|
1293
|
+
- Imports what it needs,
|
|
1294
|
+
- Validates columns,
|
|
1295
|
+
- Visualize tables, charts, and graphs, each with appropriate caption.
|
|
1296
|
+
- Solution: {tasks} to solve {refined_question},
|
|
1297
|
+
- And ends with at least 3 visible output (`show(...)` and/or `plt.show()`).
|
|
1298
|
+
""")
|
|
1299
|
+
|
|
1300
|
+
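The cohort rules above describe a guard-first plotting pattern; a compact sketch of a compliant cell — `show` is the package's own display helper, and the BMI data is illustrative:

```python
# A cohort plot written the way the rules demand: size the mask first,
# report via show(...) when empty, and only then draw.
import pandas as pd
import matplotlib.pyplot as plt
from syntaxmatrix.display import show

df = pd.DataFrame({"BMI": [17.0, 22.5, None, 31.2]})
label = "underweight"
_mask_a = (df["BMI"] < 18.5) & df["BMI"].notna()
n_a = int(_mask_a.sum())
if n_a == 0:
    show(f"Skipping cohort '{label}': no rows after filtering.")
else:
    df.loc[_mask_a, "BMI"].hist(bins=10)
    plt.title(f"BMI distribution, {label} cohort (n={n_a})")
    plt.tight_layout()
    plt.show()
```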
```diff
+        if not self._coding_profile:
+            coding_profile = _prof.get_profile("coding") or _prof.get_profile("admin")
+            if not coding_profile:
+                return (
+                    '<div class="smx-alert smx-alert-warn">'
+                    'No LLM profile configured for <code>coding</code> (or <code>admin</code>). <br>'
+                    'Please, add the LLM profile inside the admin panel or contact your Administrator.'
+                    '</div>'
                 )
-            return response.content[0].text.strip()
-        except Exception as e:
-            return f"Error!"
 
-
-
-
-
-
-
-                    {"role": "user", "content": instructions},
-                ],
-                temperature=0.3,
-                max_tokens=2048,
-            )
-            return response.choices[0].message.content
-        except Exception as e:
-            return "Error!"
+            self._coding_profile = coding_profile
+            self._coding_profile['client'] = _prof.get_client(coding_profile)
+
+        # code = mlearning_agent(instructions, ai_profile, self._coding_profile)
+        code, usage = mlearning_agent(instructions, ai_profile, self._coding_profile)
+        self._last_llm_usage = usage
 
-        if _provider == 'google':
-            code = google_generate_code()
-        elif _provider == "openai" and _model in self.gpt_models_latest():
-            code = gpt_models_latest_generate_code()
-        elif _provider == "anthropic":
-            code = anthropic_generate_code()
-        else:
-            code = openai_sdk_generate_code()
-
```
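Generation is now delegated to `mlearning_agent`, which returns `(code, usage)`; the usage record lands on the instance behind the new `get_last_llm_usage` accessor. A hedged sketch of reading it back — `SyntaxMUI` is defined in this very file, but the shape of `usage` is not shown in the diff, so it is treated as opaque:

```python
import pandas as pd
from syntaxmatrix.core import SyntaxMUI   # SyntaxMUI is defined in this module

smx = SyntaxMUI()
df = pd.DataFrame({"x": [1, 2, 3], "y": [2.0, 4.1, 6.2]})
code = smx.ai_generate_code("fit a simple regression of y on x", ["regression"], df)
usage = smx.get_last_llm_usage()   # None until an agent call has completed
print(usage)
```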
```diff
         if code:
+            import re
+            code = normalise_llm_code(code)
+
             m = re.search(r"```(?:python)?\s*(.*?)\s*```", code, re.DOTALL | re.IGNORECASE)
             if m:
                 code = m.group(1).strip()
-            code = drop_bad_classification_metrics(code, df)
 
             if "import io" not in code and "io.BytesIO" in code:
                 lines = code.split('\n')
                 import_lines = []
                 other_lines = []
-
+
                 for line in lines:
                     if line.strip().startswith('import ') or line.strip().startswith('from '):
                         import_lines.append(line)
                     else:
                         other_lines.append(line)
-
-                # Add missing io import
+
                 if "import io" not in '\n'.join(import_lines):
                     import_lines.append('import io')
-
+
                 code = '\n'.join(import_lines + [''] + other_lines)
-
+
```
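The fence-stripping regex above tolerates an optional language tag and surrounding prose; exercised standalone:

```python
# Same regex as the hunk: pull the first fenced block out of an LLM reply.
import re

reply = "Here you go:\n```python\nprint('hi')\n```\nHope that helps."
m = re.search(r"```(?:python)?\s*(.*?)\s*```", reply, re.DOTALL | re.IGNORECASE)
code = m.group(1).strip() if m else reply
print(code)   # print('hi')
```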
```diff
+            TEMPLATE_NAMES = [
+                "viz_pie","viz_stacked_bar","viz_count_bar","viz_box","viz_scatter",
+                "viz_distribution","viz_kde","viz_area","viz_line",
+                "classification","regression","clustering","anomaly_detection",
+                "ts_anomaly_detection","time_series_forecasting","time_series_classification",
+                "dimensionality_reduction","feature_selection","eda_overview","eda_correlation",
+                "multilabel_classification","recommendation","topic_modelling"
+            ]
+
+            used = [t for t in TEMPLATE_NAMES if re.search(rf"\\b{t}\\s*\\(", code)]
+            if used:
+                import_line = (
+                    "from syntaxmatrix.agentic.model_templates import " +
+                    ", ".join(sorted(set(used)))
+                )
+                if import_line not in code:
+                    code = import_line + "\n" + code
+
             return code.strip()
-
-
+
+        return "Error: AI code generation failed."
+
+
```
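The scan above prepends one import for exactly the catalogue names the generated cell calls. Note that as rendered in this diff the pattern shows doubled backslashes (`rf"\\b{t}\\s*\\("`); with a raw f-string, single `\b`/`\s` escapes are what actually match a word boundary, as in this standalone sketch:

```python
# Intended effect of the template scan: detect catalogue calls and inject
# a single import line for the used names.
import re

TEMPLATE_NAMES = ["viz_pie", "viz_box", "regression"]
code = "viz_pie(df, category_col='species')\nregression(df)"
used = [t for t in TEMPLATE_NAMES if re.search(rf"\b{t}\s*\(", code)]
if used:
    import_line = ("from syntaxmatrix.agentic.model_templates import "
                   + ", ".join(sorted(set(used))))
    if import_line not in code:
        code = import_line + "\n" + code
print(code.splitlines()[0])
# from syntaxmatrix.agentic.model_templates import regression, viz_pie
```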
```diff
     def sanitize_rough_to_markdown_task(self, rough: str) -> str:
         """
         Return only the Task text (no tags).
```
|