syntaxmatrix-2.3.5-py3-none-any.whl → syntaxmatrix-2.5.5.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
syntaxmatrix/core.py CHANGED
@@ -1,7 +1,10 @@
  from __future__ import annotations
+ import ast
+ import textwrap
  import os, webbrowser, uuid, secrets, re

  from flask import Flask, Response, session, request, has_request_context
+ from syntaxmatrix.agentic.agents import mlearning_agent
  from syntaxmatrix.history_store import SQLHistoryStore as Store, PersistentHistoryStore as _Store
  from collections import OrderedDict
  from syntaxmatrix.llm_store import save_embed_model, load_embed_model, delete_embed_key
@@ -17,7 +20,6 @@ from syntaxmatrix.settings.prompts import SMXAI_CHAT_ID, SMXAI_CHAT_INSTRUCTIONS
  from typing import List, Generator
  from .auth import init_auth_db
  from . import profiles as _prof
- from syntaxmatrix.utils import strip_describe_slice, drop_bad_classification_metrics
  from syntaxmatrix.smiv import SMIV
  from .project_root import detect_project_root
  from syntaxmatrix.gpt_models_latest import extract_output_text as _out, set_args
@@ -25,6 +27,8 @@ from dotenv import load_dotenv
  from html import unescape
  from .plottings import render_plotly, pyplot, describe_plotly, describe_matplotlib
  from threading import RLock
+ from syntaxmatrix.settings.model_map import GPT_MODELS_LATEST
+

  # ──────── framework‐local storage paths ────────
  # this ensures the key & data always live under the package dir,
@@ -46,10 +50,10 @@ EDA_OUTPUT = {} # global buffer for EDA output by session

  class SyntaxMUI:
  def __init__(self,
- host="127.0.0.1",
+ host="127.0.0.1",
  port="5080",
  user_icon="👩🏿‍🦲",
- bot_icon="<img src='/static/icons/logo.png' width='15' alt='bot'/>",
+ bot_icon="<img src='/static/icons/favicon.png' width='20' alt='bot'/>",
  favicon="/static/icons/favicon.png",
  site_logo="<img src='/static/icons/logo.png' width='30' alt='logo'/>",
  site_title="SyntaxMatrix",
@@ -71,12 +75,12 @@ class SyntaxMUI:
  self.ui_mode = ui_mode
  self.theme_toggle_enabled = False
  self.user_files_enabled = False
- self.ai_chat_id = SMXAI_CHAT_ID
- self.ai_chat_instructions = SMXAI_CHAT_INSTRUCTIONS
+ self.smxai_identity = SMXAI_CHAT_ID
+ self.smxai_instructions = SMXAI_CHAT_INSTRUCTIONS
  self.website_description = SMXAI_WEBSITE_DESCRIPTION
  self._eda_output = {} # {chat_id: html}
  self._eda_lock = RLock()
-
+
  db.init_db()
  self.page = ""
  self.pages = db.get_pages()
@@ -88,8 +92,10 @@ class SyntaxMUI:
  self.app_token = str(uuid.uuid4()) # NEW: Unique token for each app launch.
  self.admin_pdf_chunks = {} # In-memory store for admin PDF chunks
  self.user_file_chunks = {} # In-memory store of user‑uploaded chunks, scoped per chat session
- routes.setup_routes(self)

+ self._last_llm_usage = None
+ routes.setup_routes(self)
+
  self._admin_profile = {}
  self._chat_profile = {}
  self._coding_profile = {}
@@ -561,8 +567,8 @@ class SyntaxMUI:
  def delete_embed_key(self):
  return delete_embed_key()

- def gpt_models_latest(self):
- from syntaxmatrix.settings.model_map import GPT_MODELS_LATEST
+
+ def get_gpt_models_latest(self):
  return GPT_MODELS_LATEST

  def get_text_input_value(self, key, default=""):
@@ -678,7 +684,7 @@ class SyntaxMUI:
  if _provider == "google":
  intent = google_classify_query()
  return intent
- if _model in self.gpt_models_latest():
+ if _model in self.get_gpt_models_latest():
  intent = gpt_models_latest_classify_query()
  return intent
  if _provider == "anthropic":
@@ -721,14 +727,14 @@ class SyntaxMUI:
  except Exception as e:
  return f"Summary agent error!"

- def gpt_models_latest_generated_title(reasoning_effort = "minimal", verbosity = "low"):
+ def gpt_models_latest_generated_title():
  try:
  args = set_args(
  model=_model,
  instructions=_title_profile,
  input=_instructions,
- reasoning_effort=reasoning_effort,
- verbosity=verbosity,
+ # reasoning_effort=reasoning_effort,
+ # verbosity=verbosity,
  )

  resp = _client.responses.create(**args)
@@ -740,7 +746,7 @@ class SyntaxMUI:
  try:
  response = _client.messages.create(
  model=_model,
- max_tokens=1024,
+ max_tokens=50,
  system=_title_profile,
  messages=[{"role": "user", "content":_instructions}],
  stream=False,
@@ -754,12 +760,11 @@ class SyntaxMUI:
  { "role": "system", "content": _title_profile },
  { "role": "user", "content": _instructions },
  ]
-
  try:
  response = _client.chat.completions.create(
  model=_model,
  messages=prompt,
- temperature=0,
+ temperature=0.3,
  max_tokens=50
  )
  title = response.choices[0].message.content.strip().lower()
@@ -769,7 +774,7 @@ class SyntaxMUI:

  if _provider == "google":
  title = google_generated_title()
- elif _model in self.gpt_models_latest():
+ elif _model in self.get_gpt_models_latest():
  title = gpt_models_latest_generated_title()
  elif _provider == "anthropic":
  title = anthropic_generated_title()
@@ -792,7 +797,7 @@ class SyntaxMUI:
  if not chat_profile:
  yield """<p style='color:red;'>Error: Chat profile is not configured. Add a chat profile inside the admin panel or contact your administrator.</p>
  """
- return
+ return None
  self._chat_profile = chat_profile
  self._chat_profile['client'] = _prof.get_client(chat_profile)

@@ -801,7 +806,7 @@ class SyntaxMUI:
  _model = self._chat_profile['model']

  _contents = f"""
- {self.ai_chat_instructions}\n\n
+ {self.smxai_instructions}\n\n
  Question: {query}\n
  Context: {context}\n\n
  History: {history}\n\n
@@ -809,32 +814,30 @@ class SyntaxMUI:
  """

  try:
- if _provider == "google": # Google, non openai skd series
- contents = [
- types.Content(
- role="user",
- parts=[
- types.Part.from_text(text=f"{self.ai_chat_id}\n\n{_contents}"),
- ],
- ),
- ]
-
+ if _provider == "google": # Google (non-OpenAI-SDK) series
+
  for chunk in _client.models.generate_content_stream(
  model=_model,
- contents=contents,
+ contents=_contents,
+ config=types.GenerateContentConfig(
+ system_instruction=self.smxai_identity,
+ temperature=0.3,
+ max_output_tokens=1024,
+ ),
  ):
+
  yield chunk.text

- elif _provider == "openai" and _model in self.gpt_models_latest(): # GPt 5 series
+ elif _provider == "openai" and _model in self.get_gpt_models_latest(): # GPT-5 series
  input_prompt = (
- f"{self.ai_chat_instructions}\n\n"
+ f"{self.smxai_instructions}\n\n"
  f"Generate a response to this query:\n{query}\n"
  f"based on this given context:\n{context}\n\n"
  f"(Use conversation continuity if available.)"
  )
  sid = self.get_session_id()
  prev_id = self._gpt_models_latest_prev_resp_ids.get(sid)
- args = set_args(model=_model, instructions=self.ai_chat_id, input=input_prompt, previous_id=prev_id, store=True)
+ args = set_args(model=_model, instructions=self.smxai_identity, input=input_prompt, previous_id=prev_id, store=True)

  with _client.responses.stream(**args) as s:
  for event in s:
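
For context on the new Google branch above: in the google-genai SDK the system prompt and sampling settings travel in a `GenerateContentConfig`, while the user prompt goes in `contents`. A minimal standalone sketch of the same streaming pattern (the model name and prompt strings here are illustrative placeholders, not values from this codebase):

```python
# Sketch of the generate_content_stream pattern used above (google-genai SDK).
from google import genai
from google.genai import types

client = genai.Client()  # picks up the API key from the environment

for chunk in client.models.generate_content_stream(
    model="gemini-2.0-flash",                  # placeholder model id
    contents="Summarise the uploaded table.",  # user prompt
    config=types.GenerateContentConfig(
        system_instruction="You are a helpful data analyst.",
        temperature=0.3,
        max_output_tokens=1024,
    ),
):
    print(chunk.text, end="")  # each chunk carries an incremental text delta
```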
@@ -849,7 +852,7 @@ class SyntaxMUI:
  elif _provider == "anthropic":
  with _client.messages.stream(
  max_tokens=1024,
- messages=[{"role": "user", "content":f"{self.ai_chat_id}\n\n {_contents}"},],
+ messages=[{"role": "user", "content":f"{self.smxai_identity}\n\n {_contents}"},],
  model=_model,
  ) as stream:
  for text in stream.text_stream:
@@ -857,8 +860,8 @@ class SyntaxMUI:

  else: # Assumes standard openai_sdk
  openai_sdk_prompt = [
- {"role": "system", "content": self.ai_chat_id},
- {"role": "user", "content": f"{self.ai_chat_instructions}\n\nGenerate response to this query: {query}\nbased on this context:\n{context}\nand history:\n{history}\n\nUse conversation continuity if available.)"},
+ {"role": "system", "content": self.smxai_identity},
+ {"role": "user", "content": f"{self.smxai_instructions}\n\nGenerate response to this query: {query}\nbased on this context:\n{context}\nand history:\n{history}\n\n(Use conversation continuity if available.)"},
  ]
  response = _client.chat.completions.create(
  model=_model,
@@ -883,9 +886,11 @@ class SyntaxMUI:

  self._chat_profile = chat_profile
  self._chat_profile['client'] = _prof.get_client(chat_profile)
-
+ _provider = self._chat_profile['provider']
+ _client = self._chat_profile['client']
+ _model = self._chat_profile['model']
  _contents = f"""
- {self.ai_chat_instructions}\n\n
+ {self.smxai_instructions}\n\n
  Question: {query}\n
  Context: {context}\n\n
  History: {history}\n\n
@@ -893,8 +898,8 @@ class SyntaxMUI:
  """

  openai_sdk_prompt = [
- {"role": "system", "content": self.ai_chat_id},
- {"role": "user", "content": f"""{self.ai_chat_instructions}\n\n
+ {"role": "system", "content": self.smxai_identity},
+ {"role": "user", "content": f"""{self.smxai_instructions}\n\n
  Generate response to this query: {query}\n
  based on this context:\n{context}\n
  and history:\n{history}\n\n
@@ -903,18 +908,19 @@ class SyntaxMUI:
  },
  ]

- _provider = self._chat_profile['provider']
- _client = self._chat_profile['client']
- _model = self._chat_profile['model']
-
  def google_process_query():
  try:
  response = _client.models.generate_content(
  model=_model,
- contents=f"{self.ai_chat_id}\n\n{_contents}"
+ contents=_contents,
+ config=types.GenerateContentConfig(
+ system_instruction=self.smxai_identity,
+ temperature=0.3,
+ max_output_tokens=1024,
+ ),
  )
  answer = response.text
-
+
  # answer = strip_html(answer)
  return answer
  except Exception as e:
@@ -926,7 +932,7 @@ class SyntaxMUI:
  """
  # Prepare the prompt with conversation history and context
  input = (
- f"{self.ai_chat_instructions}\n\n"
+ f"{self.smxai_instructions}\n\n"
  f"Generate a response to this query:\n{query}\n"
  f"based on this given context:\n{context}\n\n"
  f"(Use conversation continuity if available.)"
@@ -937,7 +943,7 @@ class SyntaxMUI:

  args = set_args(
  model=_model,
- instructions=self.ai_chat_id,
+ instructions=self.smxai_identity,
  input=input,
  previous_id=prev_id,
  store=True,
@@ -962,7 +968,7 @@ class SyntaxMUI:
  response = _client.messages.create(
  model=_model,
  max_tokens=1024,
- system=self.ai_chat_id,
+ system=self.smxai_identity,
  messages=[{"role": "user", "content":_contents}],
  stream=False,
  )
@@ -977,7 +983,7 @@ class SyntaxMUI:
  response = _client.chat.completions.create(
  model=_model,
  messages=openai_sdk_prompt,
- stream=False,
+ stream=False,
  )

  # -------- one-shot buffered --------
@@ -988,239 +994,373 @@ class SyntaxMUI:

  if _provider == "google":
  return google_process_query()
- if _provider == "openai" and _model in self.gpt_models_latest():
+ if _provider == "openai" and _model in self.get_gpt_models_latest():
  return gpt_models_latest_process_query(self._gpt_models_latest_prev_resp_ids.get(self.get_session_id()))
  if _provider == "anthropic":
  return anthropic_process_query()
  return openai_sdk_process_query()


- def ai_generate_code(self, question, intent, df):
+ def repair_python_cell(self, py_code: str) -> str:
+
+ _CELL_REPAIR_RULES = """
+ Fix the Python cell to satisfy:
+ - Single valid cell; imports at the top.
+ - Do not import or use 'python-dotenv'/'dotenv'; it is not needed.
+ - No top-level statements between if/elif/else branches.
+ - Regression must use either sklearn with train_test_split (then X_test exists) and R^2/MAE/RMSE,
+ or statsmodels OLS. No accuracy_score in regression.
+ - Keep all plotting + savefig + BytesIO + display inside the branch that created the figure.
+ - Return ONLY the corrected cell.
+ """
+ code = textwrap.dedent(py_code or "").strip()
+ needs_fix = False
+ if re.search(r"\baccuracy_score\b", code) and re.search(r"\bLinearRegression\b|\bOLS\b", code):
+ needs_fix = True
+ if re.search(r"\bX_test\b", code) and not re.search(r"\bX_test\s*=", code):
+ needs_fix = True
+ try:
+ ast.parse(code)
+ except SyntaxError:
+ needs_fix = True
+ if not needs_fix:
+ return code
+ _prompt = f"```python\n{code}\n```"
+
+ repair_profile = _prof.get_profile("vision2text") or _prof.get_profile("admin")
+ if not repair_profile:
+ return (
+ '<div class="smx-alert smx-alert-warn">'
+ 'No LLM profile configured for <code>coding</code> (or <code>admin</code>). <br>'
+ 'Please add the LLM profile inside the admin panel or contact your Administrator.'
+ '</div>'
+ )
+
+ _client = _prof.get_client(repair_profile)
+ _provider = repair_profile['provider'].lower()
+ _model = repair_profile['model']
+
+ #1 Google
+ if _provider == "google":
+ from google.genai import types
+
+ fixed = _client.models.generate_content(
+ model=_model,
+ contents=_prompt,
+ config=types.GenerateContentConfig(
+ system_instruction=_CELL_REPAIR_RULES,
+ temperature=0.8,
+ max_output_tokens=1024,
+ ),
+ ).text
+
+ #2 OpenAI
+ elif _provider == "openai" and _model in GPT_MODELS_LATEST:
+
+ args = set_args(
+ model=_model,
+ instructions=_CELL_REPAIR_RULES,
+ input=[{"role": "user", "content": _prompt}],
+ previous_id=None,
+ store=False,
+ reasoning_effort="medium",
+ verbosity="medium",
+ )
+ fixed = _out(_client.responses.create(**args))

- if not self._coding_profile:
- coding_profile = _prof.get_profile("coding") or _prof.get_profile("admin")
- if not coding_profile:
- # tell the user exactly what to configure
- return (
- '<div class="smx-alert smx-alert-warn">'
- 'No LLM profile configured for <code>coding</code> (or <code>admin</code>). '
- 'Please, contact your Administrator.'
- '</div>'
- )
+ # Anthropic
+ elif _provider == "anthropic":

- self._coding_profile = coding_profile
- self._coding_profile['client'] = _prof.get_client(coding_profile)
+ fixed = _client.messages.create(
+ model=_model,
+ max_tokens=1024,
+ system=_CELL_REPAIR_RULES,
+ messages=[{"role": "user", "content":_prompt}],
+ stream=False,
+ ).content[0].text
+
+ # OpenAI SDK
+ else:
+ fixed = _client.chat.completions.create(
+ model=_model,
+ messages=[
+ {"role": "system", "content":_CELL_REPAIR_RULES},
+ {"role": "user", "content":_prompt},
+ ],
+ max_tokens=1024,
+ ).choices[0].message.content
+
+ try:
+ ast.parse(fixed)
+ return fixed
+ except Exception:
+ return code
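
A standalone illustration of the three repair triggers above; the sample cell is invented for demonstration and is never executed:

```python
import ast, re, textwrap

# Hypothetical broken cell: a regression model scored with accuracy_score.
cell = textwrap.dedent("""
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import accuracy_score
    model = LinearRegression().fit(X_train, y_train)
    print(accuracy_score(y_test, model.predict(X_test)))
""").strip()

needs_fix = False
# Trigger 1: accuracy_score applied to a regression estimator.
if re.search(r"\baccuracy_score\b", cell) and re.search(r"\bLinearRegression\b|\bOLS\b", cell):
    needs_fix = True
# Trigger 2: X_test is referenced but never assigned.
if re.search(r"\bX_test\b", cell) and not re.search(r"\bX_test\s*=", cell):
    needs_fix = True
# Trigger 3: the cell does not even parse.
try:
    ast.parse(cell)
except SyntaxError:
    needs_fix = True

print(needs_fix)  # True -> the cell would be sent to the repair LLM
```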

- _client = self._coding_profile['client']
- _provider = self._coding_profile['provider']
- _model = self._coding_profile['model']
+ def get_last_llm_usage(self):
+ return getattr(self, "_last_llm_usage", None)

- context = f"Columns: {list(df.columns)}\n\nDtypes: {df.dtypes.astype(str).to_dict()}\n\n"
- ALLOWED_COLUMNS = list(df.columns)
+ def ai_generate_code(self, refined_question, tasks, df):

- ai_profile = f"""
- You are a senior Python data scientist writing production-quality, **runnable** code for a Jupyter-like kernel. You are given a pandas DataFrame named `df`. Begin ONLY the data already in `df` (no file I/O).
- """
+ def normalise_llm_code(s: str) -> str:
+ s = s.replace("\t", " ")
+ s = textwrap.dedent(s)
+ lines = s.splitlines()

- instructions = f"""
- ### Context
- - Schema (names → dtypes): {context}
- - Row count: {len(df)}
- - Task: {question}
- - Task type: {intent}
- - Allowed columns: {ALLOWED_COLUMNS}
-
- ### Hard requirements
- 1) **Code only**. No markdown, no comments, no explanations.
- 2) Import everything you use explicitly. Assume: pandas≥2, numpy≥1.25, matplotlib≥3.8, seaborn≥0.13, scikit-learn≥1.4 are available.
- 3) **Avoid deprecated / removed APIs**, e.g.:
- - pandas: do not use `.append`, `.ix`, `.as_matrix`, `DataFrame.select_dtypes(include='category')` is OK, but prefer current patterns.
- - seaborn: do not use `distplot`, `pairplot` on very large data without sampling; prefer `histplot`, `displot`, `regplot`, or FacetGrid with `.map_dataframe`.
- - scikit-learn: import from `sklearn.model_selection` (not `sklearn.cross_validation`); for confusion matrices use `ConfusionMatrixDisplay.from_estimator`; set `random_state=42` where relevant.
- 4) Be **defensive**:
- - Verify required columns exist; if any are missing, raise `ValueError("Missing columns: ...")` early.
- - Handle missing values sensibly (e.g., drop rows for simple EDA; use `ColumnTransformer` + `SimpleImputer` for modeling).
- - For categorical features in ML, use `OneHotEncoder(handle_unknown="ignore")` inside a `Pipeline`/`ColumnTransformer` (no `LabelEncoder` on features).
- 5) Keep it **fast** (kernel timeout ~8s):
- - For plots on large frames (>20k rows), downsample to ~1,000 rows (`df.sample(1000, random_state=42)`) unless aggregation is more appropriate.
- - Prefer vectorized ops; avoid O(n²) Python loops.
- 6) Always **produce at least one visible result** at the end:
- - If plotting with matplotlib/seaborn: call `plt.tight_layout(); plt.show()`.
- - If producing a table or metrics: from `syntaxmatrix.display import show` then `show(object_or_dataframe)`.
- 7) Follow task type conventions:
- - **EDA/Stats**: compute the requested stat, then show a relevant table (e.g., summary/crosstab) or plot.
- - **Classification**: train/valid split (`train_test_split`), build a pipeline with scaling/encoding as needed, fit, show accuracy **and** a confusion matrix via `ConfusionMatrixDisplay.from_estimator(...); plt.show()`. Also show `classification_report` as a dataframe if short.
- - **Regression**: train/valid split, pipeline as needed, fit, show R² and MAE; plot predicted vs actual scatter.
- - **Correlation/Chi-square/ANOVA**: compute the statistic + p-value and show a concise result table (with `show(...)`) and, when sensible, a small plot (heatmap/bar).
- 8) Don't mutate or recreate target columns if they already exist (e.g., if asked to “predict TARGET”, use `y = df['TARGET']` as-is).
- 9) Keep variable names short and clear; prefer `num_cols` / `cat_cols` discovery by dtype.
- 10) You MUST NOT reference any column outside Allowed columns: {ALLOWED_COLUMNS}\n.
- 11) If asked to predict/classify, choose the target by matching the task text to Allowed columns: {ALLOWED_COLUMNS}\n and never invent a new name (e.g., 'whether', 'the').
-
- ### Output
- Return **only runnable Python** that:
- - Imports what it needs,
- - Validates columns,
- - Solves: {question},
- - And ends with at least one visible output (`show(...)` and/or `plt.show()`).
- """
+ # drop leading blank lines
+ while lines and not lines[0].strip():
+ lines.pop(0)
+
+ # if everything is still indented >=4 spaces, shift left
+ indents = [len(l) - len(l.lstrip(" ")) for l in lines if l.strip()]
+ if indents and min(indents) >= 4:
+ m = min(indents)
+ lines = [l[m:] if len(l) >= m else l for l in lines]
+
+ return "\n".join(lines)
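
What `normalise_llm_code` buys in practice, shown on an invented LLM reply whose lines all arrive indented:

```python
import textwrap

# Hypothetical LLM output: a leading blank line plus uniform indentation.
raw = "\n    import pandas as pd\n    show(df.head())"

s = textwrap.dedent(raw.replace("\t", "    "))  # tabs -> spaces, strip common indent
lines = s.splitlines()
while lines and not lines[0].strip():           # drop leading blank lines
    lines.pop(0)

print("\n".join(lines))
# import pandas as pd
# show(df.head())
```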

- def google_generate_code():
+ CONTEXT = f"Columns: {list(df.columns)}\n\nDtypes: {df.dtypes.astype(str).to_dict()}\n\n"
+ AVAILABLE_COLUMNS = list(df.columns)
+
+ # --- SMX: normalise tasks coming from intent agent ---
+ if isinstance(tasks, str):
+ import json, ast, re
  try:
- # Combine system prompt and instructions for Gemini
- full_prompt = f"{ai_profile}\n\n{instructions}"
-
- # Gemini expects a simple generate_content call with the model and contents
- response = _client.models.generate_content(
- model=_model,
- contents=full_prompt
- )
-
- # Extract text from response
- if hasattr(response, 'text'):
- return response.text
- elif hasattr(response, 'candidates') and response.candidates:
- candidate = response.candidates[0]
- if hasattr(candidate.content, 'parts'):
- return ''.join(part.text for part in candidate.content.parts if hasattr(part, 'text'))
- return str(response)
-
- except Exception as e:
- print(f"Google Gemini code generation error: {e}")
- # Return a basic analysis code with ALL necessary imports
- return """
- import pandas as pd
- import matplotlib.pyplot as plt
- import seaborn as sns
- import numpy as np
- import io
- import base64
- from syntaxmatrix.display import show
-
- print("Basic DataFrame Info:")
- print(f"Shape: {df.shape}")
- print("\\nColumns and dtypes:")
- print(df.dtypes)
- print("\\nBasic statistics:")
- show(df.describe())
-
- print("\\nFirst few rows:")
- show(df.head())
-
- # Generate a simple visualization based on available columns
- plt.figure(figsize=(10, 6))
-
- if len(df.columns) >= 2:
- # Try to find numeric columns for scatter plot
- numeric_cols = df.select_dtypes(include=['number']).columns
- if len(numeric_cols) >= 2:
- sns.scatterplot(data=df, x=numeric_cols[0], y=numeric_cols[1])
- plt.title(f"Scatter plot: {numeric_cols[0]} vs {numeric_cols[1]}")
- plt.tight_layout()
- plt.show()
- else:
- # Use first column for bar plot
- top_values = df[df.columns[0]].value_counts().head(10)
- top_values.plot(kind='bar')
- plt.title(f"Top 10 values in {df.columns[0]}")
- plt.tight_layout()
- plt.show()
- else:
- # Single column analysis
- if len(df.columns) == 1:
- col_name = df.columns[0]
- if df[col_name].dtype in ['object', 'category']:
- df[col_name].value_counts().head(10).plot(kind='bar')
- plt.title(f"Value counts for {col_name}")
- else:
- df[col_name].hist(bins=20)
- plt.title(f"Distribution of {col_name}")
- plt.tight_layout()
- plt.show()
- else:
- print("Insufficient columns for detailed analysis")
- show(df)
- """
+ tasks_parsed = json.loads(tasks)
+ except Exception:
+ try:
+ tasks_parsed = ast.literal_eval(tasks)
+ except Exception:
+ tasks_parsed = re.findall(r"[A-Za-z_]+", tasks)
+ tasks = tasks_parsed
+ if not isinstance(tasks, list):
+ tasks = [str(tasks)]
+ tasks = [str(t).strip().lower() for t in tasks if str(t).strip()]
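
The fallback chain above (json.loads → ast.literal_eval → regex word extraction) accepts whatever shape the intent agent returns. A small sketch with invented inputs:

```python
import json, ast, re

def normalise_tasks(tasks):
    # Mirror of the fallback chain above, for illustration only.
    if isinstance(tasks, str):
        try:
            tasks = json.loads(tasks)                     # '["EDA", "Regression"]'
        except Exception:
            try:
                tasks = ast.literal_eval(tasks)           # "['eda', 'regression']"
            except Exception:
                tasks = re.findall(r"[A-Za-z_]+", tasks)  # "eda, viz_pie"
    if not isinstance(tasks, list):
        tasks = [str(tasks)]
    return [str(t).strip().lower() for t in tasks if str(t).strip()]

print(normalise_tasks('["EDA", "Regression"]'))  # ['eda', 'regression']
print(normalise_tasks("eda, viz_pie"))           # ['eda', 'viz_pie']
```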
+
+ ai_profile = """
+ - You are a Python expert specializing in data science and machine learning.
+ - Your task is to generate a single, complete, production-quality, executable Python script for a Jupyter-like Python kernel, based on the given instructions.
+ - The dataset is already loaded as a pandas DataFrame named `df` (no file I/O or file uploads).
+ - Make a copy of `df` named `df_copy`. Preprocess and clean it, producing `df_cleaned` (if not already clean). Then use `df_cleaned` to perform the ML tasks described in the given context.
+ - Select your features and target from `df_cleaned` with care, and name the column list `required_cols`.
+ - Create `df_filtered` by doing: df_filtered = df_cleaned[required_cols].
+ - Use the {TEMPLATE_CATALOGUE} below to decide which visualizations to implement in the code.
+ - The final output MUST be only the complete, executable Python code required to fulfill the user's request, enclosed in a single markdown code block (```python ... ```). See the {tasks} below.
+ - Do not include any explanatory text or markdown outside the code block.
+ """

- def gpt_models_latest_generate_code(reasoning_effort = "medium", verbosity = "medium"):
- try:
- args = set_args(
- model=_model,
- instructions=ai_profile,
- input=instructions,
- reasoning_effort=reasoning_effort,
- verbosity=verbosity,
- )
+ TEMPLATE_CATALOGUE = """
+ ### Available SyntaxMatrix templates (use these instead of inventing new helpers)
+
+ Visualisation templates (dataset-agnostic):
+ - viz_pie(df, category_col=None, top_k=8): pie/donut shares within a category.
+ - viz_stacked_bar(df, x=None, hue=None, normalise=True): composition across groups.
+ - viz_count_bar(df, category_col=None, top_k=12): counts/denominators by category.
+ - viz_box(df, x=None, y=None): spread/outliers of numeric by category.
+ - viz_scatter(df, x=None, y=None, hue=None): relationship between two numeric vars.
+ - viz_distribution(df, col=None): histogram-style distribution for numeric.
+ - viz_kde(df, col=None): density curve for numeric.
+ - viz_area(df, time_col=None, y_col=None): area/trend over time.
+ - viz_line(df, x=None, y=None, hue=None): line/trend plot.
+
+ ML/stat templates:
+ - classification(df): standard classification pipeline + metrics + plots.
+ - regression(df): standard regression pipeline + metrics + plots.
+ - clustering(df): clustering workflow + cluster plots.
+ - anomaly_detection(df)
+ - ts_anomaly_detection(df)
+ - time_series_forecasting(df)
+ - time_series_classification(df, entity_col, time_col, target_col)
+ - dimensionality_reduction(df)
+ - feature_selection(df)
+ - eda_overview(df)
+ - eda_correlation(df)
+ - multilabel_classification(df, label_cols)
+ - recommendation(df)
+ - topic_modelling(df)
+ """
+
+ instructions = (
+ "### Context"
+ f"- DataFrame - (`df`): {df}"
+ f"- Schema (names → dtypes): {CONTEXT}"
+ f"- Row count: {len(df)}"
+ f"- Task description: {refined_question}"
+ f"- Tasks: {tasks}"
+ f"- Available columns: {AVAILABLE_COLUMNS}"
+ f"- Template catalogue: {TEMPLATE_CATALOGUE}"

- resp = _client.responses.create(**args)
- code = _out(resp)
- return code
- except Exception as e:
- return f"Error!"
+ """
+ ### Template rules
+ - You MAY call a template if it matches the task.
+ - Do NOT invent template names.
+ - If no template fits, write minimal direct pandas/sklearn/seaborn code instead.
+ - Keep the solution short: avoid writing wrappers/utilities already handled by SyntaxMatrix hardener.
+
+ #### Template selection hint examples:
+ - If the task asks for pie/donut/composition shares → use viz_pie.
+ - If it asks for denominators/counts per category → viz_count_bar.
+ - If it asks for spread/outliers/comparison across groups → viz_box.
+ - If it asks for relationship / “X vs Y” → viz_scatter.
+ - If it asks for trend over time → viz_line or viz_area.
+
+ ### Hard requirements
+ 1) Code only. No markdown, no comments, no explanations.
+ 2) Import everything you use explicitly.
+ - Use pandas/numpy/matplotlib by default.
+ - Seaborn may be unavailable at runtime; **do not import seaborn inside your code**.
+ - If you call sns.*, assume sns is already defined by the framework.
+ 3) Avoid deprecated / removed APIs, e.g.:
+ - pandas: do not use `.append`, `.ix`, `.as_matrix`; prefer current patterns.
+ - seaborn: do not use `distplot`; avoid `pairplot` on very large data unless sampling.
+ - scikit-learn: import from `sklearn.model_selection` (not `sklearn.cross_validation`);
+ set `random_state=42` where relevant.
+ 4) Be defensive, but avoid hard-failing on optional fields:
+ - If the primary column needed to answer the question is missing, review your copy of `df` again.
+ Make sure that you selected the proper column.
+ Never use a column/variable which isn't available or defined.
+ - If a secondary/extra column is missing, show a warning with `show(...)` and continue using available fields.
+ - Handle missing values sensibly (drop rows for simple EDA; use `ColumnTransformer` + `SimpleImputer` for modelling).
+ - For categorical features in ML, use `OneHotEncoder(handle_unknown="ignore")`
+ inside a `Pipeline`/`ColumnTransformer` (no `LabelEncoder` on features).
+ 5) Keep it fast (kernel timeout ~8s):
+ - For plots on large frames (>20k rows), downsample to ~1,000 rows
+ (`df.sample(1000, random_state=42)`) unless aggregation is more appropriate.
+ - Prefer vectorised ops; avoid O(n²) Python loops.
+ 6) Keep the solution compact:
+ - Do not define large helper libraries or long “required column” sets.
+ - Aim for ≤120 lines excluding imports.
+ 7) Always produce at least one visible result at the end:
+ - If plotting with matplotlib/seaborn: call `plt.tight_layout(); plt.show()`.
+ - If producing a table or metrics:
+ `from syntaxmatrix.display import show` then `show(object_or_dataframe)`.
+ 8) Follow task type conventions:
+ - **EDA/Stats**: compute the requested stat, then show a relevant table
+ (e.g., summary/crosstab) or plot.
+ - **Classification**: train/valid split (`train_test_split`), pipeline with scaling/encoding,
+ fit, show accuracy and a confusion matrix via
+ `ConfusionMatrixDisplay.from_estimator(...); plt.show()`.
+ Also show `classification_report` as a dataframe if short.
+ - **Regression**: train/valid split, pipeline as needed, fit, show R² and MAE;
+ plot predicted vs actual scatter.
+ - **Correlation/Chi-square/ANOVA**: compute the statistic + p-value and show a concise
+ result table (with `show(...)`) and, when sensible, a small plot (heatmap/bar).
+ 9) Don't mutate or recreate target columns if they already exist.
+ 10) Keep variable names short and clear; prefer `num_cols` / `cat_cols` discovery by dtype.
+ 11) You MUST NOT reference any column outside Available columns: {AVAILABLE_COLUMNS}.
+ 12) If asked to predict/classify, choose the target by matching the task text to Available columns
+ and never invent a new name.
+
+ #### Cohort rules
+ When you generate plots for cohorts or categories, you MUST obey these rules:
+ 1) ALWAYS guard cohort masks:
+ - After you define something like:
+ _mask_a = (df['BMI'] < 18.5) & df['BMI'].notna()
+ _mask_b = ~(df['BMI'] < 18.5) & df['BMI'].notna()
+ compute their sizes:
+ n_a = int(_mask_a.sum())
+ n_b = int(_mask_b.sum())
+ - If a mask has no rows (or almost none), do NOT draw an empty plot.
+ Instead call:
+ show(f"Skipping cohort '{label}': no rows after filtering.")
+ and return.
+
+ 2) Before any groupby / crosstab for a plot:
+ - Fill missing categories so groupby does not drop everything:
+ df[col] = df[col].fillna("Unknown")
+ - After building the table:
+ tab = tmp.groupby([...]).size().unstack(...).fillna(0)
+ ALWAYS check:
+ if tab.empty:
+ show(f"Skipping plot for {col}: no data after grouping.")
+ continue
+ Only call .plot(...) if the table is non-empty.
+
+ 3) For value_counts-based plots:
+ - If the Series is empty after filtering (len(s) == 0),
+ do NOT draw a figure. Just call:
+ show(f"No data available to plot for {col} in this cohort.")
+ and skip.
+
+ 4) Never try to “hide” an error with a blank plot.
+ A blank chart is treated as a bug. If there is no data, explain it
+ clearly using show(...), and avoid calling matplotlib/Seaborn.
+
+ 5) Never use print(...). All user-visible diagnostics go through show(...).

- def anthropic_generate_code():
- try:
- response = _client.messages.create(
- model=_model,
- max_tokens=1024,
- system=ai_profile,
- messages=[{"role": "user", "content":instructions}],
- stream=False,
+
+ ### Output
+ Return only runnable Python that:
+ - Imports what it needs,
+ - Validates columns,
+ - Visualises tables, charts, and graphs, each with an appropriate caption,
+ - Implements {tasks} to solve {refined_question},
+ - And ends with at least 3 visible outputs (`show(...)` and/or `plt.show()`).
+ """)
+
+ if not self._coding_profile:
+ coding_profile = _prof.get_profile("coding") or _prof.get_profile("admin")
+ if not coding_profile:
+ return (
+ '<div class="smx-alert smx-alert-warn">'
+ 'No LLM profile configured for <code>coding</code> (or <code>admin</code>). <br>'
+ 'Please add the LLM profile inside the admin panel or contact your Administrator.'
+ '</div>'
  )
- return response.content[0].text.strip()
- except Exception as e:
- return f"Error!"

- def openai_sdk_generate_code():
- try:
- response = _client.chat.completions.create(
- model=_model,
- messages=[
- {"role": "system", "content": ai_profile},
- {"role": "user", "content": instructions},
- ],
- temperature=0.3,
- max_tokens=2048,
- )
- return response.choices[0].message.content
- except Exception as e:
- return "Error!"
+ self._coding_profile = coding_profile
+ self._coding_profile['client'] = _prof.get_client(coding_profile)
+
+ # code = mlearning_agent(instructions, ai_profile, self._coding_profile)
+ code, usage = mlearning_agent(instructions, ai_profile, self._coding_profile)
+ self._last_llm_usage = usage

- if _provider == 'google':
- code = google_generate_code()
- elif _provider == "openai" and _model in self.gpt_models_latest():
- code = gpt_models_latest_generate_code()
- elif _provider == "anthropic":
- code = anthropic_generate_code()
- else:
- code = openai_sdk_generate_code()
-
  if code:
+ import re
+ code = normalise_llm_code(code)
+
  m = re.search(r"```(?:python)?\s*(.*?)\s*```", code, re.DOTALL | re.IGNORECASE)
  if m:
  code = m.group(1).strip()
- code = drop_bad_classification_metrics(code, df)

  if "import io" not in code and "io.BytesIO" in code:
  lines = code.split('\n')
  import_lines = []
  other_lines = []
-
+
  for line in lines:
  if line.strip().startswith('import ') or line.strip().startswith('from '):
  import_lines.append(line)
  else:
  other_lines.append(line)
-
- # Add missing io import
+
  if "import io" not in '\n'.join(import_lines):
  import_lines.append('import io')
-
+
  code = '\n'.join(import_lines + [''] + other_lines)
-
+
+ TEMPLATE_NAMES = [
+ "viz_pie","viz_stacked_bar","viz_count_bar","viz_box","viz_scatter",
+ "viz_distribution","viz_kde","viz_area","viz_line",
+ "classification","regression","clustering","anomaly_detection",
+ "ts_anomaly_detection","time_series_forecasting","time_series_classification",
+ "dimensionality_reduction","feature_selection","eda_overview","eda_correlation",
+ "multilabel_classification","recommendation","topic_modelling"
+ ]
+
+ used = [t for t in TEMPLATE_NAMES if re.search(rf"\b{t}\s*\(", code)]
+ if used:
+ import_line = (
+ "from syntaxmatrix.agentic.model_templates import " +
+ ", ".join(sorted(set(used)))
+ )
+ if import_line not in code:
+ code = import_line + "\n" + code
+
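
(Note: the published cell used `rf"\\b{t}\\s*\\("`, which in a raw f-string matches a literal backslash rather than a word boundary, so `used` was always empty; the detection regex is corrected above. It can be sanity-checked in isolation; the sample `code` string is invented:)

```python
import re

TEMPLATE_NAMES = ["viz_pie", "viz_box", "regression"]
code = 'viz_box(df, x="region", y="sales")\nresult = regression(df)'

# \b ensures whole-name matches; \s*\( requires an actual call site.
used = [t for t in TEMPLATE_NAMES if re.search(rf"\b{t}\s*\(", code)]
print(used)  # ['viz_box', 'regression']

import_line = ("from syntaxmatrix.agentic.model_templates import "
               + ", ".join(sorted(set(used))))
print(import_line)
# from syntaxmatrix.agentic.model_templates import regression, viz_box
```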
  return code.strip()
-
-
+
+ return "Error: AI code generation failed."
+
+
  def sanitize_rough_to_markdown_task(self, rough: str) -> str:
  """
  Return only the Task text (no tags).