syntaxmatrix 2.5.1__py3-none-any.whl → 2.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- syntaxmatrix/__init__.py +3 -3
- syntaxmatrix/commentary.py +134 -112
- syntaxmatrix/core.py +449 -338
- syntaxmatrix/dataset_preprocessing.py +218 -0
- syntaxmatrix/display.py +89 -37
- syntaxmatrix/gpt_models_latest.py +5 -4
- syntaxmatrix/profiles.py +19 -4
- syntaxmatrix/routes.py +932 -131
- syntaxmatrix/settings/model_map.py +38 -30
- syntaxmatrix/static/icons/hero_bg.jpg +0 -0
- syntaxmatrix/templates/dashboard.html +256 -55
- syntaxmatrix/utils.py +2254 -84
- {syntaxmatrix-2.5.1.dist-info → syntaxmatrix-2.5.3.dist-info}/METADATA +3 -1
- {syntaxmatrix-2.5.1.dist-info → syntaxmatrix-2.5.3.dist-info}/RECORD +17 -18
- syntaxmatrix/model_templates.py +0 -29
- syntaxmatrix/smx_task_runner.py +0 -12
- syntaxmatrix/smx_usage_example.py +0 -4
- {syntaxmatrix-2.5.1.dist-info → syntaxmatrix-2.5.3.dist-info}/WHEEL +0 -0
- {syntaxmatrix-2.5.1.dist-info → syntaxmatrix-2.5.3.dist-info}/licenses/LICENSE.txt +0 -0
- {syntaxmatrix-2.5.1.dist-info → syntaxmatrix-2.5.3.dist-info}/top_level.txt +0 -0
syntaxmatrix/core.py
CHANGED
@@ -1,7 +1,10 @@
 from __future__ import annotations
+import ast
+import textwrap
 import os, webbrowser, uuid, secrets, re
 
 from flask import Flask, Response, session, request, has_request_context
+from syntaxmatrix.agentic.agents import mlearning_agent
 from syntaxmatrix.history_store import SQLHistoryStore as Store, PersistentHistoryStore as _Store
 from collections import OrderedDict
 from syntaxmatrix.llm_store import save_embed_model, load_embed_model, delete_embed_key
@@ -17,7 +20,6 @@ from syntaxmatrix.settings.prompts import SMXAI_CHAT_ID, SMXAI_CHAT_INSTRUCTIONS
 from typing import List, Generator
 from .auth import init_auth_db
 from . import profiles as _prof
-from syntaxmatrix.utils import strip_describe_slice, drop_bad_classification_metrics
 from syntaxmatrix.smiv import SMIV
 from .project_root import detect_project_root
 from syntaxmatrix.gpt_models_latest import extract_output_text as _out, set_args
@@ -25,6 +27,8 @@ from dotenv import load_dotenv
 from html import unescape
 from .plottings import render_plotly, pyplot, describe_plotly, describe_matplotlib
 from threading import RLock
+from syntaxmatrix.settings.model_map import GPT_MODELS_LATEST
+
 
 # ──────── framework‐local storage paths ────────
 # this ensures the key & data always live under the package dir,
@@ -46,7 +50,7 @@ EDA_OUTPUT = {}  # global buffer for EDA output by session
 
 class SyntaxMUI:
     def __init__(self,
-
+                 host="127.0.0.1",
                  port="5080",
                  user_icon="👩🏿🦲",
                  bot_icon="<img src='/static/icons/favicon.png' width=20' alt='bot'/>",
@@ -76,7 +80,7 @@ class SyntaxMUI:
         self.website_description = SMXAI_WEBSITE_DESCRIPTION
         self._eda_output = {}  # {chat_id: html}
         self._eda_lock = RLock()
-
+
         db.init_db()
         self.page = ""
         self.pages = db.get_pages()
@@ -88,14 +92,15 @@ class SyntaxMUI:
         self.app_token = str(uuid.uuid4())  # NEW: Unique token for each app launch.
         self.admin_pdf_chunks = {}  # In-memory store for admin PDF chunks
         self.user_file_chunks = {}  # In-memory store of user‑uploaded chunks, scoped per chat session
-        routes.setup_routes(self)
 
+        self._last_llm_usage = None
+        routes.setup_routes(self)
+
         self._admin_profile = {}
         self._chat_profile = {}
         self._coding_profile = {}
         self._classification_profile = {}
         self._summarization_profile = {}
-        self.vision2text_profile = {}
 
         self._gpt_models_latest_prev_resp_ids = {}
         self.is_streaming = False
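
Note: `__init__` now seeds `self._last_llm_usage = None` before `routes.setup_routes(self)` runs, so route handlers can read the attribute from the very first request. A minimal sketch of the intended read-back (the `get_last_llm_usage` accessor is added further down in this diff; construction arguments are omitted and assumed to default):

    from syntaxmatrix.core import SyntaxMUI

    smx = SyntaxMUI()
    assert smx.get_last_llm_usage() is None  # nothing recorded until ai_generate_code runs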
@@ -282,12 +287,14 @@ class SyntaxMUI:
 
     @staticmethod
     def get_ui_modes():
-        return list(UI_MODES.keys())
+        return list(UI_MODES.keys())
+        # return "default", "card", "bubble", "smx"
 
     @staticmethod
     def get_themes():
         return list(DEFAULT_THEMES.keys())
 
+
     def set_theme(self, theme_name, theme=None):
         if theme_name in DEFAULT_THEMES:
             self.theme = DEFAULT_THEMES[theme_name]
@@ -319,8 +326,8 @@ class SyntaxMUI:
     def set_project_name(self, project_name):
         self.project_name = project_name
 
-    def set_favicon(self, icon):
-
+    # def set_favicon(self, icon):
+    #     self.favicon = icon
 
     def set_site_logo(self, logo):
         self.site_logo = logo
@@ -453,7 +460,7 @@ class SyntaxMUI:
         except Exception as e:
             self.error(f"Plotly rendering failed: {e}")
 
-
+
     def write(self, content):
         self.bot_message(content)
 
@@ -465,15 +472,19 @@ class SyntaxMUI:
         if end:  # final flush → history
             self.bot_message(chunk)  # persists the final message
 
+
     def error(self, content):
         self.bot_message(f'<div style="color:red; font-weight:bold;">{content}</div>')
 
+
     def warning(self, content):
         self.bot_message(f'<div style="color:orange; font-weight:bold;">{content}</div>')
 
+
     def success(self, content):
         self.bot_message(f'<div style="color:green; font-weight:bold;">{content}</div>')
 
+
     def info(self, content):
         self.bot_message(f'<div style="color:blue;">{content}</div>')
 
@@ -503,15 +514,18 @@ class SyntaxMUI:
     # ──────────────────────────────────────────────────────────────
     # *********** LLM CLIENT HELPERS **********************
     # ──────────────────────────────────────────────────────────────
-    def
-        self.
+    def set_prompt_profile(self, profile):
+        self.ai_chat_id = profile
 
-
-
+
+    def set_prompt_instructions(self, instructions):
+        self.ai_chat_instructions = instructions
+
 
     def set_website_description(self, desc):
         self.website_description = desc
 
+
     def embed_query(self, q):
         return embed_text(q)
 
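
Note: the two setters that appear truncated on the removed side are completed here as `set_prompt_profile` and `set_prompt_instructions`, which simply assign `ai_chat_id` and `ai_chat_instructions`. A hedged usage sketch (the argument values are illustrative, not taken from the package):

    smx = SyntaxMUI()
    smx.set_prompt_profile("smxai-chat")              # stored on smx.ai_chat_id
    smx.set_prompt_instructions("Answer concisely.")  # stored on smx.ai_chat_instructions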
@@ -553,16 +567,8 @@ class SyntaxMUI:
     def delete_embed_key(self):
         return delete_embed_key()
 
-
-
-
-    def get_stream_args(self):
-        return self.stream_args
-    def stream(self):
-        return self.is_streaming
-
-    def gpt_models_latest(self):
-        from syntaxmatrix.settings.model_map import GPT_MODELS_LATEST
+
+    def get_gpt_models_latest(self):
         return GPT_MODELS_LATEST
 
     def get_text_input_value(self, key, default=""):
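
Note: `gpt_models_latest()` is renamed to `get_gpt_models_latest()` and now returns the `GPT_MODELS_LATEST` constant imported once at module top, rather than re-importing it on every call. Reduced to a standalone sketch:

    from syntaxmatrix.settings.model_map import GPT_MODELS_LATEST

    def get_gpt_models_latest():
        # plain getter over the module-level constant; no per-call import
        return GPT_MODELS_LATEST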
@@ -575,13 +581,23 @@ class SyntaxMUI:
             return q, None
         return q, intent
 
+    def enable_stream(self):
+        self.is_streaming = True
+
+    def stream(self):
+        return self.is_streaming
+
+    def get_stream_args(self):
+        return self.stream_args
+
+
     def classify_query_intent(self, query: str) -> str:
-
+        from syntaxmatrix.gpt_models_latest import extract_output_text as _out, set_args
+
         if not self._classification_profile:
-            classification_profile = _prof.get_profile('classification') or _prof.get_profile('admin')
+            classification_profile = _prof.get_profile('classification') or _prof.get_profile('chat') or _prof.get_profile('admin')
             if not classification_profile:
-
-                return None
+                return {"Error": "Set a profile for Classification"}
             self._classification_profile = classification_profile
             self._classification_profile['client'] = _prof.get_client(classification_profile)
 
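
Note: the streaming accessors relocated here now sit together: `enable_stream()` flips `is_streaming` (initialised to False in `__init__`), `stream()` reads it back, and `get_stream_args()` returns the dict that `stream_process_query(...)` fills. A hypothetical round trip (driver code assumed, including that `stream_args` is initialised elsewhere as a dict):

    smx = SyntaxMUI()
    assert smx.stream() is False
    smx.enable_stream()
    assert smx.stream() is True
    smx.stream_process_query("q", context="", conversations=[], sources=[])
    args = smx.get_stream_args()  # {'query': 'q', 'context': '', ...}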
@@ -590,13 +606,13 @@ class SyntaxMUI:
         _model = self._classification_profile['model']
 
         # New instruction format with hybrid option
-        _intent_profile = "You are an intent classifier. Respond ONLY with the intent name.
+        _intent_profile = "You are an intent classifier. Respond ONLY with the intent name."
         _instructions = f"""
         Classify the given query into ONE of these intents You must return ONLY the intent name with no comment or any preamble:
         - "none": Casual chat/greetings
         - "user_docs": Requires user-uploaded documents
-        - "system_docs": Requires company
-        - "hybrid": Requires BOTH
+        - "system_docs": Requires company knowledge/docs
+        - "hybrid": Requires BOTH user docs AND company docs
 
         Examples:
         Query: "Hi there!" → none
@@ -606,11 +622,15 @@ class SyntaxMUI:
         Query: "What is the weather today?" → none
         Query: "Cross-reference the customer feedback from my uploaded survey results with our product's feature list in the official documentation." → hybrid
 
-        Now classify
+        Now classify:
         Query: "{query}"
         Intent:
         """
-
+        openai_sdk_messages = [
+            {"role": "system", "content": _intent_profile},
+            {"role": "user", "content": _instructions}
+        ]
+
         def google_classify_query():
             response = _client.models.generate_content(
                 model=_model,
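
Note: the classifier prompt is now assembled once as `openai_sdk_messages` and reused by the OpenAI-SDK path further down (previously the list literal was inlined inside the call). The two-message shape, isolated as a sketch with an abridged instruction string:

    _intent_profile = "You are an intent classifier. Respond ONLY with the intent name."
    _instructions = 'Classify the given query ... Query: "Hi there!" Intent:'  # abridged
    openai_sdk_messages = [
        {"role": "system", "content": _intent_profile},
        {"role": "user", "content": _instructions},
    ]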
@@ -638,8 +658,8 @@ class SyntaxMUI:
             try:
                 response = _client.messages.create(
                     model=_model,
-                    max_tokens=
-                    system
+                    max_tokens=1024,
+                    system=_intent_profile,
                     messages=[{"role": "user", "content":_instructions}],
                     stream=False,
                 )
@@ -651,11 +671,8 @@ class SyntaxMUI:
         def openai_sdk_classify_query():
             try:
                 response = _client.chat.completions.create(
-                    model
-                    messages
-                        {"role": "system", "content": _intent_profile},
-                        {"role": "user", "content": _instructions}
-                    ],
+                    model=_model,
+                    messages=openai_sdk_messages,
                     temperature=0,
                     max_tokens=100
                 )
@@ -665,21 +682,25 @@ class SyntaxMUI:
                 return f"Error!"
 
         if _provider == "google":
-
-
-
+            intent = google_classify_query()
+            return intent
+        if _model in self.get_gpt_models_latest():
+            intent = gpt_models_latest_classify_query()
+            return intent
         if _provider == "anthropic":
-
+            intent = anthropic_classify_query()
+            return intent
         else:
-
-
+            intent = openai_sdk_classify_query()
+            return intent
+
+
     def generate_contextual_title(self, chat_history):
 
         if not self._summarization_profile:
-            summarization_profile = _prof.get_profile('summarization') or _prof.get_profile('admin')
+            summarization_profile = _prof.get_profile('summarization') or _prof.get_profile('chat') or _prof.get_profile('admin')
             if not summarization_profile:
-
-                return None
+                return {"Error": "Chat profile not set yet."}
 
             self._summarization_profile = summarization_profile
             self._summarization_profile['client'] = _prof.get_client(summarization_profile)
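
Note: the classify dispatch body is filled in here: Google first, then any model listed in `get_gpt_models_latest()`, then Anthropic, with the generic OpenAI-SDK path as the fallback. The equivalent control flow, extracted as a standalone sketch:

    def pick_classifier(provider: str, model: str, latest: list) -> str:
        # mirrors the branch order introduced in this hunk
        if provider == "google":
            return "google"
        if model in latest:
            return "gpt_models_latest"
        if provider == "anthropic":
            return "anthropic"
        return "openai_sdk"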
@@ -706,14 +727,14 @@ class SyntaxMUI:
             except Exception as e:
                 return f"Summary agent error!"
 
-        def gpt_models_latest_generated_title(
+        def gpt_models_latest_generated_title():
             try:
                 args = set_args(
                     model=_model,
                     instructions=_title_profile,
                     input=_instructions,
-                    reasoning_effort=reasoning_effort,
-                    verbosity=verbosity,
+                    # reasoning_effort=reasoning_effort,
+                    # verbosity=verbosity,
                 )
 
                 resp = _client.responses.create(**args)
@@ -725,7 +746,7 @@ class SyntaxMUI:
             try:
                 response = _client.messages.create(
                     model=_model,
-                    max_tokens=
+                    max_tokens=50,
                     system=_title_profile,
                     messages=[{"role": "user", "content":_instructions}],
                     stream=False,
@@ -739,12 +760,11 @@ class SyntaxMUI:
                 { "role": "system", "content": _title_profile },
                 { "role": "user", "content": _instructions },
             ]
-
             try:
                 response = _client.chat.completions.create(
                     model=_model,
                     messages=prompt,
-                    temperature=0,
+                    temperature=0.3,
                     max_tokens=50
                 )
                 title = response.choices[0].message.content.strip().lower()
@@ -754,7 +774,7 @@ class SyntaxMUI:
 
         if _provider == "google":
             title = google_generated_title()
-        elif _model in self.
+        elif _model in self.get_gpt_models_latest():
             title = gpt_models_latest_generated_title()
         elif _provider == "anthropic":
             title = anthropic_generated_title()
@@ -762,26 +782,22 @@ class SyntaxMUI:
             title = openai_sdk_generated_title()
         return title
 
+
     def stream_process_query(self, query, context, conversations, sources):
         self.stream_args['query'] = query
         self.stream_args['context'] = context
         self.stream_args['conversations'] = conversations
         self.stream_args['sources'] = sources
 
+
     def process_query_stream(self, query: str, context: str, history: list, stream=True) -> Generator[str, None, None]:
 
         if not self._chat_profile:
             chat_profile = _prof.get_profile("chat") or _prof.get_profile("admin")
             if not chat_profile:
-                yield
-
-
-                <p style='color:red;'>Error: Chat profile is not configured. Add a chat profile inside the admin panel or contact your administrator.
-                </p>
-                </div>
-                """
-                )
-                return
+                yield """<p style='color:red;'>Error: Chat profile is not configured. Add a chat profile inside the admin panel or contact your administrator.</p>
+                """
+                return None
             self._chat_profile = chat_profile
             self._chat_profile['client'] = _prof.get_client(chat_profile)
 
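
Note: when no chat profile exists, `process_query_stream` now yields a single HTML error chunk and returns, replacing the old malformed multi-line `yield` block, so callers can keep one consumption loop for both the happy and error paths. A hypothetical consumer:

    for chunk in smx.process_query_stream("Hello", context="", history=[]):
        print(chunk, end="", flush=True)  # error text arrives as an ordinary chunk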
@@ -798,23 +814,21 @@ class SyntaxMUI:
         """
 
         try:
-            if _provider == "google":
-
-                    types.Content(
-                        role="user",
-                        parts=[
-                            types.Part.from_text(text=f"{self.smxai_identity}\n\n{_contents}"),
-                        ],
-                    ),
-                ]
-
+            if _provider == "google":  # Google, non openai skd series
+
                 for chunk in _client.models.generate_content_stream(
                     model=_model,
-                    contents=
+                    contents=_contents,
+                    config=types.GenerateContentConfig(
+                        system_instruction=self.smxai_identity,
+                        temperature=0.3,
+                        max_output_tokens=1024,
+                    ),
                 ):
+
                     yield chunk.text
 
-            elif _model in self.
+            elif _provider == "openai" and _model in self.get_gpt_models_latest():  # GPt 5 series
                 input_prompt = (
                     f"{self.smxai_instructions}\n\n"
                     f"Generate a response to this query:\n{query}\n"
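
Note: the streaming Google path drops the hand-built `types.Content(...)` payload and instead passes `contents=_contents` plus a `types.GenerateContentConfig` carrying the system instruction, which matches the google-genai SDK's config parameter. The call shape, isolated as a sketch (client setup, model id, and strings are assumptions for illustration):

    from google import genai
    from google.genai import types

    client = genai.Client()  # assumes GOOGLE_API_KEY is set in the environment
    for chunk in client.models.generate_content_stream(
        model="gemini-2.0-flash",  # illustrative model id
        contents="user question here",
        config=types.GenerateContentConfig(
            system_instruction="system identity text",
            temperature=0.3,
            max_output_tokens=1024,
        ),
    ):
        print(chunk.text, end="")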
@@ -866,18 +880,15 @@ class SyntaxMUI:
         if not self._chat_profile:
             chat_profile = _prof.get_profile("chat") or _prof.get_profile("admin")
             if not chat_profile:
-                return
-
-
-                <p style='color:red;'>Error: Chat profile is not configured. Add a chat profile inside the admin panel or contact your administrator.
-                </p>
-                </div>
-                """
-                )
+                return """<p style='color:red;'>Error: Chat profile is not configured. Add a chat profile inside the admin panel or contact your administrator.</p>
+                """
+                return
 
             self._chat_profile = chat_profile
             self._chat_profile['client'] = _prof.get_client(chat_profile)
-
+        _provider = self._chat_profile['provider']
+        _client = self._chat_profile['client']
+        _model = self._chat_profile['model']
         _contents = f"""
         {self.smxai_instructions}\n\n
         Question: {query}\n
@@ -885,19 +896,31 @@ class SyntaxMUI:
         History: {history}\n\n
         Use conversation continuity if available.
         """
-
-
-
-
+
+        openai_sdk_prompt = [
+            {"role": "system", "content": self.smxai_identity},
+            {"role": "user", "content": f"""{self.smxai_instructions}\n\n
+            Generate response to this query: {query}\n
+            based on this context:\n{context}\n
+            and history:\n{history}\n\n
+            Use conversation continuity if available.)
+            """
+            },
+        ]
 
         def google_process_query():
             try:
                 response = _client.models.generate_content(
                     model=_model,
-                    contents=
+                    contents=_contents,
+                    config=types.GenerateContentConfig(
+                        system_instruction=self.smxai_identity,
+                        temperature=0.3,
+                        max_output_tokens=1024,
+                    ),
                 )
                 answer = response.text
-
+
                 # answer = strip_html(answer)
                 return answer
             except Exception as e:
@@ -945,11 +968,12 @@ class SyntaxMUI:
                 response = _client.messages.create(
                     model=_model,
                     max_tokens=1024,
-                    system=self.smxai_identity,
+                    system=self.self.smxai_identity,
                     messages=[{"role": "user", "content":_contents}],
                     stream=False,
                 )
-                return response.content[0].text.strip()
+                return response.content[0].text.strip()
+
             except Exception as e:
                 return f"Error: {str(e)}"
 
@@ -958,16 +982,7 @@ class SyntaxMUI:
             try:
                 response = _client.chat.completions.create(
                     model=_model,
-                    messages
-                        {"role": "system", "content": self.smxai_identity},
-                        {"role": "user", "content": f"""{self.smxai_instructions}\n\n
-                        Generate response to this query: {query}\n
-                        based on this context:\n{context}\n
-                        and history:\n{history}\n\n
-                        Use conversation continuity if available.
-                        """
-                        },
-                    ],
+                    messages=openai_sdk_prompt,
                     stream=False,
                 )
 
@@ -979,278 +994,374 @@ class SyntaxMUI:
 
         if _provider == "google":
             return google_process_query()
-        if _provider == "openai" and _model in self.
+        if _provider == "openai" and _model in self.get_gpt_models_latest():
             return gpt_models_latest_process_query(self._gpt_models_latest_prev_resp_ids.get(self.get_session_id()))
         if _provider == "anthropic":
             return anthropic_process_query()
         return openai_sdk_process_query()
 
-
+
+    def repair_python_cell(self, py_code: str) -> str:
+
+        _CELL_REPAIR_RULES = """
+        Fix the Python cell to satisfy:
+        - Single valid cell; imports at the top.
+        - Do not import or invoke or use 'python-dotenv' or 'dotenv' because it's not needed.
+        - No top-level statements between if/elif/else branches.
+        - Regression must use either sklearn with train_test_split (then X_test exists) and R^2/MAE/RMSE,
+          or statsmodels OLS. No accuracy_score in regression.
+        - Keep all plotting + savefig + BytesIO + display inside the branch that created the figure.
+        - Return ONLY the corrected cell.
+        """
+        code = textwrap.dedent(py_code or "").strip()
+        needs_fix = False
+        if re.search(r"\baccuracy_score\b", code) and re.search(r"\bLinearRegression\b|\bOLS\b", code):
+            needs_fix = True
+        if re.search(r"\bX_test\b", code) and not re.search(r"\bX_test\s*=", code):
+            needs_fix = True
+        try:
+            ast.parse(code)
+        except SyntaxError:
+            needs_fix = True
+        if not needs_fix:
+            return code
+        _prompt = f"```python\n{code}\n```"
+
+        repair_profile = _prof.get_profile("vision2text") or _prof.get_profile("admin")
+        if not repair_profile:
+            return (
+                '<div class="smx-alert smx-alert-warn">'
+                'No LLM profile configured for <code>coding</code> (or <code>admin</code>). <br>'
+                'Please, add the LLM profile inside the admin panel or contact your Administrator.'
+                '</div>'
+            )
+
+        _client = _prof.get_client(repair_profile)
+        _provider = repair_profile['provider'].lower()
+        _model = repair_profile['model']
+
+        #1 Google
+        if _provider == "google":
+            from google.genai import types
+
+            fixed = _client.models.generate_content(
+                model=_model,
+                contents=_prompt,
+                config=types.GenerateContentConfig(
+                    system_instruction=_CELL_REPAIR_RULES,
+                    temperature=0.8,
+                    max_output_tokens=1024,
+                ),
+            )
+
+        #2 Openai
+        elif _provider == "openai" and _model in GPT_MODELS_LATEST:
+
+            args = set_args(
+                model=_model,
+                instructions=_CELL_REPAIR_RULES,
+                input=[{"role": "user", "content": _prompt}],
+                previous_id=None,
+                store=False,
+                reasoning_effort="medium",
+                verbosity="medium",
+            )
+            fixed = _out(_client.responses.create(**args))
 
-
-
-        if not coding_profile:
-            # tell the user exactly what to configure
-            return (
-                '<div class="smx-alert smx-alert-warn">'
-                'No LLM profile configured for <code>coding</code> (or <code>admin</code>). '
-                'Please, contact your Administrator.'
-                '</div>'
-            )
+        # Anthropic
+        elif _provider == "anthropic":
 
-
-
+            fixed = _client.messages.create(
+                model=_model,
+                max_tokens=1024,
+                system=_CELL_REPAIR_RULES,
+                messages=[{"role": "user", "content":_prompt}],
+                stream=False,
+            )
+
+        # OpenAI SDK
+        else:
+            fixed = _client.chat.completions.create(
+                model=_model,
+                messages=[
+                    {"role": "system", "content":_CELL_REPAIR_RULES},
+                    {"role": "user", "content":_prompt},
+                ],
+                max_tokens=1024,
+            )
+
+        try:
+            ast.parse(fixed);
+            return fixed
+        except Exception:
+            return code
 
-
-
-        _model = self._coding_profile['model']
+    def get_last_llm_usage(self):
+        return getattr(self, "_last_llm_usage", None)
 
-
-        ALLOWED_COLUMNS = list(df.columns)
+    def ai_generate_code(self, refined_question, tasks, df):
 
-
-
-
-
+        def normalise_llm_code(s: str) -> str:
+            s = s.replace("\t", " ")
+            s = textwrap.dedent(s)
+            lines = s.splitlines()
 
-
-
-
-
-
-
-
-
-
-
-
-        2) Import everything you use explicitly. Assume: pandas≥2, numpy≥1.25, matplotlib≥3.8, seaborn≥0.13, scikit-learn≥1.4 are available.
-        3) **Avoid deprecated / removed APIs**, e.g.:
-           - pandas: do not use `.append`, `.ix`, `.as_matrix`, `DataFrame.select_dtypes(include='category')` is OK, but prefer current patterns.
-           - seaborn: do not use `distplot`, `pairplot` on very large data without sampling; prefer `histplot`, `displot`, `regplot`, or FacetGrid with `.map_dataframe`.
-           - scikit-learn: import from `sklearn.model_selection` (not `sklearn.cross_validation`); for confusion matrices use `ConfusionMatrixDisplay.from_estimator`; set `random_state=42` where relevant.
-        4) Be **defensive**:
-           - Verify required columns exist; if any are missing, raise `ValueError("Missing columns: ...")` early.
-           - Handle missing values sensibly (e.g., drop rows for simple EDA; use `ColumnTransformer` + `SimpleImputer` for modeling).
-           - For categorical features in ML, use `OneHotEncoder(handle_unknown="ignore")` inside a `Pipeline`/`ColumnTransformer` (no `LabelEncoder` on features).
-        5) Keep it **fast** (kernel timeout ~8s):
-           - For plots on large frames (>20k rows), downsample to ~1,000 rows (`df.sample(1000, random_state=42)`) unless aggregation is more appropriate.
-           - Prefer vectorized ops; avoid O(n²) Python loops.
-        6) Always **produce at least one visible result** at the end:
-           - If plotting with matplotlib/seaborn: call `plt.tight_layout(); plt.show()`.
-           - If producing a table or metrics: from `syntaxmatrix.display import show` then `show(object_or_dataframe)`.
-        7) Follow task type conventions:
-           - **EDA/Stats**: compute the requested stat, then show a relevant table (e.g., summary/crosstab) or plot.
-           - **Classification**: train/valid split (`train_test_split`), build a pipeline with scaling/encoding as needed, fit, show accuracy **and** a confusion matrix via `ConfusionMatrixDisplay.from_estimator(...); plt.show()`. Also show `classification_report` as a dataframe if short.
-           - **Regression**: train/valid split, pipeline as needed, fit, show R² and MAE; plot predicted vs actual scatter.
-           - **Correlation/Chi-square/ANOVA**: compute the statistic + p-value and show a concise result table (with `show(...)`) and, when sensible, a small plot (heatmap/bar).
-        8) Don't mutate or recreate target columns if they already exist (e.g., if asked to “predict TARGET”, use `y = df['TARGET']` as-is).
-        9) Keep variable names short and clear; prefer `num_cols` / `cat_cols` discovery by dtype.
-        10) You MUST NOT reference any column outside Allowed columns: {ALLOWED_COLUMNS}\n.
-        11) If asked to predict/classify, choose the target by matching the task text to Allowed columns: {ALLOWED_COLUMNS}\n and never invent a new name (e.g., 'whether', 'the').
-        </Hard requirements>
-
-        <Output>
-        Return **only runnable Python** that:
-        - Imports what it needs,
-        - Validates columns,
-        - Solves: {question},
-        - And ends with at least one visible output (`show(...)` and/or `plt.show()`).
-        </Output>
-        """
+            # drop leading blank lines
+            while lines and not lines[0].strip():
+                lines.pop(0)
+
+            # if everything is still indented >=4 spaces, shift left
+            indents = [len(l) - len(l.lstrip(" ")) for l in lines if l.strip()]
+            if indents and min(indents) >= 4:
+                m = min(indents)
+                lines = [l[m:] if len(l) >= m else l for l in lines]
+
+            return "\n".join(lines)
 
-
-
-        # Combine system prompt and instructions for Gemini
-
-        # Gemini expects a simple generate_content call with the model and contents
-        response = _client.models.generate_content(
-            model=_model,
-            contents=f"{ai_profile}\n\n{instructions}"
-        )
-
-        # Extract text from response
-        if hasattr(response, 'text'):
-            return response.text
-        elif hasattr(response, 'candidates') and response.candidates:
-            candidate = response.candidates[0]
-            if hasattr(candidate.content, 'parts'):
-                return ''.join(part.text for part in candidate.content.parts if hasattr(part, 'text'))
-        return str(response)
-        except Exception as e:
-            return f"Error!"
-
-        # except Exception as e:
-        # return """
-        # import pandas as pd
-        # import matplotlib.pyplot as plt
-        # import seaborn as sns
-        # import numpy as np
-        # import io
-        # import base64
-        # from syntaxmatrix.display import show
-
-        # print("Basic DataFrame Info:")
-        # print(f"Shape: {df.shape}")
-        # print("\\nColumns and dtypes:")
-        # print(df.dtypes)
-        # print("\\nBasic statistics:")
-        # show(df.describe())
-
-        # print("\\nFirst few rows:")
-        # show(df.head())
-
-        # # Generate a simple visualization based on available columns
-        # plt.figure(figsize=(10, 6))
-
-        # if len(df.columns) >= 2:
-        #     # Try to find numeric columns for scatter plot
-        #     numeric_cols = df.select_dtypes(include=['number']).columns
-        #     if len(numeric_cols) >= 2:
-        #         sns.scatterplot(data=df, x=numeric_cols[0], y=numeric_cols[1])
-        #         plt.title(f"Scatter plot: {numeric_cols[0]} vs {numeric_cols[1]}")
-        #         plt.tight_layout()
-        #         plt.show()
-        #     else:
-        #         # Use first column for bar plot
-        #         top_values = df[df.columns[0]].value_counts().head(10)
-        #         top_values.plot(kind='bar')
-        #         plt.title(f"Top 10 values in {df.columns[0]}")
-        #         plt.tight_layout()
-        #         plt.show()
-        # else:
-        #     # Single column analysis
-        #     if len(df.columns) == 1:
-        #         col_name = df.columns[0]
-        #         if df[col_name].dtype in ['object', 'category']:
-        #             df[col_name].value_counts().head(10).plot(kind='bar')
-        #             plt.title(f"Value counts for {col_name}")
-        #         else:
-        #             df[col_name].hist(bins=20)
-        #             plt.title(f"Distribution of {col_name}")
-        #         plt.tight_layout()
-        #         plt.show()
-        #     else:
-        #         print("Insufficient columns for detailed analysis")
-        #         show(df)
-        # """
-
-        def gpt_models_latest_generate_code(reasoning_effort = "medium", verbosity = "medium"):
-            # verbosities = ["low", "medium", "high"] # default is "low"
-            # reasoning_efforts = ["minimal", "low", "medium", "high"] # default is "medium"
-
-            if _model == "gpt-5-mini":
-                reasoning_effort = "high"
-            elif _model == "gpt-5-high":
-                reasoning_effort = "high"
-                verbosity = "high"
-            try:
-                args = set_args(
-                    model=_model,
-                    instructions=ai_profile,
-                    input=instructions,
-                    reasoning_effort=reasoning_effort,
-                    verbosity=verbosity,
-                )
-
-                resp = _client.responses.create(**args)
-                code = _out(resp)
-                return code
-            except Exception as e:
-                return f"Error!"
+        CONTEXT = f"Columns: {list(df.columns)}\n\nDtypes: {df.dtypes.astype(str).to_dict()}\n\n"
+        AVAILABLE_COLUMNS = list(df.columns)
 
-
+        # --- SMX: normalise tasks coming from intent agent ---
+        if isinstance(tasks, str):
+            import json, ast, re
             try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                tasks_parsed = json.loads(tasks)
+            except Exception:
+                try:
+                    tasks_parsed = ast.literal_eval(tasks)
+                except Exception:
+                    tasks_parsed = re.findall(r"[A-Za-z_]+", tasks)
+            tasks = tasks_parsed
+        if not isinstance(tasks, list):
+            tasks = [str(tasks)]
+        tasks = [str(t).strip().lower() for t in tasks if str(t).strip()]
+
+        ai_profile = """
+        - You are a Python expert specializing in data science and machine learning.
+        - Your task is to generate a single, complete, production-quality, executable Python script for a Jupyter-like Python kernel, based on the given instructions.
+        - The dataset is already loaded as a pandas DataFrame named `df` (no file I/O or file uploads).
+        - Make a copy of `df` and name it `df_copy`. Make sure `df_copy` is preprocessed and cleaned, named `df_cleaned`, if not already done so. Then use `df_cleaned` to perform the ML tasks described in the given context.
+        - Select your features and target, from `df_cleaned`, with care and name it `required_cols`
+        - Create your 'df_filtered by doing: df_filtered = df_cleaned[required_cols].
+        - Use the {TEMPLATE_CATALOGUE} below to educate yourself on which visualizations you will implement in the code.
+        - The final output MUST be the complete, executable Python code only, enclosed in a single markdown code block (```python ... ```), which is required to fulfill the user's request. See the {tasks} below.
+        - Do not include any explanatory text or markdown outside the code block.
+        """
+
+        TEMPLATE_CATALOGUE = """
+        ### Available SyntaxMatrix templates (use these instead of inventing new helpers)
+
+        Visualisation templates (dataset-agnostic):
+        - viz_pie(df, category_col=None, top_k=8): pie/donut shares within a category.
+        - viz_stacked_bar(df, x=None, hue=None, normalise=True): composition across groups.
+        - viz_count_bar(df, category_col=None, top_k=12): counts/denominators by category.
+        - viz_box(df, x=None, y=None): spread/outliers of numeric by category.
+        - viz_scatter(df, x=None, y=None, hue=None): relationship between two numeric vars.
+        - viz_distribution(df, col=None): histogram-style distribution for numeric.
+        - viz_kde(df, col=None): density curve for numeric.
+        - viz_area(df, time_col=None, y_col=None): area/trend over time.
+        - viz_line(df, x=None, y=None, hue=None): line/trend plot.
+
+        ML/stat templates:
+        - classification(df): standard classification pipeline + metrics + plots.
+        - regression(df): standard regression pipeline + metrics + plots.
+        - clustering(df): clustering workflow + cluster plots.
+        - anomaly_detection(df)
+        - ts_anomaly_detection(df)
+        - time_series_forecasting(df)
+        - time_series_classification(df, entity_col, time_col, target_col)
+        - dimensionality_reduction(df)
+        - feature_selection(df)
+        - eda_overview(df)
+        - eda_correlation(df)
+        - multilabel_classification(df, label_cols)
+        - recommendation(df)
+        - topic_modelling(df)
+        """
+
+        instructions = (
+            "### Context"
+            f"- DataFrame - (`df`): {df}"
+            f"- Schema (names → dtypes): {CONTEXT}"
+            f"- Row count: {len(df)}"
+            f"- Task description: {refined_question}"
+            f"- Tasks: {tasks}"
+            f"- Available columns: {AVAILABLE_COLUMNS}"
+            f"- Template catalogue: {TEMPLATE_CATALOGUE}"
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-
+        """
+        ### Template rules
+        - You MAY call a template if it matches the task.
+        - Do NOT invent template names.
+        - If no template fits, write minimal direct pandas/sklearn/seaborn code instead.
+        - Keep the solution short: avoid writing wrappers/utilities already handled by SyntaxMatrix hardener.
+
+        #### Template selection hint examples:
+        - If the task asks for pie/donut/composition shares → use viz_pie.
+        - If it asks for denominators/counts per category → viz_count_bar.
+        - If it asks for spread/outliers/comparison across groups → viz_box.
+        - If it asks for relationship / “X vs Y” → viz_scatter.
+        - If it asks for trend over time → viz_line or viz_area.
+
+        ### Hard requirements
+        1) Code only. No markdown, no comments, no explanations.
+        2) Import everything you use explicitly.
+           - Use pandas/numpy/matplotlib by default.
+           - Seaborn may be unavailable at runtime; **do not import seaborn inside your code**.
+           - If you call sns.*, assume sns is already defined by the framework.
+        3) Avoid deprecated / removed APIs**, e.g.:
+           - pandas: do not use `.append`, `.ix`, `.as_matrix`; prefer current patterns.
+           - seaborn: do not use `distplot`; avoid `pairplot` on very large data unless sampling.
+           - scikit-learn: import from `sklearn.model_selection` (not `sklearn.cross_validation`);
+             set `random_state=42` where relevant.
+        4) Be defensive, but avoid hard-failing on optional fields:
+           - If the primary column, needed to answer the question, is missing, review your copy of the `df` again.
+             Make sure that you selected the proper column.
+             Never use a column/variable which isn't available or defined.
+           - If a secondary/extra column is missing, show a warning with `show(...)` and continue using available fields.
+           - Handle missing values sensibly (drop rows for simple EDA; use `ColumnTransformer` + `SimpleImputer` for modelling).
+           - For categorical features in ML, use `OneHotEncoder(handle_unknown="ignore")`
+             inside a `Pipeline`/`ColumnTransformer` (no `LabelEncoder` on features).
+        5) Keep it fast (kernel timeout ~8s):
+           - For plots on large frames (>20k rows), downsample to ~1,000 rows
+             (`df.sample(1000, random_state=42)`) unless aggregation is more appropriate.
+           - Prefer vectorised ops; avoid O(n²) Python loops.
+        6) Keep the solution compact:
+           - Do not define large helper libraries or long “required column” sets.
+           - Aim for ≤120 lines excluding imports.
+        7) Always produce at least one visible result at the end:
+           - If plotting with matplotlib/seaborn: call `plt.tight_layout(); plt.show()`.
+           - If producing a table or metrics:
+             `from syntaxmatrix.display import show` then `show(object_or_dataframe)`.
+        8) Follow task type conventions:
+           - **EDA/Stats**: compute the requested stat, then show a relevant table
+             (e.g., summary/crosstab) or plot.
+           - **Classification**: train/valid split (`train_test_split`), pipeline with scaling/encoding,
+             fit, show accuracy and a confusion matrix via
+             `ConfusionMatrixDisplay.from_estimator(...); plt.show()`.
+             Also show `classification_report` as a dataframe if short.
+           - **Regression**: train/valid split, pipeline as needed, fit, show R² and MAE;
+             plot predicted vs actual scatter.
+           - **Correlation/Chi-square/ANOVA**: compute the statistic + p-value and show a concise
+             result table (with `show(...)`) and, when sensible, a small plot (heatmap/bar).
+        9) Don't mutate or recreate target columns if they already exist.
+        10) Keep variable names short and clear; prefer `num_cols` / `cat_cols` discovery by dtype.
+        11) You MUST NOT reference any column outside Available columns: {AVAILABLE_COLUMNS}.
+        12) If asked to predict/classify, choose the target by matching the task text to Allowed columns
+            and never invent a new name.
 
+        #### Cohort rules
+        When you generate plots for cohorts or categories, you MUST obey these rules:
+        1) ALWAYS guard cohort masks:
+           - After you define something like:
+             _mask_a = (df['BMI'] < 18.5) & df['BMI'].notna()
+             _mask_b = ~(df['BMI'] < 18.5) & df['BMI'].notna()
+             compute their sizes:
+             n_a = int(_mask_a.sum())
+             n_b = int(_mask_b.sum())
+           - If a mask has no rows (or almost none), do NOT draw an empty plot.
+             Instead call:
+             show(f"Skipping cohort '{label}': no rows after filtering.")
+             and return.
+
+        2) Before any groupby / crosstab for a plot:
+           - Fill missing categories so groupby does not drop everything:
+             df[col] = df[col].fillna("Unknown")
+           - After building the table:
+             tab = tmp.groupby([...]).size().unstack(...).fillna(0)
+             ALWAYS check:
+             if tab.empty:
+                 show(f"Skipping plot for {col}: no data after grouping.")
+                 continue
+             Only call .plot(...) if the table is non-empty.
+
+        3) For value_counts-based plots:
+           - If the Series is empty after filtering (len(s) == 0),
+             do NOT draw a figure. Just call:
+             show(f"No data available to plot for {col} in this cohort.")
+             and skip.
+
+        4) Never try to “hide” an error with a blank plot.
+           A blank chart is treated as a bug. If there is no data, explain it
+           clearly using show(...), and avoid calling matplotlib/Seaborn.
+
+        5) Never use print(...). All user-visible diagnostics go through show(...).
 
-
-
-
-
-
-
-
-
-
-
+
+        ### Output
+        Return only runnable Python that:
+        - Imports what it needs,
+        - Validates columns,
+        - Visualize tables, charts, and graphs, each with appropriate caption.
+        - Solution: {tasks} to solve {refined_question},
+        - And ends with at least 3 visible output (`show(...)` and/or `plt.show()`).
+        """)
+
+        if not self._coding_profile:
+            coding_profile = _prof.get_profile("coding") or _prof.get_profile("admin")
+            if not coding_profile:
+                return (
+                    '<div class="smx-alert smx-alert-warn">'
+                    'No LLM profile configured for <code>coding</code> (or <code>admin</code>). <br>'
+                    'Please, add the LLM profile inside the admin panel or contact your Administrator.'
+                    '</div>'
                 )
-            return response.choices[0].message.content
-        except Exception as e:
-            return "Error!"
 
-
-
-
-
-
-
-
-
-
+            self._coding_profile = coding_profile
+            self._coding_profile['client'] = _prof.get_client(coding_profile)
+
+        # code = mlearning_agent(instructions, ai_profile, self._coding_profile)
+        code, usage = mlearning_agent(instructions, ai_profile, self._coding_profile)
+        self._last_llm_usage = usage
+
+
         if code:
+            import re
+            code = normalise_llm_code(code)
+
             m = re.search(r"```(?:python)?\s*(.*?)\s*```", code, re.DOTALL | re.IGNORECASE)
             if m:
                 code = m.group(1).strip()
-            code = drop_bad_classification_metrics(code, df)
 
             if "import io" not in code and "io.BytesIO" in code:
                 lines = code.split('\n')
                 import_lines = []
                 other_lines = []
-
+
                 for line in lines:
                     if line.strip().startswith('import ') or line.strip().startswith('from '):
                         import_lines.append(line)
                     else:
                         other_lines.append(line)
-
-                # Add missing io import
+
                 if "import io" not in '\n'.join(import_lines):
                     import_lines.append('import io')
-
+
                 code = '\n'.join(import_lines + [''] + other_lines)
-
+
+            TEMPLATE_NAMES = [
+                "viz_pie","viz_stacked_bar","viz_count_bar","viz_box","viz_scatter",
+                "viz_distribution","viz_kde","viz_area","viz_line",
+                "classification","regression","clustering","anomaly_detection",
+                "ts_anomaly_detection","time_series_forecasting","time_series_classification",
+                "dimensionality_reduction","feature_selection","eda_overview","eda_correlation",
+                "multilabel_classification","recommendation","topic_modelling"
+            ]
+
+            used = [t for t in TEMPLATE_NAMES if re.search(rf"\\b{t}\\s*\\(", code)]
+            if used:
+                import_line = (
+                    "from syntaxmatrix.agentic.model_templates import " +
+                    ", ".join(sorted(set(used)))
+                )
+                if import_line not in code:
+                    code = import_line + "\n" + code
+
             return code.strip()
-
+
+        return "Error: AI code generation failed."
+
+
     def sanitize_rough_to_markdown_task(self, rough: str) -> str:
         """
        Return only the Task text (no tags).
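
Note: the new `repair_python_cell` only calls an LLM when a cheap static check trips: a classification metric paired with a regression estimator, a used-but-never-assigned `X_test`, or a cell that fails `ast.parse`. The gate, isolated as a standalone helper that mirrors the added code:

    import ast, re, textwrap

    def needs_repair(py_code: str) -> bool:
        code = textwrap.dedent(py_code or "").strip()
        if re.search(r"\baccuracy_score\b", code) and re.search(r"\bLinearRegression\b|\bOLS\b", code):
            return True  # classification metric applied to a regression model
        if re.search(r"\bX_test\b", code) and not re.search(r"\bX_test\s*=", code):
            return True  # X_test referenced but never assigned
        try:
            ast.parse(code)
        except SyntaxError:
            return True  # not a syntactically valid cell
        return False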
|