syntaxmatrix-2.3.5-py3-none-any.whl → syntaxmatrix-2.5.5.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
syntaxmatrix/core.py CHANGED
@@ -1,7 +1,10 @@
  from __future__ import annotations
+ import ast
+ import textwrap
  import os, webbrowser, uuid, secrets, re

  from flask import Flask, Response, session, request, has_request_context
+ from syntaxmatrix.agentic.agents import mlearning_agent
  from syntaxmatrix.history_store import SQLHistoryStore as Store, PersistentHistoryStore as _Store
  from collections import OrderedDict
  from syntaxmatrix.llm_store import save_embed_model, load_embed_model, delete_embed_key
@@ -17,7 +20,6 @@ from syntaxmatrix.settings.prompts import SMXAI_CHAT_ID, SMXAI_CHAT_INSTRUCTIONS
  from typing import List, Generator
  from .auth import init_auth_db
  from . import profiles as _prof
- from syntaxmatrix.utils import strip_describe_slice, drop_bad_classification_metrics
  from syntaxmatrix.smiv import SMIV
  from .project_root import detect_project_root
  from syntaxmatrix.gpt_models_latest import extract_output_text as _out, set_args
@@ -25,6 +27,8 @@ from dotenv import load_dotenv
  from html import unescape
  from .plottings import render_plotly, pyplot, describe_plotly, describe_matplotlib
  from threading import RLock
+ from syntaxmatrix.settings.model_map import GPT_MODELS_LATEST
+

  # ──────── framework‐local storage paths ────────
  # this ensures the key & data always live under the package dir,
@@ -46,10 +50,10 @@ EDA_OUTPUT = {} # global buffer for EDA output by session

  class SyntaxMUI:
  def __init__(self,
- host="127.0.0.1",
+ host="127.0.0.1",
  port="5080",
  user_icon="👩🏿‍🦲",
- bot_icon="<img src='/static/icons/logo.png' width='15' alt='bot'/>",
+ bot_icon="<img src='/static/icons/favicon.png' width='20' alt='bot'/>",
  favicon="/static/icons/favicon.png",
  site_logo="<img src='/static/icons/logo.png' width='30' alt='logo'/>",
  site_title="SyntaxMatrix",
@@ -71,12 +75,12 @@ class SyntaxMUI:
  self.ui_mode = ui_mode
  self.theme_toggle_enabled = False
  self.user_files_enabled = False
- self.ai_chat_id = SMXAI_CHAT_ID
- self.ai_chat_instructions = SMXAI_CHAT_INSTRUCTIONS
+ self.smxai_identity = SMXAI_CHAT_ID
+ self.smxai_instructions = SMXAI_CHAT_INSTRUCTIONS
  self.website_description = SMXAI_WEBSITE_DESCRIPTION
  self._eda_output = {} # {chat_id: html}
  self._eda_lock = RLock()
-
+
  db.init_db()
  self.page = ""
  self.pages = db.get_pages()
@@ -88,8 +92,10 @@ class SyntaxMUI:
  self.app_token = str(uuid.uuid4()) # NEW: Unique token for each app launch.
  self.admin_pdf_chunks = {} # In-memory store for admin PDF chunks
  self.user_file_chunks = {} # In-memory store of user‑uploaded chunks, scoped per chat session
- routes.setup_routes(self)

+ self._last_llm_usage = None
+ routes.setup_routes(self)
+
  self._admin_profile = {}
  self._chat_profile = {}
  self._coding_profile = {}
@@ -561,8 +567,8 @@ class SyntaxMUI:
  def delete_embed_key(self):
  return delete_embed_key()

- def gpt_models_latest(self):
- from syntaxmatrix.settings.model_map import GPT_MODELS_LATEST
+
+ def get_gpt_models_latest(self):
  return GPT_MODELS_LATEST

  def get_text_input_value(self, key, default=""):
@@ -678,7 +684,7 @@ class SyntaxMUI:
  if _provider == "google":
  intent = google_classify_query()
  return intent
- if _model in self.gpt_models_latest():
+ if _model in self.get_gpt_models_latest():
  intent = gpt_models_latest_classify_query()
  return intent
  if _provider == "anthropic":
@@ -721,14 +727,14 @@ class SyntaxMUI:
  except Exception as e:
  return f"Summary agent error!"

- def gpt_models_latest_generated_title(reasoning_effort = "minimal", verbosity = "low"):
+ def gpt_models_latest_generated_title():
  try:
  args = set_args(
  model=_model,
  instructions=_title_profile,
  input=_instructions,
- reasoning_effort=reasoning_effort,
- verbosity=verbosity,
+ # reasoning_effort=reasoning_effort,
+ # verbosity=verbosity,
  )

  resp = _client.responses.create(**args)
@@ -740,7 +746,7 @@ class SyntaxMUI:
  try:
  response = _client.messages.create(
  model=_model,
- max_tokens=1024,
+ max_tokens=50,
  system=_title_profile,
  messages=[{"role": "user", "content":_instructions}],
  stream=False,
@@ -754,12 +760,11 @@ class SyntaxMUI:
  { "role": "system", "content": _title_profile },
  { "role": "user", "content": _instructions },
  ]
-
  try:
  response = _client.chat.completions.create(
  model=_model,
  messages=prompt,
- temperature=0,
+ temperature=0.3,
  max_tokens=50
  )
  title = response.choices[0].message.content.strip().lower()
@@ -769,7 +774,7 @@ class SyntaxMUI:

  if _provider == "google":
  title = google_generated_title()
- elif _model in self.gpt_models_latest():
+ elif _model in self.get_gpt_models_latest():
  title = gpt_models_latest_generated_title()
  elif _provider == "anthropic":
  title = anthropic_generated_title()
@@ -792,7 +797,7 @@ class SyntaxMUI:
  if not chat_profile:
  yield """<p style='color:red;'>Error: Chat profile is not configured. Add a chat profile inside the admin panel or contact your administrator.</p>
  """
- return
+ return None
  self._chat_profile = chat_profile
  self._chat_profile['client'] = _prof.get_client(chat_profile)

@@ -801,7 +806,7 @@ class SyntaxMUI:
  _model = self._chat_profile['model']

  _contents = f"""
- {self.ai_chat_instructions}\n\n
+ {self.smxai_instructions}\n\n
  Question: {query}\n
  Context: {context}\n\n
  History: {history}\n\n
@@ -809,32 +814,30 @@ class SyntaxMUI:
  """

  try:
- if _provider == "google": # Google, non openai skd series
- contents = [
- types.Content(
- role="user",
- parts=[
- types.Part.from_text(text=f"{self.ai_chat_id}\n\n{_contents}"),
- ],
- ),
- ]
-
+ if _provider == "google": # Google (non-OpenAI-SDK) series
+
  for chunk in _client.models.generate_content_stream(
  model=_model,
- contents=contents,
+ contents=_contents,
+ config=types.GenerateContentConfig(
+ system_instruction=self.smxai_identity,
+ temperature=0.3,
+ max_output_tokens=1024,
+ ),
  ):
+
  yield chunk.text

- elif _provider == "openai" and _model in self.gpt_models_latest(): # GPt 5 series
+ elif _provider == "openai" and _model in self.get_gpt_models_latest(): # GPT-5 series
  input_prompt = (
- f"{self.ai_chat_instructions}\n\n"
+ f"{self.smxai_instructions}\n\n"
  f"Generate a response to this query:\n{query}\n"
  f"based on this given context:\n{context}\n\n"
  f"(Use conversation continuity if available.)"
  )
  sid = self.get_session_id()
  prev_id = self._gpt_models_latest_prev_resp_ids.get(sid)
- args = set_args(model=_model, instructions=self.ai_chat_id, input=input_prompt, previous_id=prev_id, store=True)
+ args = set_args(model=_model, instructions=self.smxai_identity, input=input_prompt, previous_id=prev_id, store=True)

  with _client.responses.stream(**args) as s:
  for event in s:
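
For context on the new Google branch above: in the google-genai SDK the system prompt and sampling settings travel in a `GenerateContentConfig`, while the user prompt goes in `contents`. A minimal standalone sketch of the same streaming pattern (the model name and prompt strings here are illustrative placeholders, not values from this codebase):

```python
# Sketch of the generate_content_stream pattern used above (google-genai SDK).
from google import genai
from google.genai import types

client = genai.Client()  # picks up the API key from the environment

for chunk in client.models.generate_content_stream(
    model="gemini-2.0-flash",                  # placeholder model id
    contents="Summarise the uploaded table.",  # user prompt
    config=types.GenerateContentConfig(
        system_instruction="You are a helpful data analyst.",
        temperature=0.3,
        max_output_tokens=1024,
    ),
):
    print(chunk.text, end="")  # each chunk carries an incremental text delta
```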
@@ -849,7 +852,7 @@ class SyntaxMUI:
  elif _provider == "anthropic":
  with _client.messages.stream(
  max_tokens=1024,
- messages=[{"role": "user", "content":f"{self.ai_chat_id}\n\n {_contents}"},],
+ messages=[{"role": "user", "content":f"{self.smxai_identity}\n\n {_contents}"},],
  model=_model,
  ) as stream:
  for text in stream.text_stream:
@@ -857,8 +860,8 @@ class SyntaxMUI:

  else: # Assumes standard openai_sdk
  openai_sdk_prompt = [
- {"role": "system", "content": self.ai_chat_id},
- {"role": "user", "content": f"{self.ai_chat_instructions}\n\nGenerate response to this query: {query}\nbased on this context:\n{context}\nand history:\n{history}\n\nUse conversation continuity if available.)"},
+ {"role": "system", "content": self.smxai_identity},
+ {"role": "user", "content": f"{self.smxai_instructions}\n\nGenerate response to this query: {query}\nbased on this context:\n{context}\nand history:\n{history}\n\n(Use conversation continuity if available.)"},
  ]
  response = _client.chat.completions.create(
  model=_model,
@@ -883,9 +886,11 @@ class SyntaxMUI:

  self._chat_profile = chat_profile
  self._chat_profile['client'] = _prof.get_client(chat_profile)
-
+ _provider = self._chat_profile['provider']
+ _client = self._chat_profile['client']
+ _model = self._chat_profile['model']
  _contents = f"""
- {self.ai_chat_instructions}\n\n
+ {self.smxai_instructions}\n\n
  Question: {query}\n
  Context: {context}\n\n
  History: {history}\n\n
@@ -893,8 +898,8 @@ class SyntaxMUI:
  """

  openai_sdk_prompt = [
- {"role": "system", "content": self.ai_chat_id},
- {"role": "user", "content": f"""{self.ai_chat_instructions}\n\n
+ {"role": "system", "content": self.smxai_identity},
+ {"role": "user", "content": f"""{self.smxai_instructions}\n\n
  Generate response to this query: {query}\n
  based on this context:\n{context}\n
  and history:\n{history}\n\n
@@ -903,18 +908,19 @@ class SyntaxMUI:
  },
  ]

- _provider = self._chat_profile['provider']
- _client = self._chat_profile['client']
- _model = self._chat_profile['model']
-
  def google_process_query():
  try:
  response = _client.models.generate_content(
  model=_model,
- contents=f"{self.ai_chat_id}\n\n{_contents}"
+ contents=_contents,
+ config=types.GenerateContentConfig(
+ system_instruction=self.smxai_identity,
+ temperature=0.3,
+ max_output_tokens=1024,
+ ),
  )
  answer = response.text
-
+
  # answer = strip_html(answer)
  return answer
  except Exception as e:
@@ -926,7 +932,7 @@ class SyntaxMUI:
  """
  # Prepare the prompt with conversation history and context
  input = (
- f"{self.ai_chat_instructions}\n\n"
+ f"{self.smxai_instructions}\n\n"
  f"Generate a response to this query:\n{query}\n"
  f"based on this given context:\n{context}\n\n"
  f"(Use conversation continuity if available.)"
@@ -937,7 +943,7 @@ class SyntaxMUI:

  args = set_args(
  model=_model,
- instructions=self.ai_chat_id,
+ instructions=self.smxai_identity,
  input=input,
  previous_id=prev_id,
  store=True,
@@ -962,7 +968,7 @@ class SyntaxMUI:
  response = _client.messages.create(
  model=_model,
  max_tokens=1024,
- system=self.ai_chat_id,
+ system=self.smxai_identity,
  messages=[{"role": "user", "content":_contents}],
  stream=False,
  )
@@ -977,7 +983,7 @@ class SyntaxMUI:
  response = _client.chat.completions.create(
  model=_model,
  messages=openai_sdk_prompt,
- stream=False,
+ stream=False,
  )

  # -------- one-shot buffered --------
@@ -988,239 +994,373 @@ class SyntaxMUI:

  if _provider == "google":
  return google_process_query()
- if _provider == "openai" and _model in self.gpt_models_latest():
+ if _provider == "openai" and _model in self.get_gpt_models_latest():
  return gpt_models_latest_process_query(self._gpt_models_latest_prev_resp_ids.get(self.get_session_id()))
  if _provider == "anthropic":
  return anthropic_process_query()
  return openai_sdk_process_query()


- def ai_generate_code(self, question, intent, df):
+ def repair_python_cell(self, py_code: str) -> str:
+
+ _CELL_REPAIR_RULES = """
+ Fix the Python cell to satisfy:
+ - Single valid cell; imports at the top.
+ - Do not import or use 'python-dotenv'/'dotenv'; it is not needed.
+ - No top-level statements between if/elif/else branches.
+ - Regression must use either sklearn with train_test_split (then X_test exists) and R^2/MAE/RMSE,
+ or statsmodels OLS. No accuracy_score in regression.
+ - Keep all plotting + savefig + BytesIO + display inside the branch that created the figure.
+ - Return ONLY the corrected cell.
+ """
+ code = textwrap.dedent(py_code or "").strip()
+ needs_fix = False
+ if re.search(r"\baccuracy_score\b", code) and re.search(r"\bLinearRegression\b|\bOLS\b", code):
+ needs_fix = True
+ if re.search(r"\bX_test\b", code) and not re.search(r"\bX_test\s*=", code):
+ needs_fix = True
+ try:
+ ast.parse(code)
+ except SyntaxError:
+ needs_fix = True
+ if not needs_fix:
+ return code
+ _prompt = f"```python\n{code}\n```"
+
+ repair_profile = _prof.get_profile("vision2text") or _prof.get_profile("admin")
+ if not repair_profile:
+ return (
+ '<div class="smx-alert smx-alert-warn">'
+ 'No LLM profile configured for <code>coding</code> (or <code>admin</code>). <br>'
+ 'Please add the LLM profile inside the admin panel or contact your Administrator.'
+ '</div>'
+ )
+
+ _client = _prof.get_client(repair_profile)
+ _provider = repair_profile['provider'].lower()
+ _model = repair_profile['model']
+
+ #1 Google
+ if _provider == "google":
+ from google.genai import types
+
+ fixed = _client.models.generate_content(
+ model=_model,
+ contents=_prompt,
+ config=types.GenerateContentConfig(
+ system_instruction=_CELL_REPAIR_RULES,
+ temperature=0.8,
+ max_output_tokens=1024,
+ ),
+ ).text
+
+ #2 OpenAI
+ elif _provider == "openai" and _model in GPT_MODELS_LATEST:
+
+ args = set_args(
+ model=_model,
+ instructions=_CELL_REPAIR_RULES,
+ input=[{"role": "user", "content": _prompt}],
+ previous_id=None,
+ store=False,
+ reasoning_effort="medium",
+ verbosity="medium",
+ )
+ fixed = _out(_client.responses.create(**args))

- if not self._coding_profile:
- coding_profile = _prof.get_profile("coding") or _prof.get_profile("admin")
- if not coding_profile:
- # tell the user exactly what to configure
- return (
- '<div class="smx-alert smx-alert-warn">'
- 'No LLM profile configured for <code>coding</code> (or <code>admin</code>). '
- 'Please, contact your Administrator.'
- '</div>'
- )
+ # Anthropic
+ elif _provider == "anthropic":

- self._coding_profile = coding_profile
- self._coding_profile['client'] = _prof.get_client(coding_profile)
+ fixed = _client.messages.create(
+ model=_model,
+ max_tokens=1024,
+ system=_CELL_REPAIR_RULES,
+ messages=[{"role": "user", "content":_prompt}],
+ stream=False,
+ ).content[0].text
+
+ # OpenAI SDK
+ else:
+ fixed = _client.chat.completions.create(
+ model=_model,
+ messages=[
+ {"role": "system", "content":_CELL_REPAIR_RULES},
+ {"role": "user", "content":_prompt},
+ ],
+ max_tokens=1024,
+ ).choices[0].message.content
+
+ try:
+ ast.parse(fixed)
+ return fixed
+ except Exception:
+ return code
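
A standalone illustration of the three repair triggers above; the sample cell is invented for demonstration and is never executed:

```python
import ast, re, textwrap

# Hypothetical broken cell: a regression model scored with accuracy_score.
cell = textwrap.dedent("""
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import accuracy_score
    model = LinearRegression().fit(X_train, y_train)
    print(accuracy_score(y_test, model.predict(X_test)))
""").strip()

needs_fix = False
# Trigger 1: accuracy_score applied to a regression estimator.
if re.search(r"\baccuracy_score\b", cell) and re.search(r"\bLinearRegression\b|\bOLS\b", cell):
    needs_fix = True
# Trigger 2: X_test is referenced but never assigned.
if re.search(r"\bX_test\b", cell) and not re.search(r"\bX_test\s*=", cell):
    needs_fix = True
# Trigger 3: the cell does not even parse.
try:
    ast.parse(cell)
except SyntaxError:
    needs_fix = True

print(needs_fix)  # True -> the cell would be sent to the repair LLM
```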

- _client = self._coding_profile['client']
- _provider = self._coding_profile['provider']
- _model = self._coding_profile['model']
+ def get_last_llm_usage(self):
+ return getattr(self, "_last_llm_usage", None)

- context = f"Columns: {list(df.columns)}\n\nDtypes: {df.dtypes.astype(str).to_dict()}\n\n"
- ALLOWED_COLUMNS = list(df.columns)
+ def ai_generate_code(self, refined_question, tasks, df):

- ai_profile = f"""
- You are a senior Python data scientist writing production-quality, **runnable** code for a Jupyter-like kernel. You are given a pandas DataFrame named `df`. Begin ONLY the data already in `df` (no file I/O).
- """
+ def normalise_llm_code(s: str) -> str:
+ s = s.replace("\t", " ")
+ s = textwrap.dedent(s)
+ lines = s.splitlines()

- instructions = f"""
- ### Context
- - Schema (names → dtypes): {context}
- - Row count: {len(df)}
- - Task: {question}
- - Task type: {intent}
- - Allowed columns: {ALLOWED_COLUMNS}
-
- ### Hard requirements
- 1) **Code only**. No markdown, no comments, no explanations.
- 2) Import everything you use explicitly. Assume: pandas≥2, numpy≥1.25, matplotlib≥3.8, seaborn≥0.13, scikit-learn≥1.4 are available.
- 3) **Avoid deprecated / removed APIs**, e.g.:
- - pandas: do not use `.append`, `.ix`, `.as_matrix`, `DataFrame.select_dtypes(include='category')` is OK, but prefer current patterns.
- - seaborn: do not use `distplot`, `pairplot` on very large data without sampling; prefer `histplot`, `displot`, `regplot`, or FacetGrid with `.map_dataframe`.
- - scikit-learn: import from `sklearn.model_selection` (not `sklearn.cross_validation`); for confusion matrices use `ConfusionMatrixDisplay.from_estimator`; set `random_state=42` where relevant.
- 4) Be **defensive**:
- - Verify required columns exist; if any are missing, raise `ValueError("Missing columns: ...")` early.
- - Handle missing values sensibly (e.g., drop rows for simple EDA; use `ColumnTransformer` + `SimpleImputer` for modeling).
- - For categorical features in ML, use `OneHotEncoder(handle_unknown="ignore")` inside a `Pipeline`/`ColumnTransformer` (no `LabelEncoder` on features).
- 5) Keep it **fast** (kernel timeout ~8s):
- - For plots on large frames (>20k rows), downsample to ~1,000 rows (`df.sample(1000, random_state=42)`) unless aggregation is more appropriate.
- - Prefer vectorized ops; avoid O(n²) Python loops.
- 6) Always **produce at least one visible result** at the end:
- - If plotting with matplotlib/seaborn: call `plt.tight_layout(); plt.show()`.
- - If producing a table or metrics: from `syntaxmatrix.display import show` then `show(object_or_dataframe)`.
- 7) Follow task type conventions:
- - **EDA/Stats**: compute the requested stat, then show a relevant table (e.g., summary/crosstab) or plot.
- - **Classification**: train/valid split (`train_test_split`), build a pipeline with scaling/encoding as needed, fit, show accuracy **and** a confusion matrix via `ConfusionMatrixDisplay.from_estimator(...); plt.show()`. Also show `classification_report` as a dataframe if short.
- - **Regression**: train/valid split, pipeline as needed, fit, show R² and MAE; plot predicted vs actual scatter.
- - **Correlation/Chi-square/ANOVA**: compute the statistic + p-value and show a concise result table (with `show(...)`) and, when sensible, a small plot (heatmap/bar).
- 8) Don't mutate or recreate target columns if they already exist (e.g., if asked to “predict TARGET”, use `y = df['TARGET']` as-is).
- 9) Keep variable names short and clear; prefer `num_cols` / `cat_cols` discovery by dtype.
- 10) You MUST NOT reference any column outside Allowed columns: {ALLOWED_COLUMNS}\n.
- 11) If asked to predict/classify, choose the target by matching the task text to Allowed columns: {ALLOWED_COLUMNS}\n and never invent a new name (e.g., 'whether', 'the').
-
- ### Output
- Return **only runnable Python** that:
- - Imports what it needs,
- - Validates columns,
- - Solves: {question},
- - And ends with at least one visible output (`show(...)` and/or `plt.show()`).
- """
+ # drop leading blank lines
+ while lines and not lines[0].strip():
+ lines.pop(0)
+
+ # if everything is still indented >=4 spaces, shift left
+ indents = [len(l) - len(l.lstrip(" ")) for l in lines if l.strip()]
+ if indents and min(indents) >= 4:
+ m = min(indents)
+ lines = [l[m:] if len(l) >= m else l for l in lines]
+
+ return "\n".join(lines)
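
What `normalise_llm_code` buys in practice, shown on an invented LLM reply whose lines all arrive indented:

```python
import textwrap

# Hypothetical LLM output: a leading blank line plus uniform indentation.
raw = "\n    import pandas as pd\n    show(df.head())"

s = textwrap.dedent(raw.replace("\t", "    "))  # tabs -> spaces, strip common indent
lines = s.splitlines()
while lines and not lines[0].strip():           # drop leading blank lines
    lines.pop(0)

print("\n".join(lines))
# import pandas as pd
# show(df.head())
```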

- def google_generate_code():
+ CONTEXT = f"Columns: {list(df.columns)}\n\nDtypes: {df.dtypes.astype(str).to_dict()}\n\n"
+ AVAILABLE_COLUMNS = list(df.columns)
+
+ # --- SMX: normalise tasks coming from intent agent ---
+ if isinstance(tasks, str):
+ import json, ast, re
  try:
- # Combine system prompt and instructions for Gemini
- full_prompt = f"{ai_profile}\n\n{instructions}"
-
- # Gemini expects a simple generate_content call with the model and contents
- response = _client.models.generate_content(
- model=_model,
- contents=full_prompt
- )
-
- # Extract text from response
- if hasattr(response, 'text'):
- return response.text
- elif hasattr(response, 'candidates') and response.candidates:
- candidate = response.candidates[0]
- if hasattr(candidate.content, 'parts'):
- return ''.join(part.text for part in candidate.content.parts if hasattr(part, 'text'))
- return str(response)
-
- except Exception as e:
- print(f"Google Gemini code generation error: {e}")
- # Return a basic analysis code with ALL necessary imports
- return """
- import pandas as pd
- import matplotlib.pyplot as plt
- import seaborn as sns
- import numpy as np
- import io
- import base64
- from syntaxmatrix.display import show
-
- print("Basic DataFrame Info:")
- print(f"Shape: {df.shape}")
- print("\\nColumns and dtypes:")
- print(df.dtypes)
- print("\\nBasic statistics:")
- show(df.describe())
-
- print("\\nFirst few rows:")
- show(df.head())
-
- # Generate a simple visualization based on available columns
- plt.figure(figsize=(10, 6))
-
- if len(df.columns) >= 2:
- # Try to find numeric columns for scatter plot
- numeric_cols = df.select_dtypes(include=['number']).columns
- if len(numeric_cols) >= 2:
- sns.scatterplot(data=df, x=numeric_cols[0], y=numeric_cols[1])
- plt.title(f"Scatter plot: {numeric_cols[0]} vs {numeric_cols[1]}")
- plt.tight_layout()
- plt.show()
- else:
- # Use first column for bar plot
- top_values = df[df.columns[0]].value_counts().head(10)
- top_values.plot(kind='bar')
- plt.title(f"Top 10 values in {df.columns[0]}")
- plt.tight_layout()
- plt.show()
- else:
- # Single column analysis
- if len(df.columns) == 1:
- col_name = df.columns[0]
- if df[col_name].dtype in ['object', 'category']:
- df[col_name].value_counts().head(10).plot(kind='bar')
- plt.title(f"Value counts for {col_name}")
- else:
- df[col_name].hist(bins=20)
- plt.title(f"Distribution of {col_name}")
- plt.tight_layout()
- plt.show()
- else:
- print("Insufficient columns for detailed analysis")
- show(df)
- """
+ tasks_parsed = json.loads(tasks)
+ except Exception:
+ try:
+ tasks_parsed = ast.literal_eval(tasks)
+ except Exception:
+ tasks_parsed = re.findall(r"[A-Za-z_]+", tasks)
+ tasks = tasks_parsed
+ if not isinstance(tasks, list):
+ tasks = [str(tasks)]
+ tasks = [str(t).strip().lower() for t in tasks if str(t).strip()]
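
The fallback chain above (json.loads → ast.literal_eval → regex word extraction) accepts whatever shape the intent agent returns. A small sketch with invented inputs:

```python
import json, ast, re

def normalise_tasks(tasks):
    # Mirror of the fallback chain above, for illustration only.
    if isinstance(tasks, str):
        try:
            tasks = json.loads(tasks)                     # '["EDA", "Regression"]'
        except Exception:
            try:
                tasks = ast.literal_eval(tasks)           # "['eda', 'regression']"
            except Exception:
                tasks = re.findall(r"[A-Za-z_]+", tasks)  # "eda, viz_pie"
    if not isinstance(tasks, list):
        tasks = [str(tasks)]
    return [str(t).strip().lower() for t in tasks if str(t).strip()]

print(normalise_tasks('["EDA", "Regression"]'))  # ['eda', 'regression']
print(normalise_tasks("eda, viz_pie"))           # ['eda', 'viz_pie']
```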
+
+ ai_profile = """
+ - You are a Python expert specializing in data science and machine learning.
+ - Your task is to generate a single, complete, production-quality, executable Python script for a Jupyter-like Python kernel, based on the given instructions.
+ - The dataset is already loaded as a pandas DataFrame named `df` (no file I/O or file uploads).
+ - Make a copy of `df` named `df_copy`. Preprocess and clean it, producing `df_cleaned` (if not already clean). Then use `df_cleaned` to perform the ML tasks described in the given context.
+ - Select your features and target from `df_cleaned` with care, and name the column list `required_cols`.
+ - Create `df_filtered` by doing: df_filtered = df_cleaned[required_cols].
+ - Use the {TEMPLATE_CATALOGUE} below to decide which visualizations to implement in the code.
+ - The final output MUST be only the complete, executable Python code required to fulfill the user's request, enclosed in a single markdown code block (```python ... ```). See the {tasks} below.
+ - Do not include any explanatory text or markdown outside the code block.
+ """

- def gpt_models_latest_generate_code(reasoning_effort = "medium", verbosity = "medium"):
- try:
- args = set_args(
- model=_model,
- instructions=ai_profile,
- input=instructions,
- reasoning_effort=reasoning_effort,
- verbosity=verbosity,
- )
+ TEMPLATE_CATALOGUE = """
+ ### Available SyntaxMatrix templates (use these instead of inventing new helpers)
+
+ Visualisation templates (dataset-agnostic):
+ - viz_pie(df, category_col=None, top_k=8): pie/donut shares within a category.
+ - viz_stacked_bar(df, x=None, hue=None, normalise=True): composition across groups.
+ - viz_count_bar(df, category_col=None, top_k=12): counts/denominators by category.
+ - viz_box(df, x=None, y=None): spread/outliers of numeric by category.
+ - viz_scatter(df, x=None, y=None, hue=None): relationship between two numeric vars.
+ - viz_distribution(df, col=None): histogram-style distribution for numeric.
+ - viz_kde(df, col=None): density curve for numeric.
+ - viz_area(df, time_col=None, y_col=None): area/trend over time.
+ - viz_line(df, x=None, y=None, hue=None): line/trend plot.
+
+ ML/stat templates:
+ - classification(df): standard classification pipeline + metrics + plots.
+ - regression(df): standard regression pipeline + metrics + plots.
+ - clustering(df): clustering workflow + cluster plots.
+ - anomaly_detection(df)
+ - ts_anomaly_detection(df)
+ - time_series_forecasting(df)
+ - time_series_classification(df, entity_col, time_col, target_col)
+ - dimensionality_reduction(df)
+ - feature_selection(df)
+ - eda_overview(df)
+ - eda_correlation(df)
+ - multilabel_classification(df, label_cols)
+ - recommendation(df)
+ - topic_modelling(df)
+ """
+
+ instructions = (
+ "### Context"
+ f"- DataFrame - (`df`): {df}"
+ f"- Schema (names → dtypes): {CONTEXT}"
+ f"- Row count: {len(df)}"
+ f"- Task description: {refined_question}"
+ f"- Tasks: {tasks}"
+ f"- Available columns: {AVAILABLE_COLUMNS}"
+ f"- Template catalogue: {TEMPLATE_CATALOGUE}"

- resp = _client.responses.create(**args)
- code = _out(resp)
- return code
- except Exception as e:
- return f"Error!"
+ """
+ ### Template rules
+ - You MAY call a template if it matches the task.
+ - Do NOT invent template names.
+ - If no template fits, write minimal direct pandas/sklearn/seaborn code instead.
+ - Keep the solution short: avoid writing wrappers/utilities already handled by SyntaxMatrix hardener.
+
+ #### Template selection hint examples:
+ - If the task asks for pie/donut/composition shares → use viz_pie.
+ - If it asks for denominators/counts per category → viz_count_bar.
+ - If it asks for spread/outliers/comparison across groups → viz_box.
+ - If it asks for relationship / “X vs Y” → viz_scatter.
+ - If it asks for trend over time → viz_line or viz_area.
+
+ ### Hard requirements
+ 1) Code only. No markdown, no comments, no explanations.
+ 2) Import everything you use explicitly.
+ - Use pandas/numpy/matplotlib by default.
+ - Seaborn may be unavailable at runtime; **do not import seaborn inside your code**.
+ - If you call sns.*, assume sns is already defined by the framework.
+ 3) Avoid deprecated / removed APIs, e.g.:
+ - pandas: do not use `.append`, `.ix`, `.as_matrix`; prefer current patterns.
+ - seaborn: do not use `distplot`; avoid `pairplot` on very large data unless sampling.
+ - scikit-learn: import from `sklearn.model_selection` (not `sklearn.cross_validation`);
+ set `random_state=42` where relevant.
+ 4) Be defensive, but avoid hard-failing on optional fields:
+ - If the primary column needed to answer the question is missing, review your copy of `df` again.
+ Make sure that you selected the proper column.
+ Never use a column/variable which isn't available or defined.
+ - If a secondary/extra column is missing, show a warning with `show(...)` and continue using available fields.
+ - Handle missing values sensibly (drop rows for simple EDA; use `ColumnTransformer` + `SimpleImputer` for modelling).
+ - For categorical features in ML, use `OneHotEncoder(handle_unknown="ignore")`
+ inside a `Pipeline`/`ColumnTransformer` (no `LabelEncoder` on features).
+ 5) Keep it fast (kernel timeout ~8s):
+ - For plots on large frames (>20k rows), downsample to ~1,000 rows
+ (`df.sample(1000, random_state=42)`) unless aggregation is more appropriate.
+ - Prefer vectorised ops; avoid O(n²) Python loops.
+ 6) Keep the solution compact:
+ - Do not define large helper libraries or long “required column” sets.
+ - Aim for ≤120 lines excluding imports.
+ 7) Always produce at least one visible result at the end:
+ - If plotting with matplotlib/seaborn: call `plt.tight_layout(); plt.show()`.
+ - If producing a table or metrics:
+ `from syntaxmatrix.display import show` then `show(object_or_dataframe)`.
+ 8) Follow task type conventions:
+ - **EDA/Stats**: compute the requested stat, then show a relevant table
+ (e.g., summary/crosstab) or plot.
+ - **Classification**: train/valid split (`train_test_split`), pipeline with scaling/encoding,
+ fit, show accuracy and a confusion matrix via
+ `ConfusionMatrixDisplay.from_estimator(...); plt.show()`.
+ Also show `classification_report` as a dataframe if short.
+ - **Regression**: train/valid split, pipeline as needed, fit, show R² and MAE;
+ plot predicted vs actual scatter.
+ - **Correlation/Chi-square/ANOVA**: compute the statistic + p-value and show a concise
+ result table (with `show(...)`) and, when sensible, a small plot (heatmap/bar).
+ 9) Don't mutate or recreate target columns if they already exist.
+ 10) Keep variable names short and clear; prefer `num_cols` / `cat_cols` discovery by dtype.
+ 11) You MUST NOT reference any column outside Available columns: {AVAILABLE_COLUMNS}.
+ 12) If asked to predict/classify, choose the target by matching the task text to Available columns
+ and never invent a new name.
+
+ #### Cohort rules
+ When you generate plots for cohorts or categories, you MUST obey these rules:
+ 1) ALWAYS guard cohort masks:
+ - After you define something like:
+ _mask_a = (df['BMI'] < 18.5) & df['BMI'].notna()
+ _mask_b = ~(df['BMI'] < 18.5) & df['BMI'].notna()
+ compute their sizes:
+ n_a = int(_mask_a.sum())
+ n_b = int(_mask_b.sum())
+ - If a mask has no rows (or almost none), do NOT draw an empty plot.
+ Instead call:
+ show(f"Skipping cohort '{label}': no rows after filtering.")
+ and return.
+
+ 2) Before any groupby / crosstab for a plot:
+ - Fill missing categories so groupby does not drop everything:
+ df[col] = df[col].fillna("Unknown")
+ - After building the table:
+ tab = tmp.groupby([...]).size().unstack(...).fillna(0)
+ ALWAYS check:
+ if tab.empty:
+ show(f"Skipping plot for {col}: no data after grouping.")
+ continue
+ Only call .plot(...) if the table is non-empty.
+
+ 3) For value_counts-based plots:
+ - If the Series is empty after filtering (len(s) == 0),
+ do NOT draw a figure. Just call:
+ show(f"No data available to plot for {col} in this cohort.")
+ and skip.
+
+ 4) Never try to “hide” an error with a blank plot.
+ A blank chart is treated as a bug. If there is no data, explain it
+ clearly using show(...), and avoid calling matplotlib/Seaborn.
+
+ 5) Never use print(...). All user-visible diagnostics go through show(...).

- def anthropic_generate_code():
- try:
- response = _client.messages.create(
- model=_model,
- max_tokens=1024,
- system=ai_profile,
- messages=[{"role": "user", "content":instructions}],
- stream=False,
+
+ ### Output
+ Return only runnable Python that:
+ - Imports what it needs,
+ - Validates columns,
+ - Visualises tables, charts, and graphs, each with an appropriate caption,
+ - Implements {tasks} to solve {refined_question},
+ - And ends with at least 3 visible outputs (`show(...)` and/or `plt.show()`).
+ """)
+
+ if not self._coding_profile:
+ coding_profile = _prof.get_profile("coding") or _prof.get_profile("admin")
+ if not coding_profile:
+ return (
+ '<div class="smx-alert smx-alert-warn">'
+ 'No LLM profile configured for <code>coding</code> (or <code>admin</code>). <br>'
+ 'Please add the LLM profile inside the admin panel or contact your Administrator.'
+ '</div>'
  )
- return response.content[0].text.strip()
- except Exception as e:
- return f"Error!"

- def openai_sdk_generate_code():
- try:
- response = _client.chat.completions.create(
- model=_model,
- messages=[
- {"role": "system", "content": ai_profile},
- {"role": "user", "content": instructions},
- ],
- temperature=0.3,
- max_tokens=2048,
- )
- return response.choices[0].message.content
- except Exception as e:
- return "Error!"
+ self._coding_profile = coding_profile
+ self._coding_profile['client'] = _prof.get_client(coding_profile)
+
+ # code = mlearning_agent(instructions, ai_profile, self._coding_profile)
+ code, usage = mlearning_agent(instructions, ai_profile, self._coding_profile)
+ self._last_llm_usage = usage

- if _provider == 'google':
- code = google_generate_code()
- elif _provider == "openai" and _model in self.gpt_models_latest():
- code = gpt_models_latest_generate_code()
- elif _provider == "anthropic":
- code = anthropic_generate_code()
- else:
- code = openai_sdk_generate_code()
-
  if code:
+ import re
+ code = normalise_llm_code(code)
+
  m = re.search(r"```(?:python)?\s*(.*?)\s*```", code, re.DOTALL | re.IGNORECASE)
  if m:
  code = m.group(1).strip()
- code = drop_bad_classification_metrics(code, df)

  if "import io" not in code and "io.BytesIO" in code:
  lines = code.split('\n')
  import_lines = []
  other_lines = []
-
+
  for line in lines:
  if line.strip().startswith('import ') or line.strip().startswith('from '):
  import_lines.append(line)
  else:
  other_lines.append(line)
-
- # Add missing io import
+
  if "import io" not in '\n'.join(import_lines):
  import_lines.append('import io')
-
+
  code = '\n'.join(import_lines + [''] + other_lines)
-
+
+ TEMPLATE_NAMES = [
+ "viz_pie","viz_stacked_bar","viz_count_bar","viz_box","viz_scatter",
+ "viz_distribution","viz_kde","viz_area","viz_line",
+ "classification","regression","clustering","anomaly_detection",
+ "ts_anomaly_detection","time_series_forecasting","time_series_classification",
+ "dimensionality_reduction","feature_selection","eda_overview","eda_correlation",
+ "multilabel_classification","recommendation","topic_modelling"
+ ]
+
+ used = [t for t in TEMPLATE_NAMES if re.search(rf"\b{t}\s*\(", code)]
+ if used:
+ import_line = (
+ "from syntaxmatrix.agentic.model_templates import " +
+ ", ".join(sorted(set(used)))
+ )
+ if import_line not in code:
+ code = import_line + "\n" + code
+
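
(Note: the published cell used `rf"\\b{t}\\s*\\("`, which in a raw f-string matches a literal backslash rather than a word boundary, so `used` was always empty; the detection regex is corrected above. It can be sanity-checked in isolation; the sample `code` string is invented:)

```python
import re

TEMPLATE_NAMES = ["viz_pie", "viz_box", "regression"]
code = 'viz_box(df, x="region", y="sales")\nresult = regression(df)'

# \b ensures whole-name matches; \s*\( requires an actual call site.
used = [t for t in TEMPLATE_NAMES if re.search(rf"\b{t}\s*\(", code)]
print(used)  # ['viz_box', 'regression']

import_line = ("from syntaxmatrix.agentic.model_templates import "
               + ", ".join(sorted(set(used))))
print(import_line)
# from syntaxmatrix.agentic.model_templates import regression, viz_box
```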
  return code.strip()
-
-
+
+ return "Error: AI code generation failed."
+
+
  def sanitize_rough_to_markdown_task(self, rough: str) -> str:
  """
  Return only the Task text (no tags).