syntaxmatrix-2.5.6.1-py3-none-any.whl → syntaxmatrix-2.5.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- syntaxmatrix/agentic/agents.py +80 -116
- syntaxmatrix/core.py +3 -3
- syntaxmatrix/routes.py +192 -6
- syntaxmatrix/settings/model_map.py +4 -3
- syntaxmatrix/templates/dashboard.html +206 -87
- syntaxmatrix/templates/dataset_resize.html +535 -0
- syntaxmatrix/utils.py +9 -0
- {syntaxmatrix-2.5.6.1.dist-info → syntaxmatrix-2.5.8.dist-info}/METADATA +1 -1
- {syntaxmatrix-2.5.6.1.dist-info → syntaxmatrix-2.5.8.dist-info}/RECORD +12 -11
- {syntaxmatrix-2.5.6.1.dist-info → syntaxmatrix-2.5.8.dist-info}/WHEEL +0 -0
- {syntaxmatrix-2.5.6.1.dist-info → syntaxmatrix-2.5.8.dist-info}/licenses/LICENSE.txt +0 -0
- {syntaxmatrix-2.5.6.1.dist-info → syntaxmatrix-2.5.8.dist-info}/top_level.txt +0 -0
syntaxmatrix/agentic/agents.py
CHANGED
@@ -42,7 +42,7 @@ def token_calculator(total_input_content, llm_profile):
     input_prompt_tokens = len(enc.encode(total_input_content))
     return input_prompt_tokens
 
-def mlearning_agent(user_prompt, system_prompt, coding_profile
+def mlearning_agent(user_prompt, system_prompt, coding_profile):
     """
     Returns:
         (text, usage_dict)
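The context lines above show token_calculator estimating prompt size with enc.encode(). A minimal standalone sketch of that counting pattern, assuming a tiktoken encoder (the encoding name is an assumption; the package builds enc elsewhere from the LLM profile):

```python
# Hedged sketch of prompt-token counting with tiktoken.
# "cl100k_base" is an assumed encoding; token_calculator's real `enc` comes from llm_profile.
import tiktoken

def count_prompt_tokens(total_input_content: str) -> int:
    enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(total_input_content))

print(count_prompt_tokens("Summarise the uploaded dataset and suggest a model."))
```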
@@ -95,72 +95,41 @@ def mlearning_agent(user_prompt, system_prompt, coding_profile, temperature=0.1,
     # Google
     def google_generate_code():
         nonlocal usage
-
-
-
-
-
-
-
-
-
-
-
-        )
+        config = types.GenerateContentConfig(
+            system_instruction=system_prompt,
+            # Optional: Force the model to generate a Python code block as JSON
+            response_mime_type="application/json",
+            response_schema=types.Schema(
+                type=types.Type.OBJECT,
+                properties={
+                    "code": types.Schema(type=types.Type.STRING, description="The runnable Python code."),
+                    "explanation": types.Schema(type=types.Type.STRING, description="A brief explanation of the code."),
+                },
+                required=["code"]
+            ),
+        )
 
-
-
+        try:
+            response = _client.models.generate_content(
                 model=_model,
-                contents=
+                contents=user_prompt,
                 config=config,
             )
+        except Exception as e:
+            return f"An error occurred during API call: {e}"
 
-
-
-
-
-
-
-        # 4. Response Extraction (same robust logic as before)
-        text = getattr(resp, "text", None)
-        if isinstance(text, str) and text.strip():
-            return text.strip()
-
-        chunks = []
-        candidates = getattr(resp, "candidates", None) or []
-        for cand in candidates:
-            content = getattr(cand, "content", None)
-            if content:
-                parts = getattr(content, "parts", None) or []
-                for part in parts:
-                    t = getattr(part, "text", None)
-                    if t:
-                        chunks.append(str(t))
-
-        text = "\n".join(chunks).strip()
-        if text:
-            return text
-
-        # 5. Handle blocked response
-        fb = getattr(resp, "prompt_feedback", None)
-        block_reason = getattr(fb, "block_reason", None) if fb else None
-        if block_reason and block_reason != types.BlockedReason.REASON_UNSPECIFIED:
-            raise RuntimeError(f"{_model} blocked the response. Reason: {block_reason.name}")
-        raise RuntimeError(f"{_model} failed to return content due to insufficient data.")
+        # 3. Token Usage Capture and Context Overhead Calculation
+        um = response.usage_metadata
+        usage["input_tokens"] = um.prompt_token_count
+        usage["output_tokens"] = um.candidates_token_count + um.thoughts_token_count
+        usage["total_tokens"] = um.total_token_count
 
-
-
-
+        try:
+            # The response text will be a JSON string due to the config.
+            response_json = json.loads(response.text)
+            return response_json.get("code", "Error: Code field not found in response.")
         except Exception as e:
-
-
-            # --- Return the error message wrapped in the required output code structure ---
-            msg = f"I smxAI have instructed {error_msg}\n"
-            return (
-                f"# {msg}\n"
-                "from syntaxmatrix.display import show\n"
-                f"show({msg!r})\n"
-            )
+            return f"Error parsing response as JSON: {e}\nRaw Response: {response.text}"
 
     # OpenAI Responses API
     def gpt_models_latest_generate_code():
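The rewritten google_generate_code path forces a JSON reply with a required "code" field and reads token counts from usage_metadata. A self-contained sketch of the same pattern with the google-genai SDK follows; the client construction, model id, and prompts are assumptions for illustration, not taken from the package:

```python
# Hedged sketch: structured JSON code generation with the google-genai SDK.
# Client setup, model id, and prompts are illustrative assumptions.
import json
from google import genai
from google.genai import types

client = genai.Client()  # assumes GEMINI_API_KEY is set in the environment

config = types.GenerateContentConfig(
    system_instruction="Return runnable Python code for the user's request.",
    response_mime_type="application/json",
    response_schema=types.Schema(
        type=types.Type.OBJECT,
        properties={
            "code": types.Schema(type=types.Type.STRING),
            "explanation": types.Schema(type=types.Type.STRING),
        },
        required=["code"],
    ),
)

response = client.models.generate_content(
    model="gemini-2.5-flash",  # assumed model id
    contents="Plot a histogram of the 'age' column of df.",
    config=config,
)

um = response.usage_metadata
print(um.prompt_token_count, um.total_token_count)
print(json.loads(response.text)["code"])
```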
@@ -170,7 +139,7 @@ def mlearning_agent(user_prompt, system_prompt, coding_profile, temperature=0.1,
         reasoning_effort, verbosity = "medium", "medium"
         if _model == "gpt-5-nano":
             reasoning_effort, verbosity = "low", "low"
-        elif _model in ["gpt-5-mini", "gpt-5-codex
+        elif _model in ["gpt-5-mini", "gpt-5-mini-codex"]:
             reasoning_effort, verbosity = "medium", "medium"
         elif _model in ["gpt-5", "gpt-5-codex", "gpt-5-pro"]:
             reasoning_effort, verbosity = "high", "high"

@@ -194,19 +163,7 @@ def mlearning_agent(user_prompt, system_prompt, coding_profile, temperature=0.1,
 
             code = _out(resp).strip()
             if code:
-                return code
-
-            # Try to surface any block reason (safety / policy / etc.)
-            block_reason = None
-            output = resp.get("output")
-            for item in output:
-                fr = getattr(item, "finish_reason", None)
-                if fr and fr != "stop":
-                    block_reason = fr
-                    break
-            if block_reason:
-                raise RuntimeError(f"{_model} stopped with reason: {block_reason}")
-            raise RuntimeError(f"{_model} returned an empty response in this section due to insufficient data.")
+                return code
 
         except APIError as e:
             # IMPORTANT: return VALID PYTHON so the dashboard can show the error

@@ -225,15 +182,14 @@ def mlearning_agent(user_prompt, system_prompt, coding_profile, temperature=0.1,
                 "from syntaxmatrix.display import show\n"
                 f"show({msg!r})\n"
             )
-
+
     # Anthropic
     def anthropic_generate_code():
         nonlocal usage
         try:
             resp = _client.messages.create(
                 model=_model,
-
-                temperature=temperature,
+                temperature=0,
                 system=system_prompt,
                 messages=[
                     {"role": "user", "content": user_prompt}

@@ -276,40 +232,43 @@ def mlearning_agent(user_prompt, system_prompt, coding_profile, temperature=0.1,
     def openai_sdk_generate_code():
         nonlocal usage
         try:
-
+            response = None
+            if _model == "deepseek-reasoner":
+                response = _client.chat.completions.create(
+                    model=_model,
+                    messages=[
+                        {"role": "system", "content": system_prompt},
+                        {"role": "user", "content": user_prompt},
+                    ],
+                    extra_body={"thinking": {"type": "enabled"}},
+                    temperature=0,
+                    stream=False
+                )
+            else:
+                response = _client.chat.completions.create(
                     model=_model,
                     messages=[
                         {"role": "system", "content": system_prompt},
                         {"role": "user", "content": user_prompt},
                     ],
-                temperature=
-
+                    temperature=0,
+                    stream=False
                 )
+            content = response.choices[0].message.content
 
-
-
-            um = resp.usage
+            um = response.usage
             usage["input_tokens"] = um.prompt_tokens
             usage["output_tokens"] = um.completion_tokens
             usage["total_tokens"] = um.total_tokens
 
-
-
-
-
-
-
-
-
-            first = choices[0]
-            fr = getattr(first, "finish_reason", None)
-            if fr and fr != "stop":
-                block_reason = fr
-
-            if block_reason:
-                raise RuntimeError(f"{_model} stopped with reason: {block_reason}")
-            # Fallback: nothing useful came back
-            raise RuntimeError(f"{_model} returned nothing in this section due to insufficient data.")
+            code_match = re.search(r"```(?:python)?\n(.*?)```", content, re.DOTALL)
+
+            if code_match:
+                return code_match.group(1).strip()
+            else:
+                # If no markdown blocks are found, return the raw content
+                # (assuming the model obeyed instructions to output only code)
+                return content.strip()
 
         except Exception as e:
             # IMPORTANT: return VALID PYTHON so the dashboard can show the error
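The new openai_sdk_generate_code branch returns whatever sits inside a fenced Python block in the chat reply, or the raw text if the model answered with bare code. A small standalone sketch of that extraction step (the sample reply is invented):

```python
# Hedged sketch of pulling a fenced Python block out of an LLM chat reply.
# Mirrors the regex shown in the diff; the sample reply is for illustration only.
import re

def extract_code(content: str) -> str:
    match = re.search(r"```(?:python)?\n(.*?)```", content, re.DOTALL)
    if match:
        return match.group(1).strip()
    # No fenced block found: assume the reply is already bare code.
    return content.strip()

reply = "Here is the script:\n```python\nprint('hello')\n```\nDone."
print(extract_code(reply))  # -> print('hello')
```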
@@ -318,9 +277,7 @@ def mlearning_agent(user_prompt, system_prompt, coding_profile, temperature=0.1,
                 f"# {msg}\n"
                 "from syntaxmatrix.display import show\n"
                 f"show({msg!r})\n"
-            )
-
-    # print("TTOOKKEENN: ", token_calculator(system_prompt + user_prompt, coding_profile))
+            )
 
     if _provider == "google":
         code = google_generate_code()

@@ -427,9 +384,11 @@ def refine_question_agent(raw_question: str, dataset_context: str | None = None)
 
     system_prompt = ("""
     - You are a Machine Learning (ML) and Data Science (DS) expert.
-    -
-    -
-    -
+    - Your goal is to use the provided dataset summary to convert given question into clear ML job specifications.
+    - Use the provided dataset summary to respect columns and aid you in properly refining the user question.
+    - Include chronological outline in order to guide a code generator to avoid falling off tracks.
+    - DO NOT include any prelude or preamble. Just the refined tasks.
+    - If and only if the dataset summary columns are not relevant to your desired columns that you deduced by analysing the question, and you suspect that the wrong dataset was used in the dataset summary, stop and just say: 'incompatible'.
     """)
 
     user_prompt = f"User question:\n{raw_question}\n\n"

@@ -447,7 +406,7 @@ def refine_question_agent(raw_question: str, dataset_context: str | None = None)
 
 
 def classify_ml_job_agent(refined_question, dataset_profile):
-
+    import ast
     def ml_response(user_prompt, system_prompt, profile):
         _profile = profile  # _prof.get_profile["admin"]
 

@@ -562,8 +521,7 @@ def classify_ml_job_agent(refined_question, dataset_profile):
     system_prompt = ("""
     You are a strict machine learning task classifier for an ML workbench.
     Your goal is to correctly label the user's task specifications with the most relevant tags from a fixed list.
-    You Must always have 'data_preprocessing' as the 1st tag. Then add
-    If no relevant tag, default to "data_preprocessing" and return that alone.
+    You Must always have 'data_preprocessing' as the 1st tag. Then add all other relevant tags.
     You should return only your list of tags, no prelude or preamble.
     """)
 

@@ -583,15 +541,14 @@ def classify_ml_job_agent(refined_question, dataset_profile):
         "generative_modeling", "causal_inference", "risk_modeling", "graph_analysis",
 
         # Foundational/Pipeline Steps
-        "feature_engineering", "statistical_inference", "
-        "model_validation", "hyperparameter_tuning"
+        "data_preprocessing", "feature_engineering", "statistical_inference", "clustering", "hyperparameter_tuning"
     ]
 
     # --- 2. Construct the Generalized Prompt for the LLM ---
     task_description = refined_question
 
     user_prompt = f"""
-    Analyze the following task description:
+    Analyze and classify the following task description:
     ---
     {task_description}
     ---

@@ -604,7 +561,7 @@ def classify_ml_job_agent(refined_question, dataset_profile):
     ML Jobs List: {', '.join(ml_task_list)}
 
     Respond ONLY with a valid JSON array of strings containing the selected ML job names.
-    Example Response: ["
+    Example Response: ["data_preprocessing", "regression", "classification", "feature_engineering"]
     """
 
     if dataset_profile:

@@ -612,13 +569,20 @@ def classify_ml_job_agent(refined_question, dataset_profile):
 
     llm_profile = _prof.get_profile("classification") or _prof.get_profile("admin")
     if not llm_profile:
-        return
+        return (
+            "<div class='smx-alert smx-alert-warn'>"
+            "No LLM profile is configured for Classification. Please, do that in the Admin panel or contact your Administrator."
+            "</div>"
+        )
+
 
     llm_profile['client'] = _prof.get_client(llm_profile)
 
-    # Extract raw content
     tasks = ml_response(user_prompt, system_prompt, llm_profile)
-
+    try:
+        return ast.literal_eval(tasks)
+    except Exception:
+        return tasks
 
 
 def text_formatter_agent(text):
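classify_ml_job_agent now asks the classifier for a JSON-style array of tags and parses the reply with ast.literal_eval, falling back to the raw string. A short sketch of that parse-or-passthrough behaviour (sample replies invented):

```python
# Hedged sketch of the tag-parsing step used after the classifier LLM responds.
import ast

def parse_tags(reply: str):
    try:
        return ast.literal_eval(reply)   # e.g. '["data_preprocessing", "regression"]' -> list
    except Exception:
        return reply                     # anything unparseable is passed through unchanged

print(parse_tags('["data_preprocessing", "regression"]'))
print(parse_tags("I could not classify this task."))
```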
syntaxmatrix/core.py
CHANGED
@@ -599,7 +599,7 @@ class SyntaxMUI:
         from syntaxmatrix.gpt_models_latest import extract_output_text as _out, set_args
 
         if not self._classification_profile:
-            classification_profile = _prof.get_profile('classification') or _prof.get_profile('
+            classification_profile = _prof.get_profile('classification') or _prof.get_profile('admin')
             if not classification_profile:
                 return {"Error": "Set a profile for Classification"}
             self._classification_profile = classification_profile

@@ -1317,11 +1317,11 @@ class SyntaxMUI:
         """)
 
         if not self._coding_profile:
-            coding_profile = _prof.get_profile("coding") or _prof.get_profile("admin")
+            coding_profile = _prof.get_profile("coding")  # or _prof.get_profile("admin")
             if not coding_profile:
                 return (
                     '<div class="smx-alert smx-alert-warn">'
-                    'No LLM profile configured for <code>coding</code>
+                    'No LLM profile configured for <code>coding</code> <br>'
                     'Please, add the LLM profile inside the admin panel or contact your Administrator.'
                     '</div>'
                 )
syntaxmatrix/routes.py
CHANGED
@@ -65,6 +65,7 @@ _CLIENT_DIR = detect_project_root()
 _stream_q = queue.Queue()
 _stream_cancelled = {}
 _last_result_html = {}    # { session_id: html_doc }
+_last_resized_csv = {}    # { resize_id: bytes for last resized CSV per browser session }
 
 # single, reused formatter: inline styles, padding, rounded corners, scroll
 _FMT = _HtmlFmt(

@@ -3047,7 +3048,7 @@ def setup_routes(smx):
                 }) + "\n\n"
 
         except GeneratorExit:
-
+            return "Client aborted the stream."
         except Exception as e:
             smx.error(f"Stream error: {e}")
             yield "data: " + json.dumps({"event": "error", "error": str(e)}) + "\n\n"

@@ -5610,8 +5611,19 @@ def setup_routes(smx):
         dataset_profile = f"modality: tabular; columns: {columns_summary}"
 
         refined_question = refine_question_agent(askai_question, dataset_context)
-        tags =
-
+        tags = []
+        if refined_question.lower() == "incompatible" or refined_question.lower() == "mismatch":
+            return ("""
+                <div style="position: fixed; top: 50%; left: 50%; transform: translate(-50%, -50%); text-align: center;">
+                    <h1 style="margin: 0 0 10px 0;">Oops: Context mismatch</h1>
+                    <p style="margin: 0;">Please, upload the proper dataset for solution to your query.</p>
+                    <br>
+                    <a class='button' href='/dashboard' style='text-decoration:none;'>Return</a>
+                </div>
+            """)
+        else:
+            tags = classify_ml_job_agent(refined_question, dataset_profile)
+
         ai_code = smx.ai_generate_code(refined_question, tags, df)
         llm_usage = smx.get_last_llm_usage()
         ai_code = auto_inject_template(ai_code, tags, df)
@@ -6513,8 +6525,8 @@
             cell["highlighted_code"] = Markup(_pygmentize(cell["code"]))
 
         highlighted_ai_code = _pygmentize(ai_code)
-
-
+        smxAI = "Orion"
+
         return render_template(
             "dashboard.html",
             section=section,

@@ -6525,7 +6537,8 @@
             highlighted_ai_code=highlighted_ai_code if ai_code else None,
             askai_question=smx.sanitize_rough_to_markdown_task(askai_question),
             refined_question=refined_question,
-            tasks=
+            tasks=tags,
+            smxAI=smxAI,
             data_cells=data_cells,
             session_id=session_id,
             llm_usage=llm_usage

@@ -6589,6 +6602,179 @@
         # go back to the dashboard; dashboard() will auto-select the next file
         return redirect(url_for("dashboard"))
 
+    # ── DATASET RESIZE (independent helper page) -------------------------
+
+
+    @smx.app.route("/dataset/resize", methods=["GET", "POST"])
+    def dataset_resize():
+        """
+        User uploads any CSV and picks a target size (percentage of rows).
+        We keep the last resized CSV in memory and expose a download link.
+        """
+        # One id per browser session to index _last_resized_csv
+        resize_id = session.get("dataset_resize_id")
+        if not resize_id:
+            resize_id = str(uuid.uuid4())
+            session["dataset_resize_id"] = resize_id
+
+        resize_info = None  # stats we pass down to the template
+
+        if request.method == "POST":
+            file = request.files.get("dataset_file")
+            target_pct_raw = (request.form.get("target_pct") or "").strip()
+            strat_col = (request.form.get("strat_col") or "").strip()
+
+            error_msg = None
+            df = None
+
+            # --- Basic validation ---
+            if not file or file.filename == "":
+                error_msg = "Please choose a CSV file."
+            elif not file.filename.lower().endswith(".csv"):
+                error_msg = "Only CSV files are supported."
+
+            # --- Read CSV into a DataFrame ---
+            if not error_msg:
+                try:
+                    df = pd.read_csv(file)
+                except Exception as e:
+                    error_msg = f"Could not read CSV: {e}"
+
+            # --- Parse target percentage ---
+            pct = None
+            if not error_msg:
+                try:
+                    pct = float(target_pct_raw)
+                except Exception:
+                    error_msg = "Target size must be a number between 1 and 100."
+
+            if not error_msg and (pct <= 0 or pct > 100):
+                error_msg = "Target size must be between 1 and 100."
+
+            if error_msg:
+                flash(error_msg, "error")
+            else:
+                frac = pct / 100.0
+                n_orig = len(df)
+                n_target = max(1, int(round(n_orig * frac)))
+
+                df_resized = None
+                used_strat = False
+
+                # --- Advanced: stratified sampling by a column (behind 'Show advanced options') ---
+                if strat_col and strat_col in df.columns and n_orig > 0:
+                    used_strat = True
+                    groups = df.groupby(strat_col, sort=False)
+
+                    # First pass: proportional allocation with rounding and minimum 1 per non-empty group
+                    allocations = {}
+                    total_alloc = 0
+                    for key, group in groups:
+                        size = len(group)
+                        if size <= 0:
+                            allocations[key] = 0
+                            continue
+                        alloc = int(round(size * frac))
+                        if alloc == 0 and size > 0:
+                            alloc = 1
+                        if alloc > size:
+                            alloc = size
+                        allocations[key] = alloc
+                        total_alloc += alloc
+
+                    keys = list(allocations.keys())
+
+                    # Adjust downwards if we overshot
+                    if total_alloc > n_target:
+                        idx = 0
+                        while total_alloc > n_target and any(v > 1 for v in allocations.values()):
+                            k = keys[idx % len(keys)]
+                            if allocations[k] > 1:
+                                allocations[k] -= 1
+                                total_alloc -= 1
+                            idx += 1
+
+                    # Adjust upwards if we undershot and we still have room in groups
+                    if total_alloc < n_target and keys:
+                        idx = 0
+                        while total_alloc < n_target:
+                            k = keys[idx % len(keys)]
+                            group_size = len(groups.get_group(k))
+                            if allocations[k] < group_size:
+                                allocations[k] += 1
+                                total_alloc += 1
+                            idx += 1
+                            if idx > len(keys) * 3:
+                                break
+
+                    sampled_parts = []
+                    for key, group in groups:
+                        n_g = allocations.get(key, 0)
+                        if n_g > 0:
+                            sampled_parts.append(group.sample(n=n_g, random_state=0))
+
+                    if sampled_parts:
+                        df_resized = (
+                            pd.concat(sampled_parts, axis=0)
+                            .sample(frac=1.0, random_state=0)
+                            .reset_index(drop=True)
+                        )
+
+                # --- Default: simple random sample over all rows ---
+                if df_resized is None:
+                    if n_target >= n_orig:
+                        df_resized = df.copy()
+                    else:
+                        df_resized = df.sample(n=n_target, random_state=0).reset_index(drop=True)
+                    if strat_col and strat_col not in df.columns:
+                        flash(
+                            f"Column '{strat_col}' not found. Used simple random sampling instead.",
+                            "warning",
+                        )
+
+                # --- Serialise to CSV in memory and stash in _last_resized_csv ---
+                buf = _std_io.BytesIO()
+                df_resized.to_csv(buf, index=False)
+                buf.seek(0)
+                _last_resized_csv[resize_id] = buf.getvalue()
+
+                resize_info = {
+                    "rows_in": n_orig,
+                    "rows_out": len(df_resized),
+                    "pct": pct,
+                    "used_strat": used_strat,
+                    "strat_col": strat_col if used_strat else "",
+                }
+                flash("Dataset resized successfully. Use the download link below.", "success")
+
+        return render_template("dataset_resize.html", resize_info=resize_info)
+
+    @smx.app.route("/dataset/resize/download", methods=["GET"])
+    def download_resized_dataset():
+        """Download the last resized dataset for this browser session as a CSV."""
+        resize_id = session.get("dataset_resize_id")
+        if not resize_id:
+            return ("No resized dataset available.", 404)
+
+        data = _last_resized_csv.get(resize_id)
+        if not data:
+            return ("No resized dataset available.", 404)
+
+        buf = _std_io.BytesIO(data)
+        buf.seek(0)
+        stamp = datetime.now().strftime("%Y%m%d-%H%M%S-%f")
+        filename = f"resized_dataset_{stamp}.csv"
+
+        # Drop it from memory once downloaded
+        _last_resized_csv.pop(resize_id, None)
+
+        return send_file(
+            buf,
+            mimetype="text/csv; charset=utf-8",
+            as_attachment=True,
+            download_name=filename,
+        )
+
 
 def _pdf_fallback_reportlab(full_html: str):
     """ReportLab fallback: extract text + base64 <img> and lay them out."""
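The new /dataset/resize route shrinks an uploaded CSV to a target percentage and can keep per-group proportions when a stratification column is given. A compact pandas sketch of the same proportional idea (invented data, and without the route's overshoot and undershoot adjustments):

```python
# Hedged sketch of proportional (stratified) downsampling with pandas.
# The DataFrame and column names are invented for illustration.
import pandas as pd

df = pd.DataFrame({
    "label": ["a"] * 60 + ["b"] * 30 + ["c"] * 10,
    "value": range(100),
})

frac = 0.2  # keep roughly 20% of rows, at least one row per group

parts = [
    grp.sample(n=max(1, int(round(len(grp) * frac))), random_state=0)
    for _, grp in df.groupby("label", sort=False)
]
resized = pd.concat(parts).sample(frac=1.0, random_state=0).reset_index(drop=True)

print(len(resized))                               # ~20 rows
print(resized["label"].value_counts().to_dict())  # proportions roughly preserved
```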
syntaxmatrix/settings/model_map.py
CHANGED
@@ -36,12 +36,14 @@ PROVIDERS_MODELS = {
 
     ],
     #4
-    "deepseek": [
+    "deepseek": [
+        "deepseek-reasoner",
         "deepseek-chat",
     ],
     #5
     "moonshot": [
-        "kimi-k2-
+        "kimi-k2-thinking",
+        "kimi-k2-instruct",
     ],
     #6
     "alibaba": [

@@ -57,7 +59,6 @@ PROVIDERS_MODELS = {
         "claude-sonnet-4-5",
         "claude-sonnet-4-0",
         "claude-3-5-haiku-latest",
-        "claude-3-haiku-20240307",
     ]
 }
 