syntaxmatrix 2.3.5__py3-none-any.whl → 2.5.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- syntaxmatrix/agentic/__init__.py +0 -0
- syntaxmatrix/agentic/agent_tools.py +24 -0
- syntaxmatrix/agentic/agents.py +810 -0
- syntaxmatrix/agentic/code_tools_registry.py +37 -0
- syntaxmatrix/agentic/model_templates.py +1790 -0
- syntaxmatrix/commentary.py +134 -112
- syntaxmatrix/core.py +385 -245
- syntaxmatrix/dataset_preprocessing.py +218 -0
- syntaxmatrix/display.py +89 -37
- syntaxmatrix/gpt_models_latest.py +5 -4
- syntaxmatrix/profiles.py +19 -4
- syntaxmatrix/routes.py +947 -141
- syntaxmatrix/settings/model_map.py +38 -30
- syntaxmatrix/static/icons/hero_bg.jpg +0 -0
- syntaxmatrix/templates/dashboard.html +248 -54
- syntaxmatrix/utils.py +2254 -84
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.4.dist-info}/METADATA +16 -17
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.4.dist-info}/RECORD +21 -15
- syntaxmatrix/model_templates.py +0 -29
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.4.dist-info}/WHEEL +0 -0
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.4.dist-info}/licenses/LICENSE.txt +0 -0
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.4.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
# syntaxmatrix/dataset_preprocessing.py
|
|
2
|
+
# -----------------------------------------------------------------------------
|
|
3
|
+
# Dataset-agnostic cleaning for analysis with imputation and audit outputs.
|
|
4
|
+
# Writes:
|
|
5
|
+
# DATA_FOLDER / selected_dataset / cleaned_df.csv
|
|
6
|
+
# DATA_FOLDER / selected_dataset / missingness.csv
|
|
7
|
+
# Does NOT mutate the in-memory EDA df. Call ensure_cleaned_df(...) after df load.
|
|
8
|
+
# -----------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
import os
|
|
12
|
+
import re
|
|
13
|
+
import pandas as pd
|
|
14
|
+
import numpy as np
|
|
15
|
+
from typing import Tuple, Dict
|
|
16
|
+
|
|
17
|
+
__all__ = ["ensure_cleaned_df"]
|
|
18
|
+
|
|
19
|
+
# Common tokens that should be treated as missing
|
|
20
|
+
_MISSING_TOKENS = {
|
|
21
|
+
"", "na", "n/a", "n.a.", "nan", "none", "null", "-", "--", "?", "unknown"
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
_BOOL_TRUE = {"true", "t", "yes", "y", "1", "on"}
|
|
25
|
+
_BOOL_FALSE = {"false", "f", "no", "n", "0", "off"}
|
|
26
|
+
|
|
27
|
+
# Columns whose names hint at date/time content (case-insensitive)
|
|
28
|
+
_DATE_HINTS = re.compile(r"(date|time|timestamp|_dt)$", re.IGNORECASE)
|
|
29
|
+
|
|
30
|
+
# -----------------------------------------------------------------------------
|
|
31
|
+
# Helpers
|
|
32
|
+
# -----------------------------------------------------------------------------
|
|
33
|
+
|
|
34
|
+
def _strip_column_names_only(df: pd.DataFrame) -> pd.DataFrame:
|
|
35
|
+
"""Trim surrounding whitespace in column names (preserve original names)."""
|
|
36
|
+
df = df.copy()
|
|
37
|
+
df.rename(columns=lambda c: str(c).strip(), inplace=True)
|
|
38
|
+
return df
|
|
39
|
+
|
|
40
|
+
def _standardise_missing_tokens(s: pd.Series) -> pd.Series:
|
|
41
|
+
"""Map common missing tokens to NaN in object-like columns."""
|
|
42
|
+
if s.dtype != "object":
|
|
43
|
+
return s
|
|
44
|
+
mapped = s.astype(str).str.strip()
|
|
45
|
+
lowered = mapped.str.lower()
|
|
46
|
+
is_missing = lowered.isin(_MISSING_TOKENS)
|
|
47
|
+
mapped = mapped.mask(is_missing, np.nan)
|
|
48
|
+
return mapped
|
|
49
|
+
|
|
50
|
+
def _coerce_booleans(s: pd.Series) -> pd.Series:
|
|
51
|
+
if s.dtype != "object":
|
|
52
|
+
return s
|
|
53
|
+
cand = s.astype(str).str.strip().str.lower()
|
|
54
|
+
uniq = set(cand.dropna().unique().tolist())
|
|
55
|
+
if uniq and uniq.issubset(_BOOL_TRUE | _BOOL_FALSE):
|
|
56
|
+
return cand.map(lambda v: True if v in _BOOL_TRUE else False if v in _BOOL_FALSE else np.nan)
|
|
57
|
+
return s
|
|
58
|
+
|
|
59
|
+
_NUM_RE = re.compile(r"[,\s£$€]")
|
|
60
|
+
|
|
61
|
+
def _looks_numeric(x: str) -> bool:
|
|
62
|
+
v = _NUM_RE.sub("", x.strip()).replace("%", "")
|
|
63
|
+
return bool(re.match(r"^[+-]?(\d+(\.\d*)?|\.\d+)$", v))
|
|
64
|
+
|
|
65
|
+
def _coerce_numerics(s: pd.Series) -> pd.Series:
|
|
66
|
+
if s.dtype != "object":
|
|
67
|
+
return s
|
|
68
|
+
sample = s.dropna().astype(str).head(1000)
|
|
69
|
+
if len(sample) == 0:
|
|
70
|
+
return s
|
|
71
|
+
ratio = np.mean([_looks_numeric(x) for x in sample])
|
|
72
|
+
if ratio >= 0.8:
|
|
73
|
+
cleaned = _NUM_RE.sub("", s.astype(str).str.strip())
|
|
74
|
+
# If many values end with %, interpret as percent
|
|
75
|
+
if (cleaned.str.endswith("%")).mean() > 0.6:
|
|
76
|
+
# remove % and divide by 100
|
|
77
|
+
cleaned = cleaned.str.replace("%", "", regex=False)
|
|
78
|
+
out = pd.to_numeric(cleaned, errors="coerce") / 100.0
|
|
79
|
+
else:
|
|
80
|
+
out = pd.to_numeric(cleaned.str.replace("%", "", regex=False), errors="coerce")
|
|
81
|
+
return out
|
|
82
|
+
return s
|
|
83
|
+
|
|
84
|
+
def _parse_datetimes(df: pd.DataFrame, col: str) -> pd.Series:
|
|
85
|
+
"""Parse datetimes robustly; produce tz-naive UTC for consistent .dt."""
|
|
86
|
+
s = df[col].astype(str)
|
|
87
|
+
dt = pd.to_datetime(s, errors="coerce", infer_datetime_format=True, utc=True)
|
|
88
|
+
if dt.isna().mean() > 0.9:
|
|
89
|
+
# strip trailing ' (PDT)' etc.
|
|
90
|
+
s2 = s.str.replace(r"\s*\([^)]*\)\s*$", "", regex=True)
|
|
91
|
+
dt = pd.to_datetime(s2, errors="coerce", infer_datetime_format=True, utc=True)
|
|
92
|
+
# Convert to tz-naive UTC if we parsed anything meaningful
|
|
93
|
+
if dt.notna().sum() >= max(3, int(0.1 * len(df))):
|
|
94
|
+
try:
|
|
95
|
+
return dt.dt.tz_convert("UTC").dt.tz_localize(None)
|
|
96
|
+
except Exception:
|
|
97
|
+
return dt # already tz-naive
|
|
98
|
+
return df[col] # leave original if parsing failed
|
|
99
|
+
|
|
100
|
+
def _summarise_missingness(df: pd.DataFrame) -> pd.DataFrame:
|
|
101
|
+
total = len(df)
|
|
102
|
+
miss = df.isna().sum()
|
|
103
|
+
pct = (miss / total * 100.0).round(2)
|
|
104
|
+
dtype = df.dtypes.astype(str)
|
|
105
|
+
return pd.DataFrame({"column": df.columns, "missing": miss.values, "missing_%": pct.values, "dtype": dtype.values})
|
|
106
|
+
|
|
107
|
+
# -----------------------------------------------------------------------------
|
|
108
|
+
# Main cleaner (type coercion + imputation for analysis)
|
|
109
|
+
# -----------------------------------------------------------------------------
|
|
110
|
+
|
|
111
|
+
def _clean_and_coerce(df: pd.DataFrame) -> pd.DataFrame:
|
|
112
|
+
df = df.copy()
|
|
113
|
+
# 0) tidy strings and standardise missing tokens
|
|
114
|
+
for c in df.columns:
|
|
115
|
+
s = df[c]
|
|
116
|
+
if s.dtype == "object":
|
|
117
|
+
s = s.astype(str).str.strip().str.replace(r"\s+", " ", regex=True)
|
|
118
|
+
s = _standardise_missing_tokens(s)
|
|
119
|
+
df[c] = s
|
|
120
|
+
|
|
121
|
+
# 1) booleans
|
|
122
|
+
for c in df.columns:
|
|
123
|
+
df[c] = _coerce_booleans(df[c])
|
|
124
|
+
|
|
125
|
+
# 2) numerics
|
|
126
|
+
for c in df.columns:
|
|
127
|
+
df[c] = _coerce_numerics(df[c])
|
|
128
|
+
|
|
129
|
+
# 3) datetimes (by name hint + explicit 'saledate')
|
|
130
|
+
for c in list(df.columns):
|
|
131
|
+
n = str(c).lower()
|
|
132
|
+
if _DATE_HINTS.search(n) or n == "saledate":
|
|
133
|
+
try:
|
|
134
|
+
df[c] = _parse_datetimes(df, c)
|
|
135
|
+
except Exception:
|
|
136
|
+
pass
|
|
137
|
+
|
|
138
|
+
# 4) drop exact duplicates
|
|
139
|
+
df = df.drop_duplicates()
|
|
140
|
+
return df
|
|
141
|
+
|
|
142
|
+
def _impute_for_analysis(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, str]]:
|
|
143
|
+
"""
|
|
144
|
+
Impute missing values:
|
|
145
|
+
- numeric -> median
|
|
146
|
+
- categorical/object/bool -> most frequent (fallback 'Unknown')
|
|
147
|
+
Adds <col>__imputed boolean flags where any fills occurred.
|
|
148
|
+
Returns cleaned df and a dict of imputation strategies used.
|
|
149
|
+
"""
|
|
150
|
+
df = df.copy()
|
|
151
|
+
strategy: Dict[str, str] = {}
|
|
152
|
+
|
|
153
|
+
# numeric
|
|
154
|
+
num_cols = df.select_dtypes(include=["number"]).columns.tolist()
|
|
155
|
+
for c in num_cols:
|
|
156
|
+
if df[c].isna().any():
|
|
157
|
+
med = df[c].median(skipna=True)
|
|
158
|
+
if pd.isna(med):
|
|
159
|
+
continue # cannot impute an all-NaN column
|
|
160
|
+
df[f"{c}__imputed"] = df[c].isna()
|
|
161
|
+
df[c] = df[c].fillna(med)
|
|
162
|
+
strategy[c] = "median"
|
|
163
|
+
|
|
164
|
+
# categoricals & booleans (object/category/bool)
|
|
165
|
+
cat_cols = [c for c in df.columns
|
|
166
|
+
if df[c].dtype == "object" or str(df[c].dtype).startswith("category") or df[c].dtype == "bool"]
|
|
167
|
+
for c in cat_cols:
|
|
168
|
+
if df[c].isna().any():
|
|
169
|
+
# mode; if multiple modes, pick the first stable value
|
|
170
|
+
try:
|
|
171
|
+
mode_val = df[c].mode(dropna=True)
|
|
172
|
+
fill = mode_val.iloc[0] if not mode_val.empty else "Unknown"
|
|
173
|
+
except Exception:
|
|
174
|
+
fill = "Unknown"
|
|
175
|
+
df[f"{c}__imputed"] = df[c].isna()
|
|
176
|
+
df[c] = df[c].fillna(fill)
|
|
177
|
+
strategy[c] = f"mode('{fill}')"
|
|
178
|
+
|
|
179
|
+
return df, strategy
|
|
180
|
+
|
|
181
|
+
def ensure_cleaned_df(DATA_FOLDER: str, cleaned_folder: str, df: pd.DataFrame) -> pd.DataFrame:
|
|
182
|
+
"""
|
|
183
|
+
Build (or reuse) an analysis-ready cleaned dataset and persist to:
|
|
184
|
+
f"{DATA_FOLDER}/{selected_dataset}/cleaned_df.csv"
|
|
185
|
+
Also writes a missingness audit:
|
|
186
|
+
f"{DATA_FOLDER}/{selected_dataset}/missingness.csv"
|
|
187
|
+
Returns the cleaned frame. Does NOT mutate the provided df.
|
|
188
|
+
"""
|
|
189
|
+
target_dir = os.path.join(DATA_FOLDER, cleaned_folder)
|
|
190
|
+
os.makedirs(target_dir, exist_ok=True)
|
|
191
|
+
target_csv = os.path.join(target_dir, "cleaned_df.csv")
|
|
192
|
+
# miss_csv = os.path.join(target_dir, "missingness.csv")
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
# If a cleaned file already exists, reuse it (your pipeline already calls this once per dataset)
|
|
196
|
+
if os.path.exists(target_csv):
|
|
197
|
+
try:
|
|
198
|
+
return pd.read_csv(target_csv, low_memory=False)
|
|
199
|
+
except Exception:
|
|
200
|
+
# fall through to rebuild if unreadable
|
|
201
|
+
pass
|
|
202
|
+
|
|
203
|
+
# Pipeline: normalise headers → coerce types → impute → audits → save
|
|
204
|
+
step0 = _strip_column_names_only(df)
|
|
205
|
+
step1 = _clean_and_coerce(step0)
|
|
206
|
+
# audit BEFORE imputation (raw missingness after coercion)
|
|
207
|
+
#_summarise_missingness(step1).to_csv(miss_csv, index=False)
|
|
208
|
+
step2, _strategy = _impute_for_analysis(step1)
|
|
209
|
+
|
|
210
|
+
# Drop id-like columns (high-uniqueness or name pattern)
|
|
211
|
+
name_hit = [c for c in step2.columns if re.search(r'\b(id|uuid|vin|serial|record|row_?id)\b', c, re.I)]
|
|
212
|
+
uniq_hit = [c for c in step2.columns if step2[c].nunique(dropna=True) >= 0.98 * len(step2)]
|
|
213
|
+
id_like = sorted(set(name_hit) | set(uniq_hit))
|
|
214
|
+
step2 = step2.drop(columns=id_like, errors='ignore')
|
|
215
|
+
|
|
216
|
+
# Persist cleaned for tasks
|
|
217
|
+
step2.to_csv(target_csv, index=False)
|
|
218
|
+
return step2
|
syntaxmatrix/display.py
CHANGED
|
@@ -1,54 +1,106 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
1
|
+
"""
|
|
2
|
+
syntaxmatrix.display
|
|
3
|
+
--------------------
|
|
4
|
+
Single responsibility: render arbitrary Python objects in the SMX UI.
|
|
5
|
+
|
|
6
|
+
- Matplotlib figures: displayed directly.
|
|
7
|
+
- Pandas Styler (with .set_caption): rendered to HTML so captions always show.
|
|
8
|
+
- Pandas DataFrame/Series: rendered to HTML (no caption path).
|
|
9
|
+
- Dict of scalars: rendered as a small table.
|
|
10
|
+
- Tuple of two numbers (e.g., mse, r2): rendered as a labelled 2-row table.
|
|
11
|
+
- Everything else: shown as <pre> for safe inspection.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from typing import Any
|
|
15
|
+
import numbers
|
|
16
|
+
|
|
17
|
+
import pandas as pd
|
|
18
|
+
import matplotlib.figure as mpfig
|
|
19
|
+
from IPython.display import display, HTML
|
|
20
|
+
|
|
21
|
+
try:
|
|
22
|
+
# Optional: if pandas Styler exists, we can keep captions reliably
|
|
23
|
+
from pandas.io.formats.style import Styler as _Styler # type: ignore
|
|
24
|
+
except Exception: # pragma: no cover
|
|
25
|
+
_Styler = None # type: ignore
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
__all__ = ["show"]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# ---- internal helpers -------------------------------------------------------
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _wrap_html_table(html: str) -> str:
|
|
35
|
+
"""Apply consistent UI styling and horizontal scrolling."""
|
|
36
|
+
return (
|
|
37
|
+
"<style>"
|
|
38
|
+
"caption{caption-side: top; font-weight:600; margin:0 0 6px 0;}"
|
|
39
|
+
"table{border-collapse:collapse;font-size:0.9em;white-space:nowrap;}"
|
|
40
|
+
"th{background:#f0f2f5;text-align:left;padding:6px 8px;border:1px solid gray;}"
|
|
41
|
+
"td{border:1px solid #ddd;padding:6px 8px;}"
|
|
42
|
+
"tbody tr:nth-child(even){background-color:#f9f9f9;}"
|
|
43
|
+
"</style>"
|
|
44
|
+
"<div style='overflow-x:auto;max-width:100%;margin-bottom:1rem;'>"
|
|
45
|
+
+ html +
|
|
46
|
+
"</div>"
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ---- public API -------------------------------------------------------------
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def show(obj: Any) -> None:
    """
    Render common objects so the Dashboard (or chat) always shows output.

    Dispatch order: Matplotlib figure -> pandas Styler -> Series/DataFrame ->
    dict of scalar metrics -> two-number tuple -> <pre> fallback.

    Notes
    -----
    * Do not print here. All rendering goes through IPython's display layer.
    * Captions are supplied upstream by the SMX PREFACE via DataFrame.style.set_caption(...).
    """
    # 1) Matplotlib figures
    if isinstance(obj, mpfig.Figure):
        display(obj)
        return None

    # 2) Pandas Styler (keeps caption)
    if _Styler is not None and isinstance(obj, _Styler):  # type: ignore
        try:
            html = obj.to_html()
            display(HTML(_wrap_html_table(html)))
        except Exception:
            # Fallback: if Styler HTML fails for any reason, display raw Styler
            display(obj)
        return None

    # 3) Series / DataFrame (no caption path)
    if isinstance(obj, (pd.Series, pd.DataFrame)):
        html = obj.to_html(classes="smx-table", border=0)
        display(HTML(_wrap_html_table(html)))
        return None

    # 4) Dict of scalar numbers -> pretty 2-col table
    if isinstance(obj, dict) and all(isinstance(v, numbers.Number) for v in obj.values()):
        df_ = pd.DataFrame({"metric": list(obj.keys()), "value": list(obj.values())})
        html = df_.to_html(classes="smx-table", border=0, index=False)
        display(HTML(_wrap_html_table(html)))
        return None

    # 5) Two-number tuple -> labelled metric table (e.g., (mse, r2))
    if (
        isinstance(obj, tuple)
        and len(obj) == 2
        and all(isinstance(v, numbers.Number) for v in obj)
    ):
        mse, r2 = obj
        df_ = pd.DataFrame(
            {"metric": ["Mean-squared error", "R²"], "value": [mse, r2]}
        )
        html = df_.to_html(classes="smx-table", border=0, index=False)
        display(HTML(_wrap_html_table(html)))
        return None

    # 6) Fallback: show as preformatted text (safe and predictable).
    # Fix: escape the text so objects whose repr contains '<', '>' or '&'
    # cannot break — or inject markup into — the rendered HTML.
    from html import escape  # local import: 'html' is used as a variable name above
    display(HTML(f"<pre>{escape(str(obj))}</pre>"))
    return None
|
|
@@ -21,10 +21,11 @@ def extract_output_text(resp) -> str:
|
|
|
21
21
|
def set_args(
|
|
22
22
|
model,
|
|
23
23
|
instructions,
|
|
24
|
-
input,
|
|
24
|
+
input,
|
|
25
|
+
previous_id=None,
|
|
25
26
|
store=False,
|
|
26
|
-
reasoning_effort="minimal",
|
|
27
|
-
verbosity="low",
|
|
27
|
+
reasoning_effort="medium", # "minimal", "low", "medium", "high"
|
|
28
|
+
verbosity="medium", # "low", "medium", "high"
|
|
28
29
|
truncation="auto",
|
|
29
30
|
):
|
|
30
31
|
base_params = {
|
|
@@ -35,7 +36,7 @@ def set_args(
|
|
|
35
36
|
"store": store,
|
|
36
37
|
"truncation": truncation,
|
|
37
38
|
}
|
|
38
|
-
if model == "gpt-5-chat-latest":
|
|
39
|
+
if model == "gpt-5.1-chat-latest":
|
|
39
40
|
args = base_params
|
|
40
41
|
else:
|
|
41
42
|
args = {**base_params,
|
syntaxmatrix/profiles.py
CHANGED
|
@@ -25,24 +25,39 @@ def get_profile(purpose: str) -> dict:
|
|
|
25
25
|
_refresh_profiles()
|
|
26
26
|
return _profiles.get(purpose)
|
|
27
27
|
|
|
28
|
+
def get_profiles():
    """Return all configured profiles (thin alias delegating to list_profiles())."""
    return list_profiles()
|
|
28
30
|
|
|
29
31
|
def get_client(profile):
|
|
30
32
|
|
|
31
33
|
provider = profile["provider"].lower()
|
|
32
34
|
api_key = profile["api_key"]
|
|
33
35
|
|
|
34
|
-
|
|
36
|
+
#1 - Google - gemini series
|
|
37
|
+
if provider == "google":
|
|
35
38
|
return genai.Client(api_key=api_key)
|
|
36
|
-
|
|
39
|
+
|
|
40
|
+
#2 OpenAI gpt-5 series
|
|
41
|
+
if provider == "openai":
|
|
37
42
|
return OpenAI(api_key=api_key)
|
|
38
|
-
|
|
43
|
+
|
|
44
|
+
#3 - xAI - grok series
|
|
45
|
+
if provider == "xai":
|
|
39
46
|
return OpenAI(api_key=api_key, base_url="https://api.x.ai/v1")
|
|
40
|
-
|
|
47
|
+
|
|
48
|
+
#4 - DeepSeek chat model
|
|
49
|
+
if provider == "deepseek":
|
|
41
50
|
return OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
|
|
51
|
+
|
|
52
|
+
#5 - Moonshot chat model
|
|
42
53
|
if provider == "moonshot": #5
|
|
43
54
|
return OpenAI(api_key=api_key, base_url="https://api.moonshot.ai/v1")
|
|
55
|
+
|
|
56
|
+
#6 - Alibaba qwen series
|
|
44
57
|
if provider == "alibaba": #6
|
|
45
58
|
return OpenAI(api_key=api_key, base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",)
|
|
59
|
+
|
|
60
|
+
#7 - Anthropic claude series
|
|
46
61
|
if provider == "anthropic": #7
|
|
47
62
|
return anthropic.Anthropic(api_key=api_key)
|
|
48
63
|
|