syntaxmatrix-1.4.6-py3-none-any.whl → syntaxmatrix-2.5.5.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. syntaxmatrix/__init__.py +13 -8
  2. syntaxmatrix/agentic/__init__.py +0 -0
  3. syntaxmatrix/agentic/agent_tools.py +24 -0
  4. syntaxmatrix/agentic/agents.py +810 -0
  5. syntaxmatrix/agentic/code_tools_registry.py +37 -0
  6. syntaxmatrix/agentic/model_templates.py +1790 -0
  7. syntaxmatrix/auth.py +308 -14
  8. syntaxmatrix/commentary.py +328 -0
  9. syntaxmatrix/core.py +993 -375
  10. syntaxmatrix/dataset_preprocessing.py +218 -0
  11. syntaxmatrix/db.py +92 -95
  12. syntaxmatrix/display.py +95 -121
  13. syntaxmatrix/generate_page.py +634 -0
  14. syntaxmatrix/gpt_models_latest.py +46 -0
  15. syntaxmatrix/history_store.py +26 -29
  16. syntaxmatrix/kernel_manager.py +96 -17
  17. syntaxmatrix/llm_store.py +1 -1
  18. syntaxmatrix/plottings.py +6 -0
  19. syntaxmatrix/profiles.py +64 -8
  20. syntaxmatrix/project_root.py +55 -43
  21. syntaxmatrix/routes.py +5072 -1398
  22. syntaxmatrix/session.py +19 -0
  23. syntaxmatrix/settings/logging.py +40 -0
  24. syntaxmatrix/settings/model_map.py +300 -33
  25. syntaxmatrix/settings/prompts.py +273 -62
  26. syntaxmatrix/settings/string_navbar.py +3 -3
  27. syntaxmatrix/static/docs.md +272 -0
  28. syntaxmatrix/static/icons/favicon.png +0 -0
  29. syntaxmatrix/static/icons/hero_bg.jpg +0 -0
  30. syntaxmatrix/templates/dashboard.html +608 -147
  31. syntaxmatrix/templates/docs.html +71 -0
  32. syntaxmatrix/templates/error.html +2 -3
  33. syntaxmatrix/templates/login.html +1 -0
  34. syntaxmatrix/templates/register.html +1 -0
  35. syntaxmatrix/ui_modes.py +14 -0
  36. syntaxmatrix/utils.py +2482 -159
  37. syntaxmatrix/vectorizer.py +16 -12
  38. {syntaxmatrix-1.4.6.dist-info → syntaxmatrix-2.5.5.4.dist-info}/METADATA +20 -17
  39. syntaxmatrix-2.5.5.4.dist-info/RECORD +68 -0
  40. syntaxmatrix/model_templates.py +0 -30
  41. syntaxmatrix/static/icons/favicon.ico +0 -0
  42. syntaxmatrix-1.4.6.dist-info/RECORD +0 -54
  43. {syntaxmatrix-1.4.6.dist-info → syntaxmatrix-2.5.5.4.dist-info}/WHEEL +0 -0
  44. {syntaxmatrix-1.4.6.dist-info → syntaxmatrix-2.5.5.4.dist-info}/licenses/LICENSE.txt +0 -0
  45. {syntaxmatrix-1.4.6.dist-info → syntaxmatrix-2.5.5.4.dist-info}/top_level.txt +0 -0
syntaxmatrix/dataset_preprocessing.py ADDED
@@ -0,0 +1,218 @@
+# syntaxmatrix/dataset_preprocessing.py
+# -----------------------------------------------------------------------------
+# Dataset-agnostic cleaning for analysis with imputation and audit outputs.
+# Writes:
+#   DATA_FOLDER / selected_dataset / cleaned_df.csv
+#   DATA_FOLDER / selected_dataset / missingness.csv  (write currently disabled below)
+# Does NOT mutate the in-memory EDA df. Call ensure_cleaned_df(...) after df load.
+# -----------------------------------------------------------------------------
+
+from __future__ import annotations
+import os
+import re
+import pandas as pd
+import numpy as np
+from typing import Tuple, Dict
+
+__all__ = ["ensure_cleaned_df"]
+
+# Common tokens that should be treated as missing
+_MISSING_TOKENS = {
+    "", "na", "n/a", "n.a.", "nan", "none", "null", "-", "--", "?", "unknown"
+}
+
+_BOOL_TRUE = {"true", "t", "yes", "y", "1", "on"}
+_BOOL_FALSE = {"false", "f", "no", "n", "0", "off"}
+
+# Columns whose names hint at date/time content (case-insensitive)
+_DATE_HINTS = re.compile(r"(date|time|timestamp|_dt)$", re.IGNORECASE)
+
+# -----------------------------------------------------------------------------
+# Helpers
+# -----------------------------------------------------------------------------
+
+def _strip_column_names_only(df: pd.DataFrame) -> pd.DataFrame:
+    """Trim surrounding whitespace in column names (preserve original names)."""
+    df = df.copy()
+    df.rename(columns=lambda c: str(c).strip(), inplace=True)
+    return df
+
+def _standardise_missing_tokens(s: pd.Series) -> pd.Series:
+    """Map common missing tokens to NaN in object-like columns."""
+    if s.dtype != "object":
+        return s
+    mapped = s.astype(str).str.strip()
+    lowered = mapped.str.lower()
+    is_missing = lowered.isin(_MISSING_TOKENS)
+    mapped = mapped.mask(is_missing, np.nan)
+    return mapped
+
+def _coerce_booleans(s: pd.Series) -> pd.Series:
+    if s.dtype != "object":
+        return s
+    cand = s.astype(str).str.strip().str.lower()
+    uniq = set(cand.dropna().unique().tolist())
+    if uniq and uniq.issubset(_BOOL_TRUE | _BOOL_FALSE):
+        return cand.map(lambda v: True if v in _BOOL_TRUE else False if v in _BOOL_FALSE else np.nan)
+    return s
+
+_NUM_RE = re.compile(r"[,\s£$€]")
+
+def _looks_numeric(x: str) -> bool:
+    v = _NUM_RE.sub("", x.strip()).replace("%", "")
+    return bool(re.match(r"^[+-]?(\d+(\.\d*)?|\.\d+)$", v))
+
+def _coerce_numerics(s: pd.Series) -> pd.Series:
+    if s.dtype != "object":
+        return s
+    sample = s.dropna().astype(str).head(1000)
+    if len(sample) == 0:
+        return s
+    ratio = np.mean([_looks_numeric(x) for x in sample])
+    if ratio >= 0.8:
+        # re.Pattern.sub() operates on single strings, not a Series;
+        # strip the noise characters element-wise with .str.replace instead.
+        cleaned = s.astype(str).str.strip().str.replace(_NUM_RE, "", regex=True)
+        # If many values end with %, interpret as percent
+        if (cleaned.str.endswith("%")).mean() > 0.6:
+            # remove % and divide by 100
+            cleaned = cleaned.str.replace("%", "", regex=False)
+            out = pd.to_numeric(cleaned, errors="coerce") / 100.0
+        else:
+            out = pd.to_numeric(cleaned.str.replace("%", "", regex=False), errors="coerce")
+        return out
+    return s
+
+def _parse_datetimes(df: pd.DataFrame, col: str) -> pd.Series:
+    """Parse datetimes robustly; produce tz-naive UTC for consistent .dt."""
+    s = df[col].astype(str)
+    # (infer_datetime_format is deprecated in pandas >= 2.0; inference is the default)
+    dt = pd.to_datetime(s, errors="coerce", utc=True)
+    if dt.isna().mean() > 0.9:
+        # strip trailing ' (PDT)' etc.
+        s2 = s.str.replace(r"\s*\([^)]*\)\s*$", "", regex=True)
+        dt = pd.to_datetime(s2, errors="coerce", utc=True)
+    # Convert to tz-naive UTC if we parsed anything meaningful
+    if dt.notna().sum() >= max(3, int(0.1 * len(df))):
+        try:
+            return dt.dt.tz_convert("UTC").dt.tz_localize(None)
+        except Exception:
+            return dt  # already tz-naive
+    return df[col]  # leave original if parsing failed
+
+def _summarise_missingness(df: pd.DataFrame) -> pd.DataFrame:
+    total = len(df)
+    miss = df.isna().sum()
+    pct = (miss / total * 100.0).round(2)
+    dtype = df.dtypes.astype(str)
+    return pd.DataFrame({"column": df.columns, "missing": miss.values, "missing_%": pct.values, "dtype": dtype.values})
+
+# -----------------------------------------------------------------------------
+# Main cleaner (type coercion + imputation for analysis)
+# -----------------------------------------------------------------------------
+
+def _clean_and_coerce(df: pd.DataFrame) -> pd.DataFrame:
+    df = df.copy()
+    # 0) tidy strings and standardise missing tokens
+    for c in df.columns:
+        s = df[c]
+        if s.dtype == "object":
+            s = s.astype(str).str.strip().str.replace(r"\s+", " ", regex=True)
+            s = _standardise_missing_tokens(s)
+            df[c] = s
+
+    # 1) booleans
+    for c in df.columns:
+        df[c] = _coerce_booleans(df[c])
+
+    # 2) numerics
+    for c in df.columns:
+        df[c] = _coerce_numerics(df[c])
+
+    # 3) datetimes (by name hint + explicit 'saledate')
+    for c in list(df.columns):
+        n = str(c).lower()
+        if _DATE_HINTS.search(n) or n == "saledate":
+            try:
+                df[c] = _parse_datetimes(df, c)
+            except Exception:
+                pass
+
+    # 4) drop exact duplicates
+    df = df.drop_duplicates()
+    return df
+
+def _impute_for_analysis(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, str]]:
+    """
+    Impute missing values:
+      - numeric -> median
+      - categorical/object/bool -> most frequent (fallback 'Unknown')
+    Adds <col>__imputed boolean flags where any fills occurred.
+    Returns the imputed df and a dict of imputation strategies used.
+    """
+    df = df.copy()
+    strategy: Dict[str, str] = {}
+
+    # numeric -> median
+    num_cols = df.select_dtypes(include=["number"]).columns.tolist()
+    for c in num_cols:
+        if df[c].isna().any():
+            med = df[c].median(skipna=True)
+            if pd.isna(med):
+                continue  # cannot impute an all-NaN column
+            df[f"{c}__imputed"] = df[c].isna()
+            df[c] = df[c].fillna(med)
+            strategy[c] = "median"
+
+    # categoricals & booleans (object/category/bool) -> mode
+    cat_cols = [c for c in df.columns
+                if df[c].dtype == "object" or str(df[c].dtype).startswith("category") or df[c].dtype == "bool"]
+    for c in cat_cols:
+        if df[c].isna().any():
+            # mode; if multiple modes, pick the first stable value
+            try:
+                mode_val = df[c].mode(dropna=True)
+                fill = mode_val.iloc[0] if not mode_val.empty else "Unknown"
+            except Exception:
+                fill = "Unknown"
+            df[f"{c}__imputed"] = df[c].isna()
+            df[c] = df[c].fillna(fill)
+            strategy[c] = f"mode('{fill}')"
+
+    return df, strategy
+
+def ensure_cleaned_df(DATA_FOLDER: str, cleaned_folder: str, df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Build (or reuse) an analysis-ready cleaned dataset and persist it to:
+        f"{DATA_FOLDER}/{selected_dataset}/cleaned_df.csv"
+    Can also write a missingness audit (currently commented out):
+        f"{DATA_FOLDER}/{selected_dataset}/missingness.csv"
+    Returns the cleaned frame. Does NOT mutate the provided df.
+    """
+    target_dir = os.path.join(DATA_FOLDER, cleaned_folder)
+    os.makedirs(target_dir, exist_ok=True)
+    target_csv = os.path.join(target_dir, "cleaned_df.csv")
+    # miss_csv = os.path.join(target_dir, "missingness.csv")
+
+    # If a cleaned file already exists, reuse it (the pipeline calls this once per dataset)
+    if os.path.exists(target_csv):
+        try:
+            return pd.read_csv(target_csv, low_memory=False)
+        except Exception:
+            # fall through to rebuild if unreadable
+            pass
+
+    # Pipeline: normalise headers → coerce types → impute → audits → save
+    step0 = _strip_column_names_only(df)
+    step1 = _clean_and_coerce(step0)
+    # audit BEFORE imputation (raw missingness after coercion)
+    # _summarise_missingness(step1).to_csv(miss_csv, index=False)
+    step2, _strategy = _impute_for_analysis(step1)
+
+    # Drop id-like columns (high-uniqueness or name pattern)
+    name_hit = [c for c in step2.columns if re.search(r'\b(id|uuid|vin|serial|record|row_?id)\b', c, re.I)]
+    uniq_hit = [c for c in step2.columns if step2[c].nunique(dropna=True) >= 0.98 * len(step2)]
+    id_like = sorted(set(name_hit) | set(uniq_hit))
+    step2 = step2.drop(columns=id_like, errors='ignore')
+
+    # Persist cleaned for tasks
+    step2.to_csv(target_csv, index=False)
+    return step2
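
The module header says to call `ensure_cleaned_df(...)` once after the raw frame is loaded. A minimal sketch of that call pattern (editor's illustration, not shipped code; the folder layout and dataset name are assumptions):

```python
import pandas as pd
from syntaxmatrix.dataset_preprocessing import ensure_cleaned_df

DATA_FOLDER = "data"             # assumed layout: data/<selected_dataset>/
selected_dataset = "car_sales"   # hypothetical dataset folder name

raw_df = pd.read_csv(f"{DATA_FOLDER}/{selected_dataset}/raw.csv", low_memory=False)

# First call builds data/car_sales/cleaned_df.csv; later calls reuse it.
# raw_df itself is never mutated.
cleaned = ensure_cleaned_df(DATA_FOLDER, selected_dataset, raw_df)
print(cleaned.dtypes)
```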
syntaxmatrix/db.py CHANGED
@@ -1,15 +1,17 @@
-# syntaxmatrix/db.py
-from datetime import datetime
+from __future__ import annotations
 import sqlite3
-import time
 import os
 import json
+from werkzeug.utils import secure_filename
 from syntaxmatrix.project_root import detect_project_root
 
+
 _CLIENT_DIR = detect_project_root()
 DB_PATH = os.path.join(_CLIENT_DIR, "data", "syntaxmatrix.db")
 os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
 
+TEMPLATES_DIR = os.path.join(_CLIENT_DIR, "templates")
+os.makedirs(TEMPLATES_DIR, exist_ok=True)
 
 
 # ***************************************
@@ -25,16 +27,6 @@ def init_db():
     )
     """)
 
-    # # Create table for pdf_chunks for the admin files
-    # conn.execute("""
-    #     CREATE TABLE IF NOT EXISTS pdf_chunks (
-    #         id INTEGER PRIMARY KEY AUTOINCREMENT,
-    #         file_name TEXT,
-    #         chunk_index INTEGER,
-    #         chunk_text TEXT
-    #     )
-    # """)
-
     conn.execute("""
        CREATE TABLE IF NOT EXISTS askai_cells (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -48,110 +40,115 @@ def init_db():
 
     conn.commit()
     conn.close()
-
+
 
 def get_pages():
+    """Return {page_name: html} resolving relative paths under syntaxmatrixdir/templates."""
+    import sqlite3
     conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    cursor.execute("SELECT name, content FROM pages")
-    rows = cursor.fetchall()
+    rows = conn.execute("SELECT name, content FROM pages").fetchall()
     conn.close()
-    return {row[0]: row[1] for row in rows}
 
-def add_page(name, content):
-    conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    cursor.execute("INSERT INTO pages (name, content) VALUES (?, ?)", (name, content))
-    conn.commit()
-    conn.close()
+    pages = {}
+    for name, file_path in rows:
+        # If the DB holds a relative path (e.g. 'templates/about.html'), make it absolute.
+        if file_path and not os.path.isabs(file_path):
+            file_path = os.path.join(_CLIENT_DIR, file_path)
+        try:
+            with open(file_path, "r", encoding="utf-8") as f:
+                pages[name] = f.read()
+        except Exception:
+            pages[name] = f"<p>Missing file for page '{name}'.</p>"
+    return pages
 
-def update_page(old_name, new_name, content):
-    conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    cursor.execute("UPDATE pages SET name = ?, content = ? WHERE name = ?", (new_name, content, old_name))
-    conn.commit()
-    conn.close()
 
-def delete_page(name):
-    conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    cursor.execute("DELETE FROM pages WHERE name = ?", (name,))
-    conn.commit()
-    conn.close()
+def add_page(name, html):
+    """Create templates/<slug>.html and store a relative path in the DB."""
+    filename = secure_filename(name.lower()) + ".html"
+    abs_path = os.path.join(TEMPLATES_DIR, filename)
 
+    with open(abs_path, "w", encoding="utf-8") as f:
+        f.write(html)
 
-def add_pdf_chunk(file_name: str, chunk_index: int, chunk_text: str):
+    rel_path = f"templates/{filename}"
     conn = sqlite3.connect(DB_PATH)
-    conn.execute(
-        "INSERT INTO pdf_chunks (file_name, chunk_index, chunk_text) VALUES (?, ?, ?)",
-        (file_name, chunk_index, chunk_text)
-    )
+    cur = conn.cursor()
+    cur.execute("INSERT INTO pages (name, content) VALUES (?, ?)", (name, rel_path))
+
     conn.commit()
     conn.close()
 
-def get_pdf_chunks(file_name: str = None):
-    conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    if file_name:
-        cursor.execute(
-            "SELECT chunk_index, chunk_text FROM pdf_chunks WHERE file_name = ? ORDER BY chunk_index",
-            (file_name,)
-        )
-    else:
-        cursor.execute(
-            "SELECT file_name, chunk_index, chunk_text FROM pdf_chunks ORDER BY file_name, chunk_index"
-        )
-    rows = cursor.fetchall()
-    conn.close()
-    return rows
 
-def update_pdf_chunk(chunk_id: int, new_chunk_text: str):
+def update_page(old_name, new_name, html):
     """
-    Updates the chunk_text of a PDF chunk record identified by chunk_id.
+    Overwrite the page file; if the title changes, rename the file.
+    Always store a relative path 'templates/<slug>.html' in the DB.
     """
-    conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    cursor.execute("""
-        UPDATE pdf_chunks
-        SET chunk_text = ?
-        WHERE id = ?
-    """, (new_chunk_text, chunk_id))
-    conn.commit()
-    conn.close()
+    import sqlite3, os
+    from werkzeug.utils import secure_filename
 
-def delete_pdf_chunks(file_name):
-    """
-    Delete all chunks associated with the given PDF file name.
-    """
     conn = sqlite3.connect(DB_PATH)
-    conn.execute(
-        "DELETE FROM pdf_chunks WHERE file_name = ?",
-        (file_name,)
-    )
-    conn.commit()
-    conn.close()
+    cur = conn.cursor()
 
-# ***************************************
-# AskAI
-# ***************************************
+    row = cur.execute("SELECT content FROM pages WHERE name = ?", (old_name,)).fetchone()
+    if not row:
+        conn.close()
+        return
 
-def add_askai_cell(session_id, question, output, code):
-    conn = sqlite3.connect(DB_PATH)
-    conn.execute(
-        "INSERT INTO askai_cells (session_id, question, output, code) VALUES (?, ?, ?, ?)",
-        (session_id, question, output, code)
+    # Resolve current path (absolute if DB stored absolute; otherwise under syntaxmatrixdir)
+    current = row[0] or ""
+    if current and not os.path.isabs(current):
+        current_abs = os.path.join(_CLIENT_DIR, current)
+    else:
+        current_abs = current
+
+    # Target filename/path for the new name
+    new_filename = secure_filename(new_name.lower()) + ".html"
+    target_abs = os.path.join(_CLIENT_DIR, "templates", new_filename)
+    os.makedirs(os.path.dirname(target_abs), exist_ok=True)
+
+    # If name changed and the old file exists, rename; otherwise we’ll just write fresh
+    if old_name != new_name and current_abs and os.path.exists(current_abs) and current_abs != target_abs:
+        try:
+            os.replace(current_abs, target_abs)
+        except Exception:
+            # If rename fails (e.g. old file missing), we’ll write the new file below
+            pass
+
+    # Write the HTML (create if missing, overwrite if present)
+    with open(target_abs, "w", encoding="utf-8") as f:
+        f.write(html)
+
+    # Store a relative, OS-agnostic path in the DB
+    rel_path = f"templates/{new_filename}"
+    cur.execute(
+        "UPDATE pages SET name = ?, content = ? WHERE name = ?",
+        (new_name, rel_path, old_name)
     )
     conn.commit()
     conn.close()
 
-def get_askai_cells(session_id, limit=15):
-    conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    cursor.execute(
-        "SELECT question, output, code FROM askai_cells WHERE session_id = ? ORDER BY id DESC LIMIT ?",
-        (session_id, limit)
-    )
-    cells = [{"question": q, "output": o, "code": c} for q, o, c in cursor.fetchall()]
-    conn.close()
-    return cells
+
+def delete_page(name):
+    """
+    Delete the page file (if present) and remove the row from the DB.
+    Works whether 'content' is absolute or relative.
+    """
+    import sqlite3, os
+
+    conn = sqlite3.connect(DB_PATH)
+    cur = conn.cursor()
+
+    row = cur.execute("SELECT content FROM pages WHERE name = ?", (name,)).fetchone()
+    if row:
+        path = row[0] or ""
+        abs_path = path if os.path.isabs(path) else os.path.join(_CLIENT_DIR, path)
+        if os.path.exists(abs_path):
+            try:
+                os.remove(abs_path)
+            except Exception:
+                # Don’t block deletion if the file cannot be removed
+                pass
+
+    cur.execute("DELETE FROM pages WHERE name = ?", (name,))
+    conn.commit()
+    conn.close()
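
Taken together, these changes repurpose `pages.content` from holding raw HTML to holding a relative path under `templates/`, with the HTML itself living on disk. A round-trip sketch of the new API (editor's illustration; it assumes `init_db()` creates the `pages` table, as the unchanged top of that function suggests, and the page name and HTML are made up):

```python
from syntaxmatrix.db import init_db, add_page, get_pages, update_page, delete_page

init_db()  # ensure the backing tables exist

# Writes templates/about.html and stores 'templates/about.html' in pages.content.
add_page("About", "<h1>About</h1><p>Hello.</p>")

# get_pages() resolves the stored path and returns the HTML, not the path.
assert "<h1>About</h1>" in get_pages()["About"]

# Renames the file to templates/team.html and rewrites its contents.
update_page("About", "Team", "<h1>Team</h1>")

# Best-effort removal of templates/team.html, then the DB row.
delete_page("Team")
```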