syntaxmatrix-1.4.6-py3-none-any.whl → syntaxmatrix-2.5.5.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. syntaxmatrix/__init__.py +13 -8
  2. syntaxmatrix/agentic/__init__.py +0 -0
  3. syntaxmatrix/agentic/agent_tools.py +24 -0
  4. syntaxmatrix/agentic/agents.py +810 -0
  5. syntaxmatrix/agentic/code_tools_registry.py +37 -0
  6. syntaxmatrix/agentic/model_templates.py +1790 -0
  7. syntaxmatrix/auth.py +308 -14
  8. syntaxmatrix/commentary.py +328 -0
  9. syntaxmatrix/core.py +993 -375
  10. syntaxmatrix/dataset_preprocessing.py +218 -0
  11. syntaxmatrix/db.py +92 -95
  12. syntaxmatrix/display.py +95 -121
  13. syntaxmatrix/generate_page.py +634 -0
  14. syntaxmatrix/gpt_models_latest.py +46 -0
  15. syntaxmatrix/history_store.py +26 -29
  16. syntaxmatrix/kernel_manager.py +96 -17
  17. syntaxmatrix/llm_store.py +1 -1
  18. syntaxmatrix/plottings.py +6 -0
  19. syntaxmatrix/profiles.py +64 -8
  20. syntaxmatrix/project_root.py +55 -43
  21. syntaxmatrix/routes.py +5072 -1398
  22. syntaxmatrix/session.py +19 -0
  23. syntaxmatrix/settings/logging.py +40 -0
  24. syntaxmatrix/settings/model_map.py +300 -33
  25. syntaxmatrix/settings/prompts.py +273 -62
  26. syntaxmatrix/settings/string_navbar.py +3 -3
  27. syntaxmatrix/static/docs.md +272 -0
  28. syntaxmatrix/static/icons/favicon.png +0 -0
  29. syntaxmatrix/static/icons/hero_bg.jpg +0 -0
  30. syntaxmatrix/templates/dashboard.html +608 -147
  31. syntaxmatrix/templates/docs.html +71 -0
  32. syntaxmatrix/templates/error.html +2 -3
  33. syntaxmatrix/templates/login.html +1 -0
  34. syntaxmatrix/templates/register.html +1 -0
  35. syntaxmatrix/ui_modes.py +14 -0
  36. syntaxmatrix/utils.py +2482 -159
  37. syntaxmatrix/vectorizer.py +16 -12
  38. {syntaxmatrix-1.4.6.dist-info → syntaxmatrix-2.5.5.4.dist-info}/METADATA +20 -17
  39. syntaxmatrix-2.5.5.4.dist-info/RECORD +68 -0
  40. syntaxmatrix/model_templates.py +0 -30
  41. syntaxmatrix/static/icons/favicon.ico +0 -0
  42. syntaxmatrix-1.4.6.dist-info/RECORD +0 -54
  43. {syntaxmatrix-1.4.6.dist-info → syntaxmatrix-2.5.5.4.dist-info}/WHEEL +0 -0
  44. {syntaxmatrix-1.4.6.dist-info → syntaxmatrix-2.5.5.4.dist-info}/licenses/LICENSE.txt +0 -0
  45. {syntaxmatrix-1.4.6.dist-info → syntaxmatrix-2.5.5.4.dist-info}/top_level.txt +0 -0
syntaxmatrix/dataset_preprocessing.py ADDED
@@ -0,0 +1,218 @@
+# syntaxmatrix/dataset_preprocessing.py
+# -----------------------------------------------------------------------------
+# Dataset-agnostic cleaning for analysis with imputation and audit outputs.
+# Writes:
+#   DATA_FOLDER / selected_dataset / cleaned_df.csv
+#   DATA_FOLDER / selected_dataset / missingness.csv  (write currently disabled below)
+# Does NOT mutate the in-memory EDA df. Call ensure_cleaned_df(...) after df load.
+# -----------------------------------------------------------------------------
+
+from __future__ import annotations
+import os
+import re
+import pandas as pd
+import numpy as np
+from typing import Tuple, Dict
+
+__all__ = ["ensure_cleaned_df"]
+
+# Common tokens that should be treated as missing
+_MISSING_TOKENS = {
+    "", "na", "n/a", "n.a.", "nan", "none", "null", "-", "--", "?", "unknown"
+}
+
+_BOOL_TRUE = {"true", "t", "yes", "y", "1", "on"}
+_BOOL_FALSE = {"false", "f", "no", "n", "0", "off"}
+
+# Columns whose names hint at date/time content (case-insensitive)
+_DATE_HINTS = re.compile(r"(date|time|timestamp|_dt)$", re.IGNORECASE)
+
+# -----------------------------------------------------------------------------
+# Helpers
+# -----------------------------------------------------------------------------
+
+def _strip_column_names_only(df: pd.DataFrame) -> pd.DataFrame:
+    """Trim surrounding whitespace in column names (preserve original names)."""
+    df = df.copy()
+    df.rename(columns=lambda c: str(c).strip(), inplace=True)
+    return df
+
+def _standardise_missing_tokens(s: pd.Series) -> pd.Series:
+    """Map common missing tokens to NaN in object-like columns."""
+    if s.dtype != "object":
+        return s
+    mapped = s.astype(str).str.strip()
+    lowered = mapped.str.lower()
+    is_missing = lowered.isin(_MISSING_TOKENS)
+    mapped = mapped.mask(is_missing, np.nan)
+    return mapped
+
+def _coerce_booleans(s: pd.Series) -> pd.Series:
+    if s.dtype != "object":
+        return s
+    cand = s.astype(str).str.strip().str.lower()
+    uniq = set(cand.dropna().unique().tolist())
+    if uniq and uniq.issubset(_BOOL_TRUE | _BOOL_FALSE):
+        return cand.map(lambda v: True if v in _BOOL_TRUE else False if v in _BOOL_FALSE else np.nan)
+    return s
+
+_NUM_RE = re.compile(r"[,\s£$€]")
+
+def _looks_numeric(x: str) -> bool:
+    v = _NUM_RE.sub("", x.strip()).replace("%", "")
+    return bool(re.match(r"^[+-]?(\d+(\.\d*)?|\.\d+)$", v))
+
+def _coerce_numerics(s: pd.Series) -> pd.Series:
+    if s.dtype != "object":
+        return s
+    sample = s.dropna().astype(str).head(1000)
+    if len(sample) == 0:
+        return s
+    ratio = np.mean([_looks_numeric(x) for x in sample])
+    if ratio >= 0.8:
+        # re.Pattern.sub() operates on single strings, not a Series;
+        # strip the noise characters element-wise with .str.replace instead.
+        cleaned = s.astype(str).str.strip().str.replace(_NUM_RE, "", regex=True)
+        # If many values end with %, interpret as percent
+        if (cleaned.str.endswith("%")).mean() > 0.6:
+            # remove % and divide by 100
+            cleaned = cleaned.str.replace("%", "", regex=False)
+            out = pd.to_numeric(cleaned, errors="coerce") / 100.0
+        else:
+            out = pd.to_numeric(cleaned.str.replace("%", "", regex=False), errors="coerce")
+        return out
+    return s
+
+def _parse_datetimes(df: pd.DataFrame, col: str) -> pd.Series:
+    """Parse datetimes robustly; produce tz-naive UTC for consistent .dt."""
+    s = df[col].astype(str)
+    # (infer_datetime_format is deprecated in pandas >= 2.0; inference is the default)
+    dt = pd.to_datetime(s, errors="coerce", utc=True)
+    if dt.isna().mean() > 0.9:
+        # strip trailing ' (PDT)' etc.
+        s2 = s.str.replace(r"\s*\([^)]*\)\s*$", "", regex=True)
+        dt = pd.to_datetime(s2, errors="coerce", utc=True)
+    # Convert to tz-naive UTC if we parsed anything meaningful
+    if dt.notna().sum() >= max(3, int(0.1 * len(df))):
+        try:
+            return dt.dt.tz_convert("UTC").dt.tz_localize(None)
+        except Exception:
+            return dt  # already tz-naive
+    return df[col]  # leave original if parsing failed
+
+def _summarise_missingness(df: pd.DataFrame) -> pd.DataFrame:
+    total = len(df)
+    miss = df.isna().sum()
+    pct = (miss / total * 100.0).round(2)
+    dtype = df.dtypes.astype(str)
+    return pd.DataFrame({"column": df.columns, "missing": miss.values, "missing_%": pct.values, "dtype": dtype.values})
+
+# -----------------------------------------------------------------------------
+# Main cleaner (type coercion + imputation for analysis)
+# -----------------------------------------------------------------------------
+
+def _clean_and_coerce(df: pd.DataFrame) -> pd.DataFrame:
+    df = df.copy()
+    # 0) tidy strings and standardise missing tokens
+    for c in df.columns:
+        s = df[c]
+        if s.dtype == "object":
+            s = s.astype(str).str.strip().str.replace(r"\s+", " ", regex=True)
+            s = _standardise_missing_tokens(s)
+            df[c] = s
+
+    # 1) booleans
+    for c in df.columns:
+        df[c] = _coerce_booleans(df[c])
+
+    # 2) numerics
+    for c in df.columns:
+        df[c] = _coerce_numerics(df[c])
+
+    # 3) datetimes (by name hint + explicit 'saledate')
+    for c in list(df.columns):
+        n = str(c).lower()
+        if _DATE_HINTS.search(n) or n == "saledate":
+            try:
+                df[c] = _parse_datetimes(df, c)
+            except Exception:
+                pass
+
+    # 4) drop exact duplicates
+    df = df.drop_duplicates()
+    return df
+
+def _impute_for_analysis(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, str]]:
+    """
+    Impute missing values:
+      - numeric -> median
+      - categorical/object/bool -> most frequent (fallback 'Unknown')
+    Adds <col>__imputed boolean flags where any fills occurred.
+    Returns the imputed df and a dict of imputation strategies used.
+    """
+    df = df.copy()
+    strategy: Dict[str, str] = {}
+
+    # numeric -> median
+    num_cols = df.select_dtypes(include=["number"]).columns.tolist()
+    for c in num_cols:
+        if df[c].isna().any():
+            med = df[c].median(skipna=True)
+            if pd.isna(med):
+                continue  # cannot impute an all-NaN column
+            df[f"{c}__imputed"] = df[c].isna()
+            df[c] = df[c].fillna(med)
+            strategy[c] = "median"
+
+    # categoricals & booleans (object/category/bool) -> mode
+    cat_cols = [c for c in df.columns
+                if df[c].dtype == "object" or str(df[c].dtype).startswith("category") or df[c].dtype == "bool"]
+    for c in cat_cols:
+        if df[c].isna().any():
+            # mode; if multiple modes, pick the first stable value
+            try:
+                mode_val = df[c].mode(dropna=True)
+                fill = mode_val.iloc[0] if not mode_val.empty else "Unknown"
+            except Exception:
+                fill = "Unknown"
+            df[f"{c}__imputed"] = df[c].isna()
+            df[c] = df[c].fillna(fill)
+            strategy[c] = f"mode('{fill}')"
+
+    return df, strategy
+
+def ensure_cleaned_df(DATA_FOLDER: str, cleaned_folder: str, df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Build (or reuse) an analysis-ready cleaned dataset and persist it to:
+        f"{DATA_FOLDER}/{selected_dataset}/cleaned_df.csv"
+    Can also write a missingness audit (currently commented out):
+        f"{DATA_FOLDER}/{selected_dataset}/missingness.csv"
+    Returns the cleaned frame. Does NOT mutate the provided df.
+    """
+    target_dir = os.path.join(DATA_FOLDER, cleaned_folder)
+    os.makedirs(target_dir, exist_ok=True)
+    target_csv = os.path.join(target_dir, "cleaned_df.csv")
+    # miss_csv = os.path.join(target_dir, "missingness.csv")
+
+    # If a cleaned file already exists, reuse it (the pipeline calls this once per dataset)
+    if os.path.exists(target_csv):
+        try:
+            return pd.read_csv(target_csv, low_memory=False)
+        except Exception:
+            # fall through to rebuild if unreadable
+            pass
+
+    # Pipeline: normalise headers → coerce types → impute → audits → save
+    step0 = _strip_column_names_only(df)
+    step1 = _clean_and_coerce(step0)
+    # audit BEFORE imputation (raw missingness after coercion)
+    # _summarise_missingness(step1).to_csv(miss_csv, index=False)
+    step2, _strategy = _impute_for_analysis(step1)
+
+    # Drop id-like columns (high-uniqueness or name pattern)
+    name_hit = [c for c in step2.columns if re.search(r'\b(id|uuid|vin|serial|record|row_?id)\b', c, re.I)]
+    uniq_hit = [c for c in step2.columns if step2[c].nunique(dropna=True) >= 0.98 * len(step2)]
+    id_like = sorted(set(name_hit) | set(uniq_hit))
+    step2 = step2.drop(columns=id_like, errors='ignore')
+
+    # Persist cleaned for tasks
+    step2.to_csv(target_csv, index=False)
+    return step2
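
The module header says to call `ensure_cleaned_df(...)` once after the raw frame is loaded. A minimal sketch of that call pattern (editor's illustration, not shipped code; the folder layout and dataset name are assumptions):

```python
import pandas as pd
from syntaxmatrix.dataset_preprocessing import ensure_cleaned_df

DATA_FOLDER = "data"             # assumed layout: data/<selected_dataset>/
selected_dataset = "car_sales"   # hypothetical dataset folder name

raw_df = pd.read_csv(f"{DATA_FOLDER}/{selected_dataset}/raw.csv", low_memory=False)

# First call builds data/car_sales/cleaned_df.csv; later calls reuse it.
# raw_df itself is never mutated.
cleaned = ensure_cleaned_df(DATA_FOLDER, selected_dataset, raw_df)
print(cleaned.dtypes)
```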
syntaxmatrix/db.py CHANGED
@@ -1,15 +1,17 @@
-# syntaxmatrix/db.py
-from datetime import datetime
+from __future__ import annotations
 import sqlite3
-import time
 import os
 import json
+from werkzeug.utils import secure_filename
 from syntaxmatrix.project_root import detect_project_root
 
+
 _CLIENT_DIR = detect_project_root()
 DB_PATH = os.path.join(_CLIENT_DIR, "data", "syntaxmatrix.db")
 os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
 
+TEMPLATES_DIR = os.path.join(_CLIENT_DIR, "templates")
+os.makedirs(TEMPLATES_DIR, exist_ok=True)
 
 
 # ***************************************
@@ -25,16 +27,6 @@ def init_db():
     )
     """)
 
-    # # Create table for pdf_chunks for the admin files
-    # conn.execute("""
-    #     CREATE TABLE IF NOT EXISTS pdf_chunks (
-    #         id INTEGER PRIMARY KEY AUTOINCREMENT,
-    #         file_name TEXT,
-    #         chunk_index INTEGER,
-    #         chunk_text TEXT
-    #     )
-    # """)
-
     conn.execute("""
        CREATE TABLE IF NOT EXISTS askai_cells (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -48,110 +40,115 @@ def init_db():
 
     conn.commit()
     conn.close()
-
+
 
 def get_pages():
+    """Return {page_name: html} resolving relative paths under syntaxmatrixdir/templates."""
+    import sqlite3
     conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    cursor.execute("SELECT name, content FROM pages")
-    rows = cursor.fetchall()
+    rows = conn.execute("SELECT name, content FROM pages").fetchall()
     conn.close()
-    return {row[0]: row[1] for row in rows}
 
-def add_page(name, content):
-    conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    cursor.execute("INSERT INTO pages (name, content) VALUES (?, ?)", (name, content))
-    conn.commit()
-    conn.close()
+    pages = {}
+    for name, file_path in rows:
+        # If the DB holds a relative path (e.g. 'templates/about.html'), make it absolute.
+        if file_path and not os.path.isabs(file_path):
+            file_path = os.path.join(_CLIENT_DIR, file_path)
+        try:
+            with open(file_path, "r", encoding="utf-8") as f:
+                pages[name] = f.read()
+        except Exception:
+            pages[name] = f"<p>Missing file for page '{name}'.</p>"
+    return pages
 
-def update_page(old_name, new_name, content):
-    conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    cursor.execute("UPDATE pages SET name = ?, content = ? WHERE name = ?", (new_name, content, old_name))
-    conn.commit()
-    conn.close()
 
-def delete_page(name):
-    conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    cursor.execute("DELETE FROM pages WHERE name = ?", (name,))
-    conn.commit()
-    conn.close()
+def add_page(name, html):
+    """Create templates/<slug>.html and store a relative path in the DB."""
+    filename = secure_filename(name.lower()) + ".html"
+    abs_path = os.path.join(TEMPLATES_DIR, filename)
 
+    with open(abs_path, "w", encoding="utf-8") as f:
+        f.write(html)
 
-def add_pdf_chunk(file_name: str, chunk_index: int, chunk_text: str):
+    rel_path = f"templates/{filename}"
     conn = sqlite3.connect(DB_PATH)
-    conn.execute(
-        "INSERT INTO pdf_chunks (file_name, chunk_index, chunk_text) VALUES (?, ?, ?)",
-        (file_name, chunk_index, chunk_text)
-    )
+    cur = conn.cursor()
+    cur.execute("INSERT INTO pages (name, content) VALUES (?, ?)", (name, rel_path))
+
     conn.commit()
     conn.close()
 
-def get_pdf_chunks(file_name: str = None):
-    conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    if file_name:
-        cursor.execute(
-            "SELECT chunk_index, chunk_text FROM pdf_chunks WHERE file_name = ? ORDER BY chunk_index",
-            (file_name,)
-        )
-    else:
-        cursor.execute(
-            "SELECT file_name, chunk_index, chunk_text FROM pdf_chunks ORDER BY file_name, chunk_index"
-        )
-    rows = cursor.fetchall()
-    conn.close()
-    return rows
 
-def update_pdf_chunk(chunk_id: int, new_chunk_text: str):
+def update_page(old_name, new_name, html):
     """
-    Updates the chunk_text of a PDF chunk record identified by chunk_id.
+    Overwrite the page file; if the title changes, rename the file.
+    Always store a relative path 'templates/<slug>.html' in the DB.
     """
-    conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    cursor.execute("""
-        UPDATE pdf_chunks
-        SET chunk_text = ?
-        WHERE id = ?
-    """, (new_chunk_text, chunk_id))
-    conn.commit()
-    conn.close()
+    import sqlite3, os
+    from werkzeug.utils import secure_filename
 
-def delete_pdf_chunks(file_name):
-    """
-    Delete all chunks associated with the given PDF file name.
-    """
     conn = sqlite3.connect(DB_PATH)
-    conn.execute(
-        "DELETE FROM pdf_chunks WHERE file_name = ?",
-        (file_name,)
-    )
-    conn.commit()
-    conn.close()
+    cur = conn.cursor()
 
-# ***************************************
-# AskAI
-# ***************************************
+    row = cur.execute("SELECT content FROM pages WHERE name = ?", (old_name,)).fetchone()
+    if not row:
+        conn.close()
+        return
 
-def add_askai_cell(session_id, question, output, code):
-    conn = sqlite3.connect(DB_PATH)
-    conn.execute(
-        "INSERT INTO askai_cells (session_id, question, output, code) VALUES (?, ?, ?, ?)",
-        (session_id, question, output, code)
+    # Resolve current path (absolute if DB stored absolute; otherwise under syntaxmatrixdir)
+    current = row[0] or ""
+    if current and not os.path.isabs(current):
+        current_abs = os.path.join(_CLIENT_DIR, current)
+    else:
+        current_abs = current
+
+    # Target filename/path for the new name
+    new_filename = secure_filename(new_name.lower()) + ".html"
+    target_abs = os.path.join(_CLIENT_DIR, "templates", new_filename)
+    os.makedirs(os.path.dirname(target_abs), exist_ok=True)
+
+    # If name changed and the old file exists, rename; otherwise we’ll just write fresh
+    if old_name != new_name and current_abs and os.path.exists(current_abs) and current_abs != target_abs:
+        try:
+            os.replace(current_abs, target_abs)
+        except Exception:
+            # If rename fails (e.g. old file missing), we’ll write the new file below
+            pass
+
+    # Write the HTML (create if missing, overwrite if present)
+    with open(target_abs, "w", encoding="utf-8") as f:
+        f.write(html)
+
+    # Store a relative, OS-agnostic path in the DB
+    rel_path = f"templates/{new_filename}"
+    cur.execute(
+        "UPDATE pages SET name = ?, content = ? WHERE name = ?",
+        (new_name, rel_path, old_name)
     )
     conn.commit()
     conn.close()
 
-def get_askai_cells(session_id, limit=15):
-    conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    cursor.execute(
-        "SELECT question, output, code FROM askai_cells WHERE session_id = ? ORDER BY id DESC LIMIT ?",
-        (session_id, limit)
-    )
-    cells = [{"question": q, "output": o, "code": c} for q, o, c in cursor.fetchall()]
-    conn.close()
-    return cells
+
+def delete_page(name):
+    """
+    Delete the page file (if present) and remove the row from the DB.
+    Works whether 'content' is absolute or relative.
+    """
+    import sqlite3, os
+
+    conn = sqlite3.connect(DB_PATH)
+    cur = conn.cursor()
+
+    row = cur.execute("SELECT content FROM pages WHERE name = ?", (name,)).fetchone()
+    if row:
+        path = row[0] or ""
+        abs_path = path if os.path.isabs(path) else os.path.join(_CLIENT_DIR, path)
+        if os.path.exists(abs_path):
+            try:
+                os.remove(abs_path)
+            except Exception:
+                # Don’t block deletion if the file cannot be removed
+                pass
+
+    cur.execute("DELETE FROM pages WHERE name = ?", (name,))
+    conn.commit()
+    conn.close()
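
Taken together, these changes repurpose `pages.content` from holding raw HTML to holding a relative path under `templates/`, with the HTML itself living on disk. A round-trip sketch of the new API (editor's illustration; it assumes `init_db()` creates the `pages` table, as the unchanged top of that function suggests, and the page name and HTML are made up):

```python
from syntaxmatrix.db import init_db, add_page, get_pages, update_page, delete_page

init_db()  # ensure the backing tables exist

# Writes templates/about.html and stores 'templates/about.html' in pages.content.
add_page("About", "<h1>About</h1><p>Hello.</p>")

# get_pages() resolves the stored path and returns the HTML, not the path.
assert "<h1>About</h1>" in get_pages()["About"]

# Renames the file to templates/team.html and rewrites its contents.
update_page("About", "Team", "<h1>Team</h1>")

# Best-effort removal of templates/team.html, then the DB row.
delete_page("Team")
```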