syntaxmatrix 2.3.5__py3-none-any.whl → 2.5.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,218 @@
1
+ # syntaxmatrix/dataset_preprocessing.py
2
+ # -----------------------------------------------------------------------------
3
+ # Dataset-agnostic cleaning for analysis with imputation and audit outputs.
4
+ # Writes:
5
+ # DATA_FOLDER / selected_dataset / cleaned_df.csv
6
+ # DATA_FOLDER / selected_dataset / missingness.csv
7
+ # Does NOT mutate the in-memory EDA df. Call ensure_cleaned_df(...) after df load.
8
+ # -----------------------------------------------------------------------------
9
+
10
+ from __future__ import annotations
11
+ import os
12
+ import re
13
+ import pandas as pd
14
+ import numpy as np
15
+ from typing import Tuple, Dict
16
+
17
+ __all__ = ["ensure_cleaned_df"]
18
+
19
# Common tokens that should be treated as missing
# (compared against lower-cased, whitespace-stripped cell values).
_MISSING_TOKENS = {
    "", "na", "n/a", "n.a.", "nan", "none", "null", "-", "--", "?", "unknown"
}

# Lower-cased string forms recognised as booleans by _coerce_booleans().
_BOOL_TRUE = {"true", "t", "yes", "y", "1", "on"}
_BOOL_FALSE = {"false", "f", "no", "n", "0", "off"}

# Columns whose names hint at date/time content (case-insensitive).
# Note the trailing $: the hint must appear at the END of the column name.
_DATE_HINTS = re.compile(r"(date|time|timestamp|_dt)$", re.IGNORECASE)
29
+
30
+ # -----------------------------------------------------------------------------
31
+ # Helpers
32
+ # -----------------------------------------------------------------------------
33
+
34
+ def _strip_column_names_only(df: pd.DataFrame) -> pd.DataFrame:
35
+ """Trim surrounding whitespace in column names (preserve original names)."""
36
+ df = df.copy()
37
+ df.rename(columns=lambda c: str(c).strip(), inplace=True)
38
+ return df
39
+
40
def _standardise_missing_tokens(s: pd.Series) -> pd.Series:
    """Replace common placeholder tokens (e.g. 'na', 'null', '?') with NaN.

    Only object-dtype columns are processed; all other dtypes pass through
    unchanged. Values are stringified and whitespace-trimmed as a side effect
    of the normalisation, matching the rest of the cleaning pipeline.
    """
    if s.dtype != "object":
        return s
    trimmed = s.astype(str).str.strip()
    missing_mask = trimmed.str.lower().isin(_MISSING_TOKENS)
    return trimmed.where(~missing_mask, np.nan)
49
+
50
def _coerce_booleans(s: pd.Series) -> pd.Series:
    """Convert an object column to booleans when every value is a boolean token.

    NOTE(review): astype(str) turns NaN into the literal string 'nan', which is
    not a boolean token — so columns containing missing values fail the subset
    check and are returned unchanged. Confirm this is the intended behaviour.
    """
    if s.dtype != "object":
        return s
    normalised = s.astype(str).str.strip().str.lower()
    observed = set(normalised.dropna().unique())
    if not observed or not observed <= (_BOOL_TRUE | _BOOL_FALSE):
        return s

    def _to_bool(token):
        if token in _BOOL_TRUE:
            return True
        if token in _BOOL_FALSE:
            return False
        return np.nan

    return normalised.map(_to_bool)
58
+
59
+ _NUM_RE = re.compile(r"[,\s£$€]")
60
+
61
+ def _looks_numeric(x: str) -> bool:
62
+ v = _NUM_RE.sub("", x.strip()).replace("%", "")
63
+ return bool(re.match(r"^[+-]?(\d+(\.\d*)?|\.\d+)$", v))
64
+
65
+ def _coerce_numerics(s: pd.Series) -> pd.Series:
66
+ if s.dtype != "object":
67
+ return s
68
+ sample = s.dropna().astype(str).head(1000)
69
+ if len(sample) == 0:
70
+ return s
71
+ ratio = np.mean([_looks_numeric(x) for x in sample])
72
+ if ratio >= 0.8:
73
+ cleaned = _NUM_RE.sub("", s.astype(str).str.strip())
74
+ # If many values end with %, interpret as percent
75
+ if (cleaned.str.endswith("%")).mean() > 0.6:
76
+ # remove % and divide by 100
77
+ cleaned = cleaned.str.replace("%", "", regex=False)
78
+ out = pd.to_numeric(cleaned, errors="coerce") / 100.0
79
+ else:
80
+ out = pd.to_numeric(cleaned.str.replace("%", "", regex=False), errors="coerce")
81
+ return out
82
+ return s
83
+
84
+ def _parse_datetimes(df: pd.DataFrame, col: str) -> pd.Series:
85
+ """Parse datetimes robustly; produce tz-naive UTC for consistent .dt."""
86
+ s = df[col].astype(str)
87
+ dt = pd.to_datetime(s, errors="coerce", infer_datetime_format=True, utc=True)
88
+ if dt.isna().mean() > 0.9:
89
+ # strip trailing ' (PDT)' etc.
90
+ s2 = s.str.replace(r"\s*\([^)]*\)\s*$", "", regex=True)
91
+ dt = pd.to_datetime(s2, errors="coerce", infer_datetime_format=True, utc=True)
92
+ # Convert to tz-naive UTC if we parsed anything meaningful
93
+ if dt.notna().sum() >= max(3, int(0.1 * len(df))):
94
+ try:
95
+ return dt.dt.tz_convert("UTC").dt.tz_localize(None)
96
+ except Exception:
97
+ return dt # already tz-naive
98
+ return df[col] # leave original if parsing failed
99
+
100
+ def _summarise_missingness(df: pd.DataFrame) -> pd.DataFrame:
101
+ total = len(df)
102
+ miss = df.isna().sum()
103
+ pct = (miss / total * 100.0).round(2)
104
+ dtype = df.dtypes.astype(str)
105
+ return pd.DataFrame({"column": df.columns, "missing": miss.values, "missing_%": pct.values, "dtype": dtype.values})
106
+
107
+ # -----------------------------------------------------------------------------
108
+ # Main cleaner (type coercion + imputation for analysis)
109
+ # -----------------------------------------------------------------------------
110
+
111
def _clean_and_coerce(df: pd.DataFrame) -> pd.DataFrame:
    """Run the type-coercion pipeline on a copy of *df*.

    Steps: whitespace/missing-token normalisation -> boolean coercion ->
    numeric coercion -> datetime parsing for name-hinted columns -> exact
    duplicate-row removal. The input frame is never mutated.
    """
    out = df.copy()

    # 0) Tidy object columns: collapse internal whitespace, map missing tokens to NaN.
    for col in out.columns:
        series = out[col]
        if series.dtype == "object":
            series = series.astype(str).str.strip().str.replace(r"\s+", " ", regex=True)
            series = _standardise_missing_tokens(series)
        out[col] = series

    # 1) Booleans.
    for col in out.columns:
        out[col] = _coerce_booleans(out[col])

    # 2) Numerics.
    for col in out.columns:
        out[col] = _coerce_numerics(out[col])

    # 3) Datetimes for name-hinted columns (plus the explicit 'saledate').
    for col in list(out.columns):
        lowered = str(col).lower()
        if _DATE_HINTS.search(lowered) or lowered == "saledate":
            try:
                out[col] = _parse_datetimes(out, col)
            except Exception:
                # Best effort: keep the original column when parsing blows up.
                pass

    # 4) Drop exact duplicate rows.
    return out.drop_duplicates()
141
+
142
+ def _impute_for_analysis(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, str]]:
143
+ """
144
+ Impute missing values:
145
+ - numeric -> median
146
+ - categorical/object/bool -> most frequent (fallback 'Unknown')
147
+ Adds <col>__imputed boolean flags where any fills occurred.
148
+ Returns cleaned df and a dict of imputation strategies used.
149
+ """
150
+ df = df.copy()
151
+ strategy: Dict[str, str] = {}
152
+
153
+ # numeric
154
+ num_cols = df.select_dtypes(include=["number"]).columns.tolist()
155
+ for c in num_cols:
156
+ if df[c].isna().any():
157
+ med = df[c].median(skipna=True)
158
+ if pd.isna(med):
159
+ continue # cannot impute an all-NaN column
160
+ df[f"{c}__imputed"] = df[c].isna()
161
+ df[c] = df[c].fillna(med)
162
+ strategy[c] = "median"
163
+
164
+ # categoricals & booleans (object/category/bool)
165
+ cat_cols = [c for c in df.columns
166
+ if df[c].dtype == "object" or str(df[c].dtype).startswith("category") or df[c].dtype == "bool"]
167
+ for c in cat_cols:
168
+ if df[c].isna().any():
169
+ # mode; if multiple modes, pick the first stable value
170
+ try:
171
+ mode_val = df[c].mode(dropna=True)
172
+ fill = mode_val.iloc[0] if not mode_val.empty else "Unknown"
173
+ except Exception:
174
+ fill = "Unknown"
175
+ df[f"{c}__imputed"] = df[c].isna()
176
+ df[c] = df[c].fillna(fill)
177
+ strategy[c] = f"mode('{fill}')"
178
+
179
+ return df, strategy
180
+
181
def ensure_cleaned_df(DATA_FOLDER: str, cleaned_folder: str, df: pd.DataFrame) -> pd.DataFrame:
    """
    Build (or reuse) an analysis-ready cleaned dataset and persist to:
        f"{DATA_FOLDER}/{cleaned_folder}/cleaned_df.csv"
    Also writes a missingness audit (taken after coercion, before imputation):
        f"{DATA_FOLDER}/{cleaned_folder}/missingness.csv"
    Returns the cleaned frame. Does NOT mutate the provided df.
    """
    target_dir = os.path.join(DATA_FOLDER, cleaned_folder)
    os.makedirs(target_dir, exist_ok=True)
    target_csv = os.path.join(target_dir, "cleaned_df.csv")
    miss_csv = os.path.join(target_dir, "missingness.csv")

    # If a cleaned file already exists, reuse it (the pipeline calls this once per dataset).
    if os.path.exists(target_csv):
        try:
            return pd.read_csv(target_csv, low_memory=False)
        except Exception:
            pass  # unreadable cache: fall through and rebuild

    # Pipeline: normalise headers -> coerce types -> audit -> impute -> save.
    step0 = _strip_column_names_only(df)
    step1 = _clean_and_coerce(step0)
    # FIX: the missingness audit promised by the module header (and this
    # docstring) was commented out; write it BEFORE imputation so the audit
    # reflects the raw post-coercion gaps.
    _summarise_missingness(step1).to_csv(miss_csv, index=False)
    step2, _strategy = _impute_for_analysis(step1)

    # Drop id-like columns (name pattern or near-total uniqueness).
    name_hit = [c for c in step2.columns
                if re.search(r'\b(id|uuid|vin|serial|record|row_?id)\b', c, re.I)]
    uniq_hit = [c for c in step2.columns
                if step2[c].nunique(dropna=True) >= 0.98 * len(step2)]
    id_like = sorted(set(name_hit) | set(uniq_hit))
    step2 = step2.drop(columns=id_like, errors='ignore')

    # Persist cleaned frame for downstream tasks.
    step2.to_csv(target_csv, index=False)
    return step2
syntaxmatrix/display.py CHANGED
@@ -1,54 +1,106 @@
1
- # -----------------------------------------------------------------
2
- # Paste *inside* syntaxmatrix/display.py – only the show() body
3
- # -----------------------------------------------------------------
4
- def show(obj):
1
+ """
2
+ syntaxmatrix.display
3
+ --------------------
4
+ Single responsibility: render arbitrary Python objects in the SMX UI.
5
+
6
+ - Matplotlib figures: displayed directly.
7
+ - Pandas Styler (with .set_caption): rendered to HTML so captions always show.
8
+ - Pandas DataFrame/Series: rendered to HTML (no caption path).
9
+ - Dict of scalars: rendered as a small table.
10
+ - Tuple of two numbers (e.g., mse, r2): rendered as a labelled 2-row table.
11
+ - Everything else: shown as <pre> for safe inspection.
12
+ """
13
+
14
+ from typing import Any
15
+ import numbers
16
+
17
+ import pandas as pd
18
+ import matplotlib.figure as mpfig
19
+ from IPython.display import display, HTML
20
+
21
+ try:
22
+ # Optional: if pandas Styler exists, we can keep captions reliably
23
+ from pandas.io.formats.style import Styler as _Styler # type: ignore
24
+ except Exception: # pragma: no cover
25
+ _Styler = None # type: ignore
26
+
27
+
28
+ __all__ = ["show"]
29
+
30
+
31
+ # ---- internal helpers -------------------------------------------------------
32
+
33
+
34
+ def _wrap_html_table(html: str) -> str:
35
+ """Apply consistent UI styling and horizontal scrolling."""
36
+ return (
37
+ "<style>"
38
+ "caption{caption-side: top; font-weight:600; margin:0 0 6px 0;}"
39
+ "table{border-collapse:collapse;font-size:0.9em;white-space:nowrap;}"
40
+ "th{background:#f0f2f5;text-align:left;padding:6px 8px;border:1px solid gray;}"
41
+ "td{border:1px solid #ddd;padding:6px 8px;}"
42
+ "tbody tr:nth-child(even){background-color:#f9f9f9;}"
43
+ "</style>"
44
+ "<div style='overflow-x:auto;max-width:100%;margin-bottom:1rem;'>"
45
+ + html +
46
+ "</div>"
47
+ )
48
+
49
+
50
+ # ---- public API -------------------------------------------------------------
51
+
52
+
53
def show(obj: Any) -> None:
    """
    Render common objects so the Dashboard (or chat) always shows output.

    Notes
    -----
    * Do not print here. All rendering goes through IPython's display layer.
    * Captions are supplied upstream by the SMX PREFACE via DataFrame.style.set_caption(...).
    """
    from html import escape  # stdlib; local import avoids clashing with the local `html` var

    # 1) Matplotlib figures
    if isinstance(obj, mpfig.Figure):
        display(obj)
        return None

    # 2) Pandas Styler (keeps caption)
    if _Styler is not None and isinstance(obj, _Styler):  # type: ignore
        try:
            html = obj.to_html()
            display(HTML(_wrap_html_table(html)))
        except Exception:
            # Fallback: if Styler HTML fails for any reason, display raw Styler
            display(obj)
        return None

    # 3) Series / DataFrame (no caption path)
    if isinstance(obj, (pd.Series, pd.DataFrame)):
        html = obj.to_html(classes="smx-table", border=0)
        display(HTML(_wrap_html_table(html)))
        return None

    # 4) Dict of scalar numbers → pretty 2-col table
    if isinstance(obj, dict) and all(isinstance(v, numbers.Number) for v in obj.values()):
        df_ = pd.DataFrame({"metric": list(obj.keys()), "value": list(obj.values())})
        html = df_.to_html(classes="smx-table", border=0, index=False)
        display(HTML(_wrap_html_table(html)))
        return None

    # 5) Two-number tuple labelled metric table (e.g., (mse, r2))
    if (
        isinstance(obj, tuple)
        and len(obj) == 2
        and all(isinstance(v, numbers.Number) for v in obj)
    ):
        mse, r2 = obj
        df_ = pd.DataFrame(
            {"metric": ["Mean-squared error", "R²"], "value": [mse, r2]}
        )
        html = df_.to_html(classes="smx-table", border=0, index=False)
        display(HTML(_wrap_html_table(html)))
        return None

    # 6) Fallback: preformatted text. BUG FIX: escape the payload so objects
    #    whose str() contains '<' or '&' render literally instead of being
    #    interpreted as markup (or silently swallowed by the browser).
    display(HTML(f"<pre>{escape(str(obj))}</pre>"))
    return None
@@ -21,10 +21,11 @@ def extract_output_text(resp) -> str:
21
21
  def set_args(
22
22
  model,
23
23
  instructions,
24
- input, previous_id=None,
24
+ input,
25
+ previous_id=None,
25
26
  store=False,
26
- reasoning_effort="minimal",
27
- verbosity="low",
27
+ reasoning_effort="medium", # "minimal", "low", "medium", "high"
28
+ verbosity="medium", # "low", "medium", "high"
28
29
  truncation="auto",
29
30
  ):
30
31
  base_params = {
@@ -35,7 +36,7 @@ def set_args(
35
36
  "store": store,
36
37
  "truncation": truncation,
37
38
  }
38
- if model == "gpt-5-chat-latest":
39
+ if model == "gpt-5.1-chat-latest":
39
40
  args = base_params
40
41
  else:
41
42
  args = {**base_params,
syntaxmatrix/profiles.py CHANGED
@@ -25,24 +25,39 @@ def get_profile(purpose: str) -> dict:
25
25
  _refresh_profiles()
26
26
  return _profiles.get(purpose)
27
27
 
28
def get_profiles():
    """Return the registered profiles (thin alias delegating to list_profiles())."""
    return list_profiles()
28
30
 
29
31
  def get_client(profile):
30
32
 
31
33
  provider = profile["provider"].lower()
32
34
  api_key = profile["api_key"]
33
35
 
34
- if provider == "google": #1
36
+ #1 - Google - gemini series
37
+ if provider == "google":
35
38
  return genai.Client(api_key=api_key)
36
- if provider == "openai": #2
39
+
40
+ #2 OpenAI gpt-5 series
41
+ if provider == "openai":
37
42
  return OpenAI(api_key=api_key)
38
- if provider == "xai": #3
43
+
44
+ #3 - xAI - grok series
45
+ if provider == "xai":
39
46
  return OpenAI(api_key=api_key, base_url="https://api.x.ai/v1")
40
- if provider == "deepseek": #4
47
+
48
+ #4 - DeepSeek chat model
49
+ if provider == "deepseek":
41
50
  return OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
51
+
52
+ #5 - Moonshot chat model
42
53
  if provider == "moonshot": #5
43
54
  return OpenAI(api_key=api_key, base_url="https://api.moonshot.ai/v1")
55
+
56
+ #6 - Alibaba qwen series
44
57
  if provider == "alibaba": #6
45
58
  return OpenAI(api_key=api_key, base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",)
59
+
60
+ #7 - Anthropic claude series
46
61
  if provider == "anthropic": #7
47
62
  return anthropic.Anthropic(api_key=api_key)
48
63