syntaxmatrix 2.5.5.5__py3-none-any.whl → 2.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
syntaxmatrix/utils.py CHANGED
@@ -93,6 +93,55 @@ def classify_ml_job(prompt: str) -> str:
93
93
  return "eda"
94
94
 
95
95
 
96
+ def _indent(code: str, spaces: int = 4) -> str:
97
+ """
98
+ Indent a block of code by `spaces` spaces, line by line.
99
+ Blank lines are preserved unchanged.
100
+ """
101
+ pad = " " * spaces
102
+ lines = code.splitlines()
103
+ return "\n".join((pad + line) if line.strip() else line for line in lines)
104
+
105
+
106
def wrap_llm_code_safe(body: str) -> str:
    """
    Wrap arbitrary LLM code in a try/except so a failure never aborts the run.

    The generated source:
    - runs `body` inside a `try:` suite;
    - on any `Exception`, imports `show` from `syntaxmatrix.display` and
      shows a "Skipped LLM block" message;
    - then attempts a minimal EDA fallback on the global `df` (head, a small
      summary dict and, when numeric columns exist, `SB_histplot()` followed
      by `_SMX_export_png()`), itself guarded by a second try/except.

    If `body` has no executable statement (empty, blank or comment-only), a
    `pass` is appended so the `try:` suite is never empty; an empty suite
    would make the whole wrapper a SyntaxError.

    Returns the wrapped source code as a string.
    """
    has_statement = any(
        line.strip() and not line.lstrip().startswith("#")
        for line in body.splitlines()
    )
    suite = _indent(body)
    if not has_statement:
        suite = (suite + "\n" if suite else "") + "    pass"

    # No textwrap.dedent here: the text starts with "try:" at column 0, so
    # there is no common margin to remove and dedent would only rewrite
    # whitespace-only lines of the user code.
    return (
        "try:\n"
        + suite
        + "\n"
        "except Exception as e:\n"
        "    from syntaxmatrix.display import show\n"
        "    msg = f\"⚠️ Skipped LLM block due to: {type(e).__name__}: {e}\"\n"
        "    show(msg)\n"
        "    # --- automatic EDA fallback ---\n"
        "    try:\n"
        "        df_local = globals().get('df')\n"
        "        if df_local is not None:\n"
        "            import pandas as pd\n"
        "            from syntaxmatrix.preface import SB_histplot, _SMX_export_png\n"
        "            num_cols = df_local.select_dtypes(include=['number', 'bool']).columns.tolist()\n"
        "            cat_cols = [c for c in df_local.columns if c not in num_cols]\n"
        "            info = {\n"
        "                'rows': len(df_local),\n"
        "                'cols': len(df_local.columns),\n"
        "                'numeric_cols': len(num_cols),\n"
        "                'categorical_cols': len(cat_cols),\n"
        "            }\n"
        "            show(df_local.head())\n"
        "            show(info)\n"
        "            if num_cols:\n"
        "                SB_histplot()\n"
        "                _SMX_export_png()\n"
        "    except Exception as _f:\n"
        "        show(f\"⚠️ Fallback EDA failed: {type(_f).__name__}: {_f}\")\n"
    )
143
+
144
+
96
145
  def harden_ai_code(code: str) -> str:
97
146
  """
98
147
  Make any AI-generated cell resilient:
@@ -113,10 +162,6 @@ def harden_ai_code(code: str) -> str:
113
162
  code = re.sub(r"\breturn\s+\.\.\.", "return None", code)
114
163
  return code
115
164
 
116
- def _indent(code: str, spaces: int = 4) -> str:
117
- pad = " " * spaces
118
- return "\n".join(pad + line for line in code.splitlines())
119
-
120
165
  def _SMX_OHE(**k):
121
166
  # normalise arg name across sklearn versions
122
167
  if "sparse" in k and "sparse_output" not in k:
@@ -140,6 +185,49 @@ def harden_ai_code(code: str) -> str:
140
185
  code = re.sub(r'(?m)^\s*\\\d+\s*', '', code)
141
186
  code = re.sub(r'(?m)[;]\s*\\\d+\s*', '; ', code)
142
187
  return code
188
+
189
+ def _patch_feature_coef_dataframe(code: str) -> str:
190
+ """
191
+ Harden patterns like:
192
+ coeffs_df = pd.DataFrame({'feature': num_features, 'coefficient': coef})
193
+ which can crash with:
194
+ ValueError: All arrays must be of the same length
195
+ We wrap them in a try/except and, on failure, rebuild the
196
+ DataFrame by zipping feature names with coefficients up to
197
+ the min length.
198
+ """
199
+ # Match single-line assignments of the form:
200
+ # <var> = pd.DataFrame({'feature': <feat>, 'coefficient': <coef>})
201
+ import re
202
+
203
+ pattern = re.compile(
204
+ r"(?P<indent>^[ \t]*)"
205
+ r"(?P<var>\w+)\s*=\s*pd\.DataFrame\(\s*{\s*"
206
+ r"['\"]feature['\"]\s*:\s*(?P<feat_expr>.+?)\s*,\s*"
207
+ r"['\"]coefficient['\"]\s*:\s*(?P<coef_expr>.+?)\s*"
208
+ r"}\s*\)\s*$",
209
+ re.MULTILINE,
210
+ )
211
+
212
+ def repl(m: re.Match) -> str:
213
+ indent = m.group("indent")
214
+ var = m.group("var")
215
+ feat_expr = m.group("feat_expr").strip()
216
+ coef_expr = m.group("coef_expr").strip()
217
+
218
+ # Keep the original intent, but add a safe fallback.
219
+ return (
220
+ f"{indent}try:\n"
221
+ f"{indent} {var} = pd.DataFrame({{'feature': {feat_expr}, 'coefficient': {coef_expr}}})\n"
222
+ f"{indent}except Exception as _e:\n"
223
+ f"{indent} import numpy as _np\n"
224
+ f"{indent} _feat = list({feat_expr})\n"
225
+ f"{indent} _coef = _np.asarray({coef_expr}).ravel()\n"
226
+ f"{indent} _k = min(len(_feat), len(_coef))\n"
227
+ f"{indent} {var} = pd.DataFrame({{'feature': _feat[:_k], 'coefficient': _coef[:_k]}})\n"
228
+ )
229
+
230
+ return pattern.sub(repl, code)
143
231
 
144
232
  def _wrap_metric_calls(code: str) -> str:
145
233
  names = [
@@ -224,6 +312,191 @@ def harden_ai_code(code: str) -> str:
224
312
  except Exception:
225
313
  pass
226
314
 
315
+ def _ensure_metrics_imports(code: str) -> str:
316
+ needed = set()
317
+ if "r2_score" in code:
318
+ needed.add("r2_score")
319
+ if "mean_absolute_error" in code:
320
+ needed.add("mean_absolute_error")
321
+ # ... add others if you like ...
322
+
323
+ if not needed:
324
+ return code
325
+
326
+ if "from sklearn.metrics import" in code:
327
+ return code # assume user/LLM handled it
328
+
329
+ import_line = "from sklearn.metrics import " + ", ".join(sorted(needed)) + "\n"
330
+ return import_line + code
331
+
332
+ def _fix_unexpected_indent(src: str) -> str:
333
+ """
334
+ Some LLM snippets jump indentation (e.g. extra 8 spaces on an 'import'
335
+ line) without a preceding block opener. That causes
336
+ `IndentationError: unexpected indent` when we wrap in our own `try:`.
337
+ This normalises those lines back to the previous indent level, but only
338
+ when we're not in a multi-line bracket/paren context.
339
+ """
340
+ lines = src.splitlines()
341
+ out = []
342
+ prev_indent = 0
343
+ prev_ends_colon = False
344
+ paren_depth = 0 # (), [], {} depth across lines (very approximate)
345
+
346
+ for raw in lines:
347
+ stripped = raw.lstrip()
348
+ if not stripped: # blank / whitespace line
349
+ out.append(raw)
350
+ continue
351
+
352
+ indent = len(raw) - len(stripped)
353
+
354
+ # Only flatten if:
355
+ # - we're not inside a (...) / [...] / {...} block, and
356
+ # - previous logical line did NOT end with ':', and
357
+ # - this line is indented more than the previous indent.
358
+ if paren_depth == 0 and not prev_ends_colon and indent > prev_indent:
359
+ indent = prev_indent
360
+ new_line = " " * indent + stripped
361
+ out.append(new_line)
362
+
363
+ # Update simple state for next line
364
+ txt = stripped
365
+ paren_depth += txt.count("(") + txt.count("[") + txt.count("{")
366
+ paren_depth -= txt.count(")") + txt.count("]") + txt.count("}")
367
+ prev_ends_colon = txt.rstrip().endswith(":")
368
+ prev_indent = indent
369
+
370
+ return "\n".join(out)
371
+
372
+ def _fallback_snippet() -> str:
373
+ """
374
+ Final-resort snippet when the LLM code is syntactically broken.
375
+
376
+ It:
377
+ - attempts a simple automatic ML task (classification or regression)
378
+ - then falls back to generic but useful EDA.
379
+
380
+ It assumes `from syntaxmatrix.preface import *` has already been done,
381
+ so `_SMX_OHE`, `_SMX_call`, `SB_histplot`, `_SMX_export_png` and the
382
+ patched `show()` are available.
383
+ """
384
+ import textwrap
385
+
386
+ return textwrap.dedent(
387
+ """\
388
+ import pandas as pd
389
+ import numpy as np
390
+ import matplotlib.pyplot as plt
391
+ from sklearn.model_selection import train_test_split
392
+ from sklearn.compose import ColumnTransformer
393
+ from sklearn.preprocessing import StandardScaler
394
+ from sklearn.linear_model import LogisticRegression, LinearRegression
395
+ from sklearn.metrics import accuracy_score, r2_score, mean_absolute_error
396
+
397
+ df = df.copy()
398
+
399
+ # --- basic column introspection ---
400
+ num_cols = df.select_dtypes(include=['number', 'bool']).columns.tolist()
401
+ cat_cols = [c for c in df.columns if c not in num_cols]
402
+
403
+ # --- attempt an automatic ML task ---
404
+ target_col = None
405
+ task_type = None
406
+
407
+ # Prefer a low-cardinality target (classification)
408
+ for c in num_cols + cat_cols:
409
+ uniq = df[c].dropna().nunique()
410
+ if 2 <= uniq <= 10:
411
+ target_col = c
412
+ task_type = 'classification'
413
+ break
414
+
415
+ # If none found, try a numeric regression target
416
+ if target_col is None and num_cols:
417
+ target_col = num_cols[-1]
418
+ task_type = 'regression'
419
+
420
+ if target_col is not None:
421
+ try:
422
+ X = df.drop(columns=[target_col]).copy()
423
+ y = df[target_col].copy()
424
+
425
+ num_feats = X.select_dtypes(include=['number', 'bool']).columns.tolist()
426
+ cat_feats = [c for c in X.columns if c not in num_feats]
427
+
428
+ pre = ColumnTransformer(
429
+ transformers=[
430
+ ('num', StandardScaler(), num_feats),
431
+ ('cat', _SMX_OHE(handle_unknown='ignore'), cat_feats),
432
+ ],
433
+ remainder='drop',
434
+ )
435
+
436
+ from sklearn.pipeline import Pipeline
437
+ if task_type == 'classification':
438
+ model = LogisticRegression(max_iter=1000)
439
+ else:
440
+ model = LinearRegression()
441
+
442
+ pipe = Pipeline([('pre', pre), ('model', model)])
443
+
444
+ X_train, X_test, y_train, y_test = train_test_split(
445
+ X, y, test_size=0.25, random_state=42
446
+ )
447
+
448
+ pipe.fit(X_train, y_train)
449
+ y_pred = pipe.predict(X_test)
450
+
451
+ if task_type == 'classification':
452
+ # If predictions look like probabilities, convert to labels
453
+ if getattr(y_pred, 'ndim', 1) > 1 and y_pred.shape[1] > 1:
454
+ y_pred_labels = y_pred.argmax(axis=1)
455
+ else:
456
+ try:
457
+ y_pred_labels = (y_pred > 0.5).astype(y_test.dtype)
458
+ except Exception:
459
+ y_pred_labels = y_pred
460
+
461
+ acc = _SMX_call(accuracy_score, y_test, y_pred_labels)
462
+ show({
463
+ 'target': target_col,
464
+ 'task': 'classification',
465
+ 'accuracy': acc,
466
+ })
467
+ else:
468
+ r2 = _SMX_call(r2_score, y_test, y_pred)
469
+ mae = _SMX_call(mean_absolute_error, y_test, y_pred)
470
+ show({
471
+ 'target': target_col,
472
+ 'task': 'regression',
473
+ 'r2': r2,
474
+ 'mae': mae,
475
+ })
476
+
477
+ except Exception as _ml_e:
478
+ show(f"⚠ ML fallback failed: {type(_ml_e).__name__}: {_ml_e}")
479
+
480
+ # --- EDA fallback that still helps answer the question ---
481
+ try:
482
+ info = {
483
+ 'rows': len(df),
484
+ 'cols': len(df.columns),
485
+ 'numeric_cols': len(num_cols),
486
+ 'categorical_cols': len(cat_cols),
487
+ }
488
+ show(df.head(), title='Sample of data')
489
+ show(info, title='Dataset summary')
490
+
491
+ # Quick univariate look if we have numeric columns
492
+ if num_cols:
493
+ SB_histplot()
494
+ _SMX_export_png()
495
+ except Exception as _eda_e:
496
+ show(f"⚠ EDA fallback failed: {type(_eda_e).__name__}: {_eda_e}")
497
+ """
498
+ )
499
+
227
500
  # Register and run patches once per execution
228
501
  for _patch in (
229
502
  _smx_patch_mean_squared_error_squared_kw,
@@ -235,412 +508,6 @@ def harden_ai_code(code: str) -> str:
235
508
  except Exception:
236
509
  pass
237
510
 
238
- PREFACE = (
239
- "# === SMX Auto-Hardening Preface (do not edit) ===\n"
240
- "import warnings, numpy as np, pandas as pd, matplotlib.pyplot as plt\n"
241
- "warnings.filterwarnings('ignore')\n"
242
- "try:\n"
243
- " import seaborn as sns\n"
244
- "except Exception:\n"
245
- " class _Dummy:\n"
246
- " def __getattr__(self, name):\n"
247
- " def _f(*a, **k):\n"
248
- " from syntaxmatrix.display import show\n"
249
- " show('⚠ seaborn not available; plot skipped.')\n"
250
- " return _f\n"
251
- " sns = _Dummy()\n"
252
- "\n"
253
- "from syntaxmatrix.display import show as _SMX_base_show\n"
254
- "def _SMX_caption_from_ctx():\n"
255
- " g = globals()\n"
256
- " t = g.get('refined_question') or g.get('askai_question') or 'Table'\n"
257
- " return str(t).strip().splitlines()[0][:120]\n"
258
- "\n"
259
- "def show(obj, title=None):\n"
260
- " try:\n"
261
- " import pandas as pd\n"
262
- " if isinstance(obj, pd.DataFrame):\n"
263
- " cap = (title or _SMX_caption_from_ctx())\n"
264
- " try:\n"
265
- " return _SMX_base_show(obj.style.set_caption(cap))\n"
266
- " except Exception:\n"
267
- " pass\n"
268
- " except Exception:\n"
269
- " pass\n"
270
- " return _SMX_base_show(obj)\n"
271
- "\n"
272
- "def _SMX_axes_have_titles(fig=None):\n"
273
- " import matplotlib.pyplot as _plt\n"
274
- " fig = fig or _plt.gcf()\n"
275
- " try:\n"
276
- " for _ax in fig.get_axes():\n"
277
- " if (_ax.get_title() or '').strip():\n"
278
- " return True\n"
279
- " except Exception:\n"
280
- " pass\n"
281
- " return False\n"
282
- "\n"
283
- "def _SMX_export_png():\n"
284
- " import io, base64\n"
285
- " fig = plt.gcf()\n"
286
- " try:\n"
287
- " if not _SMX_axes_have_titles(fig):\n"
288
- " fig.suptitle(_SMX_caption_from_ctx(), fontsize=10)\n"
289
- " except Exception:\n"
290
- " pass\n"
291
- " buf = io.BytesIO()\n"
292
- " plt.savefig(buf, format='png', bbox_inches='tight')\n"
293
- " buf.seek(0)\n"
294
- " from IPython.display import display, HTML\n"
295
- " _img = base64.b64encode(buf.read()).decode('ascii')\n"
296
- " display(HTML(f\"<img src='data:image/png;base64,{_img}' style='max-width:100%;height:auto;border:1px solid #ccc;border-radius:4px;'/>\"))\n"
297
- " plt.close()\n"
298
- "\n"
299
- "def _pick_df():\n"
300
- " return globals().get('df', None)\n"
301
- "\n"
302
- "def _pick_ax_slot():\n"
303
- " ax = None\n"
304
- " try:\n"
305
- " _axes = globals().get('axes', None)\n"
306
- " import numpy as _np\n"
307
- " if _axes is not None:\n"
308
- " arr = _np.ravel(_axes)\n"
309
- " for _a in arr:\n"
310
- " try:\n"
311
- " if hasattr(_a,'has_data') and not _a.has_data():\n"
312
- " ax = _a; break\n"
313
- " except Exception:\n"
314
- " continue\n"
315
- " except Exception:\n"
316
- " ax = None\n"
317
- " return ax\n"
318
- "\n"
319
- "def _first_numeric(_d):\n"
320
- " import numpy as np, pandas as pd\n"
321
- " try:\n"
322
- " preferred = [\"median_house_value\", \"price\", \"value\", \"target\", \"label\", \"y\"]\n"
323
- " for c in preferred:\n"
324
- " if c in _d.columns and pd.api.types.is_numeric_dtype(_d[c]):\n"
325
- " return c\n"
326
- " cols = _d.select_dtypes(include=[np.number]).columns.tolist()\n"
327
- " return cols[0] if cols else None\n"
328
- " except Exception:\n"
329
- " return None\n"
330
- "\n"
331
- "def _first_categorical(_d):\n"
332
- " import pandas as pd, numpy as np\n"
333
- " try:\n"
334
- " num = set(_d.select_dtypes(include=[np.number]).columns.tolist())\n"
335
- " cand = [c for c in _d.columns if c not in num and _d[c].nunique(dropna=True) <= 50]\n"
336
- " return cand[0] if cand else None\n"
337
- " except Exception:\n"
338
- " return None\n"
339
- "\n"
340
- "boxplot = barplot = histplot = distplot = lineplot = countplot = heatmap = pairplot = None\n"
341
- "\n"
342
- "def _safe_plot(func, *args, **kwargs):\n"
343
- " try:\n"
344
- " ax = func(*args, **kwargs)\n"
345
- " if ax is None:\n"
346
- " ax = plt.gca()\n"
347
- " try:\n"
348
- " if hasattr(ax, 'has_data') and not ax.has_data():\n"
349
- " from syntaxmatrix.display import show as _show\n"
350
- " _show('⚠ Empty plot: no data drawn.')\n"
351
- " except Exception:\n"
352
- " pass\n"
353
- " try: plt.tight_layout()\n"
354
- " except Exception: pass\n"
355
- " return ax\n"
356
- " except Exception as e:\n"
357
- " from syntaxmatrix.display import show as _show\n"
358
- " _show(f'⚠ Plot skipped: {type(e).__name__}: {e}')\n"
359
- " return None\n"
360
- "\n"
361
- "def SB_histplot(*a, **k):\n"
362
- " _missing = (getattr(sns, '__class__', type(sns)).__name__ == '_Dummy')\n"
363
- " _sentinel = (len(a) >= 1 and a[0] is None)\n"
364
- " if (not a or _sentinel) and not k:\n"
365
- " d = _pick_df()\n"
366
- " if d is not None:\n"
367
- " x = _first_numeric(d)\n"
368
- " if x is not None:\n"
369
- " def _draw():\n"
370
- " plt.hist(d[x].dropna())\n"
371
- " ax = plt.gca()\n"
372
- " if not (ax.get_title() or '').strip():\n"
373
- " ax.set_title(f'Distribution of {x}')\n"
374
- " return ax\n"
375
- " return _safe_plot(lambda **kw: _draw())\n"
376
- " if _missing:\n"
377
- " return _safe_plot(lambda **kw: plt.hist([]))\n"
378
- " if _sentinel:\n"
379
- " a = a[1:]\n"
380
- " return _safe_plot(getattr(sns,'histplot', plt.hist), *a, **k)\n"
381
- "\n"
382
- "def SB_barplot(*a, **k):\n"
383
- " _missing = (getattr(sns, '__class__', type(sns)).__name__ == '_Dummy')\n"
384
- " _sentinel = (len(a) >= 1 and a[0] is None)\n"
385
- " _ax = k.get('ax') or _pick_ax_slot()\n"
386
- " if _ax is not None:\n"
387
- " try: plt.sca(_ax)\n"
388
- " except Exception: pass\n"
389
- " k.setdefault('ax', _ax)\n"
390
- " if (not a or _sentinel) and not k:\n"
391
- " d = _pick_df()\n"
392
- " if d is not None:\n"
393
- " x = _first_categorical(d)\n"
394
- " y = _first_numeric(d)\n"
395
- " if x and y:\n"
396
- " import pandas as _pd\n"
397
- " g = d.groupby(x)[y].mean().reset_index()\n"
398
- " def _draw():\n"
399
- " if _missing:\n"
400
- " plt.bar(g[x], g[y])\n"
401
- " else:\n"
402
- " sns.barplot(data=g, x=x, y=y, ax=k.get('ax'))\n"
403
- " ax = plt.gca()\n"
404
- " if not (ax.get_title() or '').strip():\n"
405
- " ax.set_title(f'Mean {y} by {x}')\n"
406
- " return ax\n"
407
- " return _safe_plot(lambda **kw: _draw())\n"
408
- " if _missing:\n"
409
- " return _safe_plot(lambda **kw: plt.bar([], []))\n"
410
- " if _sentinel:\n"
411
- " a = a[1:]\n"
412
- " return _safe_plot(sns.barplot, *a, **k)\n"
413
- "\n"
414
- "def SB_boxplot(*a, **k):\n"
415
- " _missing = (getattr(sns, '__class__', type(sns)).__name__ == '_Dummy')\n"
416
- " _sentinel = (len(a) >= 1 and a[0] is None)\n"
417
- " _ax = k.get('ax') or _pick_ax_slot()\n"
418
- " if _ax is not None:\n"
419
- " try: plt.sca(_ax)\n"
420
- " except Exception: pass\n"
421
- " k.setdefault('ax', _ax)\n"
422
- " if (not a or _sentinel) and not k:\n"
423
- " d = _pick_df()\n"
424
- " if d is not None:\n"
425
- " x = _first_categorical(d)\n"
426
- " y = _first_numeric(d)\n"
427
- " if x and y:\n"
428
- " def _draw():\n"
429
- " if _missing:\n"
430
- " plt.boxplot(d[y].dropna())\n"
431
- " else:\n"
432
- " sns.boxplot(data=d, x=x, y=y, ax=k.get('ax'))\n"
433
- " ax = plt.gca()\n"
434
- " if not (ax.get_title() or '').strip():\n"
435
- " ax.set_title(f'Distribution of {y} by {x}')\n"
436
- " return ax\n"
437
- " return _safe_plot(lambda **kw: _draw())\n"
438
- " if _missing:\n"
439
- " return _safe_plot(lambda **kw: plt.boxplot([]))\n"
440
- " if _sentinel:\n"
441
- " a = a[1:]\n"
442
- " return _safe_plot(sns.boxplot, *a, **k)\n"
443
- "\n"
444
- "def SB_scatterplot(*a, **k):\n"
445
- " _missing = (getattr(sns, '__class__', type(sns)).__name__ == '_Dummy')\n"
446
- " fn = getattr(sns,'scatterplot', None)\n"
447
- " # If seaborn is unavailable OR the caller passed (data=..., x='col', y='col'),\n"
448
- " # use a robust matplotlib path that looks up data and coerces to numeric.\n"
449
- " if _missing or fn is None:\n"
450
- " data = k.get('data'); x = k.get('x'); y = k.get('y')\n"
451
- " if data is not None and isinstance(x, str) and isinstance(y, str) and x in data.columns and y in data.columns:\n"
452
- " xs = pd.to_numeric(data[x], errors='coerce')\n"
453
- " ys = pd.to_numeric(data[y], errors='coerce')\n"
454
- " m = xs.notna() & ys.notna()\n"
455
- " def _draw():\n"
456
- " plt.scatter(xs[m], ys[m])\n"
457
- " ax = plt.gca()\n"
458
- " if not (ax.get_title() or '').strip():\n"
459
- " ax.set_title(f'{y} vs {x}')\n"
460
- " return ax\n"
461
- " return _safe_plot(lambda **kw: _draw())\n"
462
- " # else: fall back to auto-pick two numeric columns\n"
463
- " d = _pick_df()\n"
464
- " if d is not None:\n"
465
- " num = d.select_dtypes(include=[np.number]).columns.tolist()\n"
466
- " if len(num) >= 2:\n"
467
- " def _draw2():\n"
468
- " plt.scatter(d[num[0]], d[num[1]])\n"
469
- " ax = plt.gca()\n"
470
- " if not (ax.get_title() or '').strip():\n"
471
- " ax.set_title(f'{num[1]} vs {num[0]}')\n"
472
- " return ax\n"
473
- " return _safe_plot(lambda **kw: _draw2())\n"
474
- " return _safe_plot(lambda **kw: plt.scatter([], []))\n"
475
- " # seaborn path\n"
476
- " return _safe_plot(fn, *a, **k)\n"
477
- "\n"
478
- "def SB_heatmap(*a, **k):\n"
479
- " _missing = (getattr(sns, '__class__', type(sns)).__name__ == '_Dummy')\n"
480
- " data = None\n"
481
- " if a:\n"
482
- " data = a[0]\n"
483
- " elif 'data' in k:\n"
484
- " data = k['data']\n"
485
- " if data is None:\n"
486
- " d = _pick_df()\n"
487
- " try:\n"
488
- " if d is not None:\n"
489
- " import numpy as _np\n"
490
- " data = d.select_dtypes(include=[_np.number]).corr()\n"
491
- " except Exception:\n"
492
- " data = None\n"
493
- " if data is None:\n"
494
- " from syntaxmatrix.display import show as _show\n"
495
- " _show('⚠ Heatmap skipped: no data.')\n"
496
- " return None\n"
497
- " if not _missing and hasattr(sns, 'heatmap'):\n"
498
- " _k = {kk: vv for kk, vv in k.items() if kk != 'data'}\n"
499
- " def _draw():\n"
500
- " ax = sns.heatmap(data, **_k)\n"
501
- " try:\n"
502
- " ax = ax or plt.gca()\n"
503
- " if not (ax.get_title() or '').strip():\n"
504
- " ax.set_title('Correlation Heatmap')\n"
505
- " except Exception:\n"
506
- " pass\n"
507
- " return ax\n"
508
- " return _safe_plot(lambda **kw: _draw())\n"
509
- " def _mat_heat():\n"
510
- " im = plt.imshow(data, aspect='auto')\n"
511
- " try: plt.colorbar()\n"
512
- " except Exception: pass\n"
513
- " try:\n"
514
- " cols = list(getattr(data, 'columns', []))\n"
515
- " rows = list(getattr(data, 'index', []))\n"
516
- " if cols: plt.xticks(range(len(cols)), cols, rotation=90)\n"
517
- " if rows: plt.yticks(range(len(rows)), rows)\n"
518
- " except Exception:\n"
519
- " pass\n"
520
- " ax = plt.gca()\n"
521
- " try:\n"
522
- " if not (ax.get_title() or '').strip():\n"
523
- " ax.set_title('Correlation Heatmap')\n"
524
- " except Exception:\n"
525
- " pass\n"
526
- " return ax\n"
527
- " return _safe_plot(lambda **kw: _mat_heat())\n"
528
- "\n"
529
- "def _safe_concat(objs, **kwargs):\n"
530
- " import pandas as _pd\n"
531
- " if objs is None: return _pd.DataFrame()\n"
532
- " if isinstance(objs,(list,tuple)) and len(objs)==0: return _pd.DataFrame()\n"
533
- " try: return _pd.concat(objs, **kwargs)\n"
534
- " except Exception as e:\n"
535
- " show(f'⚠ concat skipped: {e}')\n"
536
- " return _pd.DataFrame()\n"
537
- "\n"
538
- "from sklearn.preprocessing import OneHotEncoder\n"
539
- "import inspect\n"
540
- "def _SMX_OHE(**k):\n"
541
- " # normalise arg name across sklearn versions\n"
542
- " if 'sparse' in k and 'sparse_output' not in k:\n"
543
- " k['sparse_output'] = k.pop('sparse')\n"
544
- " k.setdefault('handle_unknown','ignore')\n"
545
- " k.setdefault('sparse_output', False)\n"
546
- " try:\n"
547
- " sig = inspect.signature(OneHotEncoder)\n"
548
- " if 'sparse_output' not in sig.parameters and 'sparse_output' in k:\n"
549
- " k['sparse'] = k.pop('sparse_output')\n"
550
- " except Exception:\n"
551
- " if 'sparse_output' in k:\n"
552
- " k['sparse'] = k.pop('sparse_output')\n"
553
- " return OneHotEncoder(**k)\n"
554
- "\n"
555
- "import numpy as _np\n"
556
- "def _SMX_mm(a, b):\n"
557
- " try:\n"
558
- " return a @ b # normal path\n"
559
- " except Exception:\n"
560
- " try:\n"
561
- " A = _np.asarray(a); B = _np.asarray(b)\n"
562
- " # If same 2D shape (e.g. (n,k) & (n,k)), treat as row-wise dot\n"
563
- " if A.ndim==2 and B.ndim==2 and A.shape==B.shape:\n"
564
- " return (A * B).sum(axis=1)\n"
565
- " # Otherwise try element-wise product (broadcast if possible)\n"
566
- " return A * B\n"
567
- " except Exception as e:\n"
568
- " from syntaxmatrix.display import show\n"
569
- " show(f'⚠ Matmul relaxed: {type(e).__name__}: {e}'); return _np.nan\n"
570
- "\n"
571
- "def _SMX_call(fn, *a, **k):\n"
572
- " try:\n"
573
- " return fn(*a, **k)\n"
574
- " except TypeError as e:\n"
575
- " msg = str(e)\n"
576
- " if \"unexpected keyword argument 'squared'\" in msg:\n"
577
- " k.pop('squared', None)\n"
578
- " return fn(*a, **k)\n"
579
- " raise\n"
580
- "\n"
581
- "def _SMX_rmse(y_true, y_pred):\n"
582
- " try:\n"
583
- " from sklearn.metrics import mean_squared_error as _mse\n"
584
- " try:\n"
585
- " return _mse(y_true, y_pred, squared=False)\n"
586
- " except TypeError:\n"
587
- " return (_mse(y_true, y_pred)) ** 0.5\n"
588
- " except Exception:\n"
589
- " import numpy as _np\n"
590
- " yt = _np.asarray(y_true, dtype=float)\n"
591
- " yp = _np.asarray(y_pred, dtype=float)\n"
592
- " diff = yt - yp\n"
593
- " return float((_np.mean(diff * diff)) ** 0.5)\n"
594
- "\n"
595
- "import pandas as _pd\n"
596
- "import numpy as _np\n"
597
- "def _SMX_autocoerce_dates(_df):\n"
598
- " if _df is None or not hasattr(_df, 'columns'): return\n"
599
- " for c in list(_df.columns):\n"
600
- " s = _df[c]\n"
601
- " n = str(c).lower()\n"
602
- " if _pd.api.types.is_datetime64_any_dtype(s):\n"
603
- " continue\n"
604
- " if _pd.api.types.is_object_dtype(s) or ('date' in n or 'time' in n or 'timestamp' in n or n.endswith('_dt')):\n"
605
- " try:\n"
606
- " conv = _pd.to_datetime(s, errors='coerce', utc=True).dt.tz_localize(None)\n"
607
- " # accept only if at least 10% (min 3) parse as dates\n"
608
- " if getattr(conv, 'notna', lambda: _pd.Series([]))().sum() >= max(3, int(0.1*len(_df))):\n"
609
- " _df[c] = conv\n"
610
- " except Exception:\n"
611
- " pass\n"
612
- "\n"
613
- "def _SMX_autocoerce_numeric(_df, cols):\n"
614
- " if _df is None: return\n"
615
- " for c in cols:\n"
616
- " if c in getattr(_df, 'columns', []):\n"
617
- " try:\n"
618
- " _df[c] = _pd.to_numeric(_df[c], errors='coerce')\n"
619
- " except Exception:\n"
620
- " pass\n"
621
- "\n"
622
- "def show(obj, title=None):\n"
623
- " try:\n"
624
- " import pandas as pd, numbers\n"
625
- " cap = (title or _SMX_caption_from_ctx())\n"
626
- " # 1) DataFrame → Styler with caption\n"
627
- " if isinstance(obj, pd.DataFrame):\n"
628
- " try: return _SMX_base_show(obj.style.set_caption(cap))\n"
629
- " except Exception: pass\n"
630
- " # 2) dict of scalars → DataFrame with caption\n"
631
- " if isinstance(obj, dict) and all(isinstance(v, numbers.Number) for v in obj.values()):\n"
632
- " df_ = pd.DataFrame({'metric': list(obj.keys()), 'value': list(obj.values())})\n"
633
- " try: return _SMX_base_show(df_.style.set_caption(cap))\n"
634
- " except Exception: return _SMX_base_show(df_)\n"
635
- " except Exception:\n"
636
- " pass\n"
637
- " return _SMX_base_show(obj)\n"
638
- )
639
-
640
- PREFACE_IMPORT = "from syntaxmatrix.smx_preface import *\n"
641
- # if PREFACE not in code:
642
- # code = PREFACE_IMPORT + code
643
-
644
511
  fixed = code
645
512
 
646
513
  fixed = re.sub(
@@ -690,6 +557,24 @@ def harden_ai_code(code: str) -> str:
690
557
  fixed
691
558
  )
692
559
 
560
+ try:
561
+ ast.parse(fixed)
562
+ except (SyntaxError, IndentationError):
563
+ fixed = _fallback_snippet()
564
+
565
+ fixed = re.sub(
566
+ r"except\s+Exception\s+as\s+e:\s*\n\s*show\(\.\.\.\)",
567
+ "except Exception as e:\n show(f\"⚠ Block skipped due to: {type(e).__name__}: {e}\")",
568
+ fixed,
569
+ )
570
+
571
+ # Fix placeholder Ellipsis handlers from LLM
572
+ fixed = re.sub(
573
+ r"except\s+Exception\s+as\s+e:\s*\n\s*show\(\.\.\.\)",
574
+ "except Exception as e:\n show(f\"⚠ Block skipped due to: {type(e).__name__}: {e}\")",
575
+ fixed,
576
+ )
577
+
693
578
  try:
694
579
  class _SMXMatmulRewriter(ast.NodeTransformer):
695
580
  def visit_BinOp(self, node):
@@ -708,44 +593,15 @@ def harden_ai_code(code: str) -> str:
708
593
  # 6) Final safety wrapper
709
594
  fixed = fixed.replace("\t", " ")
710
595
  fixed = textwrap.dedent(fixed).strip("\n")
711
-
596
+ fixed = _ensure_metrics_imports(fixed)
712
597
  fixed = _strip_stray_backrefs(fixed)
713
598
  fixed = _wrap_metric_calls(fixed)
599
+ fixed = _fix_unexpected_indent(fixed)
600
+ fixed = _patch_feature_coef_dataframe(fixed)
714
601
 
715
- # If the transformed code is still not syntactically valid, fall back to a
716
- # very defensive generic snippet that depends only on `df`. This guarantees
717
- try:
718
- ast.parse(fixed)
719
- except (SyntaxError, IndentationError):
720
- fixed = (
721
- "import pandas as pd\n"
722
- "df = df.copy()\n"
723
- "_info = {\n"
724
- " 'rows': len(df),\n"
725
- " 'cols': len(df.columns),\n"
726
- " 'numeric_cols': len(df.select_dtypes(include=['number','bool']).columns),\n"
727
- " 'categorical_cols': len(df.select_dtypes(exclude=['number','bool']).columns),\n"
728
- "}\n"
729
- "show(df.head(), title='Sample of data')\n"
730
- "show(_info, title='Dataset summary')\n"
731
- "try:\n"
732
- " _num = df.select_dtypes(include=['number','bool']).columns.tolist()\n"
733
- " if _num:\n"
734
- " SB_histplot()\n"
735
- " _SMX_export_png()\n"
736
- "except Exception as e:\n"
737
- " show(f\"⚠ Fallback visualisation failed: {type(e).__name__}: {e}\")\n"
738
- )
739
-
740
- # Fix placeholder Ellipsis handlers from LLM
741
- fixed = re.sub(
742
- r"except\s+Exception\s+as\s+e:\s*\n\s*show\(\.\.\.\)",
743
- "except Exception as e:\n show(f\"⚠ Block skipped due to: {type(e).__name__}: {e}\")",
744
- fixed,
745
- )
746
-
747
- wrapped = PREFACE + "try:\n" + _indent(fixed) + "\nexcept Exception as e:\n show(...)\n"
748
- wrapped = wrapped.lstrip()
602
+ # Import shared preface helpers once and wrap the LLM body safely
603
+ header = "from syntaxmatrix.preface import *\n\n"
604
+ wrapped = header + wrap_llm_code_safe(fixed)
749
605
  return wrapped
750
606
 
751
607
 
@@ -754,17 +610,6 @@ def indent_code(code: str, spaces: int = 4) -> str:
754
610
  return "\n".join(pad + line for line in code.splitlines())
755
611
 
756
612
 
757
- def wrap_llm_code_safe(code: str) -> str:
758
- # Swallow any runtime error from the LLM block instead of crashing the run
759
- return (
760
- "# __SAFE_WRAPPED__\n"
761
- "try:\n" + indent_code(code) + "\n"
762
- "except Exception as e:\n"
763
- " from syntaxmatrix.display import show\n"
764
- " show(f\"⚠️ Skipped LLM block due to: {type(e).__name__}: {e}\")\n"
765
- )
766
-
767
-
768
613
  def fix_boxplot_placeholder(code: str) -> str:
769
614
  # Replace invalid 'sns.boxplot(boxplot)' with a safe call using df/group_label/m
770
615
  return re.sub(