syntaxmatrix 2.5.5.5__py3-none-any.whl → 2.5.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
syntaxmatrix/utils.py CHANGED
@@ -93,6 +93,55 @@ def classify_ml_job(prompt: str) -> str:
93
93
  return "eda"
94
94
 
95
95
 
96
+ def _indent(code: str, spaces: int = 4) -> str:
97
+ """
98
+ Indent a block of code by `spaces` spaces, line by line.
99
+ Blank lines are preserved unchanged.
100
+ """
101
+ pad = " " * spaces
102
+ lines = code.splitlines()
103
+ return "\n".join((pad + line) if line.strip() else line for line in lines)
104
+
105
+
106
def wrap_llm_code_safe(body: str) -> str:
    """
    Wrap arbitrary LLM code in a ``try``/``except`` so that:
    - Any exception is caught and shown.
    - A minimal, useful EDA fallback still runs so the user sees *something*.
    This happens once in the framework; you never touch the individual cells.

    Args:
        body: Source text of the LLM-generated cell. May be empty or consist
            only of comments.

    Returns:
        Source text of the guarded cell: ``try:`` + the indented body + an
        ``except Exception`` handler that reports the error via
        ``syntaxmatrix.display.show`` and then attempts a small EDA summary
        of a global ``df`` if one exists.
    """
    # A `try:` suite must contain at least one statement. An empty or
    # comment-only body would otherwise make the *whole* wrapped cell fail
    # to compile, defeating the purpose of the wrapper.
    has_statement = any(
        line.strip() and not line.lstrip().startswith("#")
        for line in body.splitlines()
    )
    if not has_statement:
        body = "pass\n" + body

    handler = (
        "except Exception as e:\n"
        "    from syntaxmatrix.display import show\n"
        "    msg = f\"⚠️ Skipped LLM block due to: {type(e).__name__}: {e}\"\n"
        "    show(msg)\n"
        "    # --- automatic EDA fallback ---\n"
        "    try:\n"
        "        df_local = globals().get('df')\n"
        "        if df_local is not None:\n"
        "            import pandas as pd\n"
        "            from syntaxmatrix.preface import SB_histplot, _SMX_export_png\n"
        "            num_cols = df_local.select_dtypes(include=['number', 'bool']).columns.tolist()\n"
        "            cat_cols = [c for c in df_local.columns if c not in num_cols]\n"
        "            info = {\n"
        "                'rows': len(df_local),\n"
        "                'cols': len(df_local.columns),\n"
        "                'numeric_cols': len(num_cols),\n"
        "                'categorical_cols': len(cat_cols),\n"
        "            }\n"
        "            show(df_local.head())\n"
        "            show(info)\n"
        "            if num_cols:\n"
        "                SB_histplot()\n"
        "                _SMX_export_png()\n"
        "    except Exception as _f:\n"
        "        show(f\"⚠️ Fallback EDA failed: {type(_f).__name__}: {_f}\")\n"
    )

    # No textwrap.dedent() here: the text starts with `try:` at column 0, so
    # the common margin is always empty and dedent could only rewrite
    # whitespace-only lines, never the indentation.
    return "try:\n" + _indent(body) + "\n" + handler
143
+
144
+
96
145
  def harden_ai_code(code: str) -> str:
97
146
  """
98
147
  Make any AI-generated cell resilient:
@@ -106,40 +155,54 @@ def harden_ai_code(code: str) -> str:
106
155
  # Remove any LLM-added try/except blocks (hardener adds its own)
107
156
  import re
108
157
 
109
- def strip_placeholders(code: str) -> str:
110
- code = re.sub(r"\bshow\(\s*\.\.\.\s*\)",
111
- "show('⚠ Block skipped due to an error.')",
112
- code)
113
- code = re.sub(r"\breturn\s+\.\.\.", "return None", code)
114
- return code
115
-
116
- def _indent(code: str, spaces: int = 4) -> str:
117
- pad = " " * spaces
118
- return "\n".join(pad + line for line in code.splitlines())
119
-
120
- def _SMX_OHE(**k):
121
- # normalise arg name across sklearn versions
122
- if "sparse" in k and "sparse_output" not in k:
123
- k["sparse_output"] = k.pop("sparse")
124
- # default behaviour we want
125
- k.setdefault("handle_unknown", "ignore")
126
- k.setdefault("sparse_output", False)
127
- try:
128
- # if running on old sklearn without sparse_output, translate back
129
- if "sparse_output" not in inspect.signature(OneHotEncoder).parameters:
130
- if "sparse_output" in k:
131
- k["sparse"] = k.pop("sparse_output")
132
- return OneHotEncoder(**k)
133
- except TypeError:
134
- # final fallback: try legacy name
135
- if "sparse_output" in k:
136
- k["sparse"] = k.pop("sparse_output")
137
- return OneHotEncoder(**k)
138
158
 
139
159
  def _strip_stray_backrefs(code: str) -> str:
140
160
  code = re.sub(r'(?m)^\s*\\\d+\s*', '', code)
141
161
  code = re.sub(r'(?m)[;]\s*\\\d+\s*', '; ', code)
142
162
  return code
163
+
164
+ def _patch_feature_coef_dataframe(code: str) -> str:
165
+ """
166
+ Harden patterns like:
167
+ coeffs_df = pd.DataFrame({'feature': num_features, 'coefficient': coef})
168
+ which can crash with:
169
+ ValueError: All arrays must be of the same length
170
+ We wrap them in a try/except and, on failure, rebuild the
171
+ DataFrame by zipping feature names with coefficients up to
172
+ the min length.
173
+ """
174
+ # Match single-line assignments of the form:
175
+ # <var> = pd.DataFrame({'feature': <feat>, 'coefficient': <coef>})
176
+ import re
177
+
178
+ pattern = re.compile(
179
+ r"(?P<indent>^[ \t]*)"
180
+ r"(?P<var>\w+)\s*=\s*pd\.DataFrame\(\s*{\s*"
181
+ r"['\"]feature['\"]\s*:\s*(?P<feat_expr>.+?)\s*,\s*"
182
+ r"['\"]coefficient['\"]\s*:\s*(?P<coef_expr>.+?)\s*"
183
+ r"}\s*\)\s*$",
184
+ re.MULTILINE,
185
+ )
186
+
187
+ def repl(m: re.Match) -> str:
188
+ indent = m.group("indent")
189
+ var = m.group("var")
190
+ feat_expr = m.group("feat_expr").strip()
191
+ coef_expr = m.group("coef_expr").strip()
192
+
193
+ # Keep the original intent, but add a safe fallback.
194
+ return (
195
+ f"{indent}try:\n"
196
+ f"{indent} {var} = pd.DataFrame({{'feature': {feat_expr}, 'coefficient': {coef_expr}}})\n"
197
+ f"{indent}except Exception as _e:\n"
198
+ f"{indent} import numpy as _np\n"
199
+ f"{indent} _feat = list({feat_expr})\n"
200
+ f"{indent} _coef = _np.asarray({coef_expr}).ravel()\n"
201
+ f"{indent} _k = min(len(_feat), len(_coef))\n"
202
+ f"{indent} {var} = pd.DataFrame({{'feature': _feat[:_k], 'coefficient': _coef[:_k]}})\n"
203
+ )
204
+
205
+ return pattern.sub(repl, code)
143
206
 
144
207
  def _wrap_metric_calls(code: str) -> str:
145
208
  names = [
@@ -224,6 +287,252 @@ def harden_ai_code(code: str) -> str:
224
287
  except Exception:
225
288
  pass
226
289
 
290
+ def _ensure_metrics_imports(code: str) -> str:
291
+ needed = set()
292
+ if "r2_score" in code:
293
+ needed.add("r2_score")
294
+ if "mean_absolute_error" in code:
295
+ needed.add("mean_absolute_error")
296
+ # ... add others if you like ...
297
+
298
+ if not needed:
299
+ return code
300
+
301
+ if "from sklearn.metrics import" in code:
302
+ return code # assume user/LLM handled it
303
+
304
+ import_line = "from sklearn.metrics import " + ", ".join(sorted(needed)) + "\n"
305
+ return import_line + code
306
+
307
+ def _fix_unexpected_indent(src: str) -> str:
308
+ """
309
+ Some LLM snippets jump indentation (e.g. extra 8 spaces on an 'import'
310
+ line) without a preceding block opener. That causes
311
+ `IndentationError: unexpected indent` when we wrap in our own `try:`.
312
+ This normalises those lines back to the previous indent level, but only
313
+ when we're not in a multi-line bracket/paren context.
314
+ """
315
+ lines = src.splitlines()
316
+ out = []
317
+ prev_indent = 0
318
+ prev_ends_colon = False
319
+ paren_depth = 0 # (), [], {} depth across lines (very approximate)
320
+
321
+ for raw in lines:
322
+ stripped = raw.lstrip()
323
+ if not stripped: # blank / whitespace line
324
+ out.append(raw)
325
+ continue
326
+
327
+ indent = len(raw) - len(stripped)
328
+
329
+ # Only flatten if:
330
+ # - we're not inside a (...) / [...] / {...} block, and
331
+ # - previous logical line did NOT end with ':', and
332
+ # - this line is indented more than the previous indent.
333
+ if paren_depth == 0 and not prev_ends_colon and indent > prev_indent:
334
+ indent = prev_indent
335
+ new_line = " " * indent + stripped
336
+ out.append(new_line)
337
+
338
+ # Update simple state for next line
339
+ txt = stripped
340
+ paren_depth += txt.count("(") + txt.count("[") + txt.count("{")
341
+ paren_depth -= txt.count(")") + txt.count("]") + txt.count("}")
342
+ prev_ends_colon = txt.rstrip().endswith(":")
343
+ prev_indent = indent
344
+
345
+ return "\n".join(out)
346
+
347
+ def _fallback_snippet() -> str:
348
+ """
349
+ Final-resort snippet when the LLM code is syntactically broken.
350
+
351
+ It:
352
+ - attempts a simple automatic ML task (classification or regression)
353
+ - then falls back to generic but useful EDA.
354
+
355
+ It assumes `from syntaxmatrix.preface import *` has already been done,
356
+ so `_SMX_OHE`, `_SMX_call`, `SB_histplot`, `_SMX_export_png` and the
357
+ patched `show()` are available.
358
+ """
359
+ import textwrap
360
+
361
+ return textwrap.dedent(
362
+ """\
363
+ import pandas as pd
364
+ import numpy as np
365
+ import matplotlib.pyplot as plt
366
+ from sklearn.model_selection import train_test_split
367
+ from sklearn.compose import ColumnTransformer
368
+ from sklearn.preprocessing import StandardScaler
369
+ from sklearn.linear_model import LogisticRegression, LinearRegression
370
+ from sklearn.metrics import accuracy_score, r2_score, mean_absolute_error
371
+
372
+ df = df.copy()
373
+
374
+ # --- basic column introspection ---
375
+ num_cols = df.select_dtypes(include=['number', 'bool']).columns.tolist()
376
+ cat_cols = [c for c in df.columns if c not in num_cols]
377
+
378
+ # --- attempt an automatic ML task ---
379
+ target_col = None
380
+ task_type = None
381
+
382
+ # Prefer a low-cardinality target (classification)
383
+ for c in num_cols + cat_cols:
384
+ uniq = df[c].dropna().nunique()
385
+ if 2 <= uniq <= 10:
386
+ target_col = c
387
+ task_type = 'classification'
388
+ break
389
+
390
+ # If none found, try a numeric regression target
391
+ if target_col is None and num_cols:
392
+ target_col = num_cols[-1]
393
+ task_type = 'regression'
394
+
395
+ if target_col is not None:
396
+ try:
397
+ X = df.drop(columns=[target_col]).copy()
398
+ y = df[target_col].copy()
399
+
400
+ num_feats = X.select_dtypes(include=['number', 'bool']).columns.tolist()
401
+ cat_feats = [c for c in X.columns if c not in num_feats]
402
+
403
+ pre = ColumnTransformer(
404
+ transformers=[
405
+ ('num', StandardScaler(), num_feats),
406
+ ('cat', _SMX_OHE(handle_unknown='ignore'), cat_feats),
407
+ ],
408
+ remainder='drop',
409
+ )
410
+
411
+ from sklearn.pipeline import Pipeline
412
+ if task_type == 'classification':
413
+ model = LogisticRegression(max_iter=1000)
414
+ else:
415
+ model = LinearRegression()
416
+
417
+ pipe = Pipeline([('pre', pre), ('model', model)])
418
+
419
+ X_train, X_test, y_train, y_test = train_test_split(
420
+ X, y, test_size=0.25, random_state=42
421
+ )
422
+
423
+ pipe.fit(X_train, y_train)
424
+ y_pred = pipe.predict(X_test)
425
+
426
+ if task_type == 'classification':
427
+ # If predictions look like probabilities, convert to labels
428
+ if getattr(y_pred, 'ndim', 1) > 1 and y_pred.shape[1] > 1:
429
+ y_pred_labels = y_pred.argmax(axis=1)
430
+ else:
431
+ try:
432
+ y_pred_labels = (y_pred > 0.5).astype(y_test.dtype)
433
+ except Exception:
434
+ y_pred_labels = y_pred
435
+
436
+ acc = _SMX_call(accuracy_score, y_test, y_pred_labels)
437
+ show({
438
+ 'target': target_col,
439
+ 'task': 'classification',
440
+ 'accuracy': acc,
441
+ })
442
+ else:
443
+ r2 = _SMX_call(r2_score, y_test, y_pred)
444
+ mae = _SMX_call(mean_absolute_error, y_test, y_pred)
445
+ show({
446
+ 'target': target_col,
447
+ 'task': 'regression',
448
+ 'r2': r2,
449
+ 'mae': mae,
450
+ })
451
+
452
+ except Exception as _ml_e:
453
+ show(f"⚠ ML fallback failed: {type(_ml_e).__name__}: {_ml_e}")
454
+
455
+ # --- EDA fallback that still helps answer the question ---
456
+ try:
457
+ info = {
458
+ 'rows': len(df),
459
+ 'cols': len(df.columns),
460
+ 'numeric_cols': len(num_cols),
461
+ 'categorical_cols': len(cat_cols),
462
+ }
463
+ show(df.head(), title='Sample of data')
464
+ show(info, title='Dataset summary')
465
+
466
+ # Quick univariate look if we have numeric columns
467
+ if num_cols:
468
+ SB_histplot()
469
+ _SMX_export_png()
470
+ except Exception as _eda_e:
471
+ show(f"⚠ EDA fallback failed: {type(_eda_e).__name__}: {_eda_e}")
472
+ """
473
+ )
474
+
475
+ def _strip_file_io_ops(code: str) -> str:
476
+ """
477
+ Remove obvious local file I/O operations in LLM code
478
+ so nothing writes to the container filesystem.
479
+ """
480
+ # 1) Methods like df.to_csv(...), df.to_excel(...), etc.
481
+ FILE_WRITE_METHODS = (
482
+ "to_csv", "to_excel", "to_pickle", "to_parquet",
483
+ "to_json", "to_hdf",
484
+ )
485
+
486
+ for mname in FILE_WRITE_METHODS:
487
+ pat = re.compile(
488
+ rf"(?m)^(\s*)([A-Za-z_][A-Za-z0-9_\.]*)\s*\.\s*{mname}\s*\([^)]*\)\s*$"
489
+ )
490
+
491
+ def _repl(match):
492
+ indent = match.group(1)
493
+ expr = match.group(2)
494
+ return f"{indent}# [SMX] stripped file write: {expr}.{mname}(...)"
495
+
496
+ code = pat.sub(_repl, code)
497
+
498
+ # 2) plt.savefig(...) calls
499
+ pat_savefig = re.compile(r"(?m)^(\s*)(plt\.savefig\s*\([^)]*\)\s*)$")
500
+ code = pat_savefig.sub(
501
+ lambda m: f"{m.group(1)}# [SMX] stripped savefig: {m.group(2).strip()}",
502
+ code,
503
+ )
504
+
505
+ # 3) with open(..., 'w'/'wb') as f:
506
+ pat_with_open = re.compile(
507
+ r"(?m)^(\s*)with\s+open\([^)]*['\"]w[b]?['\"][^)]*\)\s+as\s+([A-Za-z_][A-Za-z0-9_]*)\s*:\s*$"
508
+ )
509
+
510
+ def _with_open_repl(match):
511
+ indent = match.group(1)
512
+ var = match.group(2)
513
+ return f"{indent}if False: # [SMX] file write stripped (was: with open(... as {var}))"
514
+
515
+ code = pat_with_open.sub(_with_open_repl, code)
516
+
517
+ # 4) joblib.dump(...), pickle.dump(...)
518
+ for mod in ("joblib", "pickle"):
519
+ pat = re.compile(rf"(?m)^(\s*){mod}\.dump\s*\([^)]*\)\s*$")
520
+ code = pat.sub(
521
+ lambda m: f"{m.group(1)}# [SMX] stripped {mod}.dump(...)",
522
+ code,
523
+ )
524
+
525
+ # 5) bare open(..., 'w'/'wb') calls
526
+ pat_open = re.compile(
527
+ r"(?m)^(\s*)open\([^)]*['\"]w[b]?['\"][^)]*\)\s*$"
528
+ )
529
+ code = pat_open.sub(
530
+ lambda m: f"{m.group(1)}# [SMX] stripped open(..., 'w'/'wb')",
531
+ code,
532
+ )
533
+
534
+ return code
535
+
227
536
  # Register and run patches once per execution
228
537
  for _patch in (
229
538
  _smx_patch_mean_squared_error_squared_kw,
@@ -235,412 +544,6 @@ def harden_ai_code(code: str) -> str:
235
544
  except Exception:
236
545
  pass
237
546
 
238
- PREFACE = (
239
- "# === SMX Auto-Hardening Preface (do not edit) ===\n"
240
- "import warnings, numpy as np, pandas as pd, matplotlib.pyplot as plt\n"
241
- "warnings.filterwarnings('ignore')\n"
242
- "try:\n"
243
- " import seaborn as sns\n"
244
- "except Exception:\n"
245
- " class _Dummy:\n"
246
- " def __getattr__(self, name):\n"
247
- " def _f(*a, **k):\n"
248
- " from syntaxmatrix.display import show\n"
249
- " show('⚠ seaborn not available; plot skipped.')\n"
250
- " return _f\n"
251
- " sns = _Dummy()\n"
252
- "\n"
253
- "from syntaxmatrix.display import show as _SMX_base_show\n"
254
- "def _SMX_caption_from_ctx():\n"
255
- " g = globals()\n"
256
- " t = g.get('refined_question') or g.get('askai_question') or 'Table'\n"
257
- " return str(t).strip().splitlines()[0][:120]\n"
258
- "\n"
259
- "def show(obj, title=None):\n"
260
- " try:\n"
261
- " import pandas as pd\n"
262
- " if isinstance(obj, pd.DataFrame):\n"
263
- " cap = (title or _SMX_caption_from_ctx())\n"
264
- " try:\n"
265
- " return _SMX_base_show(obj.style.set_caption(cap))\n"
266
- " except Exception:\n"
267
- " pass\n"
268
- " except Exception:\n"
269
- " pass\n"
270
- " return _SMX_base_show(obj)\n"
271
- "\n"
272
- "def _SMX_axes_have_titles(fig=None):\n"
273
- " import matplotlib.pyplot as _plt\n"
274
- " fig = fig or _plt.gcf()\n"
275
- " try:\n"
276
- " for _ax in fig.get_axes():\n"
277
- " if (_ax.get_title() or '').strip():\n"
278
- " return True\n"
279
- " except Exception:\n"
280
- " pass\n"
281
- " return False\n"
282
- "\n"
283
- "def _SMX_export_png():\n"
284
- " import io, base64\n"
285
- " fig = plt.gcf()\n"
286
- " try:\n"
287
- " if not _SMX_axes_have_titles(fig):\n"
288
- " fig.suptitle(_SMX_caption_from_ctx(), fontsize=10)\n"
289
- " except Exception:\n"
290
- " pass\n"
291
- " buf = io.BytesIO()\n"
292
- " plt.savefig(buf, format='png', bbox_inches='tight')\n"
293
- " buf.seek(0)\n"
294
- " from IPython.display import display, HTML\n"
295
- " _img = base64.b64encode(buf.read()).decode('ascii')\n"
296
- " display(HTML(f\"<img src='data:image/png;base64,{_img}' style='max-width:100%;height:auto;border:1px solid #ccc;border-radius:4px;'/>\"))\n"
297
- " plt.close()\n"
298
- "\n"
299
- "def _pick_df():\n"
300
- " return globals().get('df', None)\n"
301
- "\n"
302
- "def _pick_ax_slot():\n"
303
- " ax = None\n"
304
- " try:\n"
305
- " _axes = globals().get('axes', None)\n"
306
- " import numpy as _np\n"
307
- " if _axes is not None:\n"
308
- " arr = _np.ravel(_axes)\n"
309
- " for _a in arr:\n"
310
- " try:\n"
311
- " if hasattr(_a,'has_data') and not _a.has_data():\n"
312
- " ax = _a; break\n"
313
- " except Exception:\n"
314
- " continue\n"
315
- " except Exception:\n"
316
- " ax = None\n"
317
- " return ax\n"
318
- "\n"
319
- "def _first_numeric(_d):\n"
320
- " import numpy as np, pandas as pd\n"
321
- " try:\n"
322
- " preferred = [\"median_house_value\", \"price\", \"value\", \"target\", \"label\", \"y\"]\n"
323
- " for c in preferred:\n"
324
- " if c in _d.columns and pd.api.types.is_numeric_dtype(_d[c]):\n"
325
- " return c\n"
326
- " cols = _d.select_dtypes(include=[np.number]).columns.tolist()\n"
327
- " return cols[0] if cols else None\n"
328
- " except Exception:\n"
329
- " return None\n"
330
- "\n"
331
- "def _first_categorical(_d):\n"
332
- " import pandas as pd, numpy as np\n"
333
- " try:\n"
334
- " num = set(_d.select_dtypes(include=[np.number]).columns.tolist())\n"
335
- " cand = [c for c in _d.columns if c not in num and _d[c].nunique(dropna=True) <= 50]\n"
336
- " return cand[0] if cand else None\n"
337
- " except Exception:\n"
338
- " return None\n"
339
- "\n"
340
- "boxplot = barplot = histplot = distplot = lineplot = countplot = heatmap = pairplot = None\n"
341
- "\n"
342
- "def _safe_plot(func, *args, **kwargs):\n"
343
- " try:\n"
344
- " ax = func(*args, **kwargs)\n"
345
- " if ax is None:\n"
346
- " ax = plt.gca()\n"
347
- " try:\n"
348
- " if hasattr(ax, 'has_data') and not ax.has_data():\n"
349
- " from syntaxmatrix.display import show as _show\n"
350
- " _show('⚠ Empty plot: no data drawn.')\n"
351
- " except Exception:\n"
352
- " pass\n"
353
- " try: plt.tight_layout()\n"
354
- " except Exception: pass\n"
355
- " return ax\n"
356
- " except Exception as e:\n"
357
- " from syntaxmatrix.display import show as _show\n"
358
- " _show(f'⚠ Plot skipped: {type(e).__name__}: {e}')\n"
359
- " return None\n"
360
- "\n"
361
- "def SB_histplot(*a, **k):\n"
362
- " _missing = (getattr(sns, '__class__', type(sns)).__name__ == '_Dummy')\n"
363
- " _sentinel = (len(a) >= 1 and a[0] is None)\n"
364
- " if (not a or _sentinel) and not k:\n"
365
- " d = _pick_df()\n"
366
- " if d is not None:\n"
367
- " x = _first_numeric(d)\n"
368
- " if x is not None:\n"
369
- " def _draw():\n"
370
- " plt.hist(d[x].dropna())\n"
371
- " ax = plt.gca()\n"
372
- " if not (ax.get_title() or '').strip():\n"
373
- " ax.set_title(f'Distribution of {x}')\n"
374
- " return ax\n"
375
- " return _safe_plot(lambda **kw: _draw())\n"
376
- " if _missing:\n"
377
- " return _safe_plot(lambda **kw: plt.hist([]))\n"
378
- " if _sentinel:\n"
379
- " a = a[1:]\n"
380
- " return _safe_plot(getattr(sns,'histplot', plt.hist), *a, **k)\n"
381
- "\n"
382
- "def SB_barplot(*a, **k):\n"
383
- " _missing = (getattr(sns, '__class__', type(sns)).__name__ == '_Dummy')\n"
384
- " _sentinel = (len(a) >= 1 and a[0] is None)\n"
385
- " _ax = k.get('ax') or _pick_ax_slot()\n"
386
- " if _ax is not None:\n"
387
- " try: plt.sca(_ax)\n"
388
- " except Exception: pass\n"
389
- " k.setdefault('ax', _ax)\n"
390
- " if (not a or _sentinel) and not k:\n"
391
- " d = _pick_df()\n"
392
- " if d is not None:\n"
393
- " x = _first_categorical(d)\n"
394
- " y = _first_numeric(d)\n"
395
- " if x and y:\n"
396
- " import pandas as _pd\n"
397
- " g = d.groupby(x)[y].mean().reset_index()\n"
398
- " def _draw():\n"
399
- " if _missing:\n"
400
- " plt.bar(g[x], g[y])\n"
401
- " else:\n"
402
- " sns.barplot(data=g, x=x, y=y, ax=k.get('ax'))\n"
403
- " ax = plt.gca()\n"
404
- " if not (ax.get_title() or '').strip():\n"
405
- " ax.set_title(f'Mean {y} by {x}')\n"
406
- " return ax\n"
407
- " return _safe_plot(lambda **kw: _draw())\n"
408
- " if _missing:\n"
409
- " return _safe_plot(lambda **kw: plt.bar([], []))\n"
410
- " if _sentinel:\n"
411
- " a = a[1:]\n"
412
- " return _safe_plot(sns.barplot, *a, **k)\n"
413
- "\n"
414
- "def SB_boxplot(*a, **k):\n"
415
- " _missing = (getattr(sns, '__class__', type(sns)).__name__ == '_Dummy')\n"
416
- " _sentinel = (len(a) >= 1 and a[0] is None)\n"
417
- " _ax = k.get('ax') or _pick_ax_slot()\n"
418
- " if _ax is not None:\n"
419
- " try: plt.sca(_ax)\n"
420
- " except Exception: pass\n"
421
- " k.setdefault('ax', _ax)\n"
422
- " if (not a or _sentinel) and not k:\n"
423
- " d = _pick_df()\n"
424
- " if d is not None:\n"
425
- " x = _first_categorical(d)\n"
426
- " y = _first_numeric(d)\n"
427
- " if x and y:\n"
428
- " def _draw():\n"
429
- " if _missing:\n"
430
- " plt.boxplot(d[y].dropna())\n"
431
- " else:\n"
432
- " sns.boxplot(data=d, x=x, y=y, ax=k.get('ax'))\n"
433
- " ax = plt.gca()\n"
434
- " if not (ax.get_title() or '').strip():\n"
435
- " ax.set_title(f'Distribution of {y} by {x}')\n"
436
- " return ax\n"
437
- " return _safe_plot(lambda **kw: _draw())\n"
438
- " if _missing:\n"
439
- " return _safe_plot(lambda **kw: plt.boxplot([]))\n"
440
- " if _sentinel:\n"
441
- " a = a[1:]\n"
442
- " return _safe_plot(sns.boxplot, *a, **k)\n"
443
- "\n"
444
- "def SB_scatterplot(*a, **k):\n"
445
- " _missing = (getattr(sns, '__class__', type(sns)).__name__ == '_Dummy')\n"
446
- " fn = getattr(sns,'scatterplot', None)\n"
447
- " # If seaborn is unavailable OR the caller passed (data=..., x='col', y='col'),\n"
448
- " # use a robust matplotlib path that looks up data and coerces to numeric.\n"
449
- " if _missing or fn is None:\n"
450
- " data = k.get('data'); x = k.get('x'); y = k.get('y')\n"
451
- " if data is not None and isinstance(x, str) and isinstance(y, str) and x in data.columns and y in data.columns:\n"
452
- " xs = pd.to_numeric(data[x], errors='coerce')\n"
453
- " ys = pd.to_numeric(data[y], errors='coerce')\n"
454
- " m = xs.notna() & ys.notna()\n"
455
- " def _draw():\n"
456
- " plt.scatter(xs[m], ys[m])\n"
457
- " ax = plt.gca()\n"
458
- " if not (ax.get_title() or '').strip():\n"
459
- " ax.set_title(f'{y} vs {x}')\n"
460
- " return ax\n"
461
- " return _safe_plot(lambda **kw: _draw())\n"
462
- " # else: fall back to auto-pick two numeric columns\n"
463
- " d = _pick_df()\n"
464
- " if d is not None:\n"
465
- " num = d.select_dtypes(include=[np.number]).columns.tolist()\n"
466
- " if len(num) >= 2:\n"
467
- " def _draw2():\n"
468
- " plt.scatter(d[num[0]], d[num[1]])\n"
469
- " ax = plt.gca()\n"
470
- " if not (ax.get_title() or '').strip():\n"
471
- " ax.set_title(f'{num[1]} vs {num[0]}')\n"
472
- " return ax\n"
473
- " return _safe_plot(lambda **kw: _draw2())\n"
474
- " return _safe_plot(lambda **kw: plt.scatter([], []))\n"
475
- " # seaborn path\n"
476
- " return _safe_plot(fn, *a, **k)\n"
477
- "\n"
478
- "def SB_heatmap(*a, **k):\n"
479
- " _missing = (getattr(sns, '__class__', type(sns)).__name__ == '_Dummy')\n"
480
- " data = None\n"
481
- " if a:\n"
482
- " data = a[0]\n"
483
- " elif 'data' in k:\n"
484
- " data = k['data']\n"
485
- " if data is None:\n"
486
- " d = _pick_df()\n"
487
- " try:\n"
488
- " if d is not None:\n"
489
- " import numpy as _np\n"
490
- " data = d.select_dtypes(include=[_np.number]).corr()\n"
491
- " except Exception:\n"
492
- " data = None\n"
493
- " if data is None:\n"
494
- " from syntaxmatrix.display import show as _show\n"
495
- " _show('⚠ Heatmap skipped: no data.')\n"
496
- " return None\n"
497
- " if not _missing and hasattr(sns, 'heatmap'):\n"
498
- " _k = {kk: vv for kk, vv in k.items() if kk != 'data'}\n"
499
- " def _draw():\n"
500
- " ax = sns.heatmap(data, **_k)\n"
501
- " try:\n"
502
- " ax = ax or plt.gca()\n"
503
- " if not (ax.get_title() or '').strip():\n"
504
- " ax.set_title('Correlation Heatmap')\n"
505
- " except Exception:\n"
506
- " pass\n"
507
- " return ax\n"
508
- " return _safe_plot(lambda **kw: _draw())\n"
509
- " def _mat_heat():\n"
510
- " im = plt.imshow(data, aspect='auto')\n"
511
- " try: plt.colorbar()\n"
512
- " except Exception: pass\n"
513
- " try:\n"
514
- " cols = list(getattr(data, 'columns', []))\n"
515
- " rows = list(getattr(data, 'index', []))\n"
516
- " if cols: plt.xticks(range(len(cols)), cols, rotation=90)\n"
517
- " if rows: plt.yticks(range(len(rows)), rows)\n"
518
- " except Exception:\n"
519
- " pass\n"
520
- " ax = plt.gca()\n"
521
- " try:\n"
522
- " if not (ax.get_title() or '').strip():\n"
523
- " ax.set_title('Correlation Heatmap')\n"
524
- " except Exception:\n"
525
- " pass\n"
526
- " return ax\n"
527
- " return _safe_plot(lambda **kw: _mat_heat())\n"
528
- "\n"
529
- "def _safe_concat(objs, **kwargs):\n"
530
- " import pandas as _pd\n"
531
- " if objs is None: return _pd.DataFrame()\n"
532
- " if isinstance(objs,(list,tuple)) and len(objs)==0: return _pd.DataFrame()\n"
533
- " try: return _pd.concat(objs, **kwargs)\n"
534
- " except Exception as e:\n"
535
- " show(f'⚠ concat skipped: {e}')\n"
536
- " return _pd.DataFrame()\n"
537
- "\n"
538
- "from sklearn.preprocessing import OneHotEncoder\n"
539
- "import inspect\n"
540
- "def _SMX_OHE(**k):\n"
541
- " # normalise arg name across sklearn versions\n"
542
- " if 'sparse' in k and 'sparse_output' not in k:\n"
543
- " k['sparse_output'] = k.pop('sparse')\n"
544
- " k.setdefault('handle_unknown','ignore')\n"
545
- " k.setdefault('sparse_output', False)\n"
546
- " try:\n"
547
- " sig = inspect.signature(OneHotEncoder)\n"
548
- " if 'sparse_output' not in sig.parameters and 'sparse_output' in k:\n"
549
- " k['sparse'] = k.pop('sparse_output')\n"
550
- " except Exception:\n"
551
- " if 'sparse_output' in k:\n"
552
- " k['sparse'] = k.pop('sparse_output')\n"
553
- " return OneHotEncoder(**k)\n"
554
- "\n"
555
- "import numpy as _np\n"
556
- "def _SMX_mm(a, b):\n"
557
- " try:\n"
558
- " return a @ b # normal path\n"
559
- " except Exception:\n"
560
- " try:\n"
561
- " A = _np.asarray(a); B = _np.asarray(b)\n"
562
- " # If same 2D shape (e.g. (n,k) & (n,k)), treat as row-wise dot\n"
563
- " if A.ndim==2 and B.ndim==2 and A.shape==B.shape:\n"
564
- " return (A * B).sum(axis=1)\n"
565
- " # Otherwise try element-wise product (broadcast if possible)\n"
566
- " return A * B\n"
567
- " except Exception as e:\n"
568
- " from syntaxmatrix.display import show\n"
569
- " show(f'⚠ Matmul relaxed: {type(e).__name__}: {e}'); return _np.nan\n"
570
- "\n"
571
- "def _SMX_call(fn, *a, **k):\n"
572
- " try:\n"
573
- " return fn(*a, **k)\n"
574
- " except TypeError as e:\n"
575
- " msg = str(e)\n"
576
- " if \"unexpected keyword argument 'squared'\" in msg:\n"
577
- " k.pop('squared', None)\n"
578
- " return fn(*a, **k)\n"
579
- " raise\n"
580
- "\n"
581
- "def _SMX_rmse(y_true, y_pred):\n"
582
- " try:\n"
583
- " from sklearn.metrics import mean_squared_error as _mse\n"
584
- " try:\n"
585
- " return _mse(y_true, y_pred, squared=False)\n"
586
- " except TypeError:\n"
587
- " return (_mse(y_true, y_pred)) ** 0.5\n"
588
- " except Exception:\n"
589
- " import numpy as _np\n"
590
- " yt = _np.asarray(y_true, dtype=float)\n"
591
- " yp = _np.asarray(y_pred, dtype=float)\n"
592
- " diff = yt - yp\n"
593
- " return float((_np.mean(diff * diff)) ** 0.5)\n"
594
- "\n"
595
- "import pandas as _pd\n"
596
- "import numpy as _np\n"
597
- "def _SMX_autocoerce_dates(_df):\n"
598
- " if _df is None or not hasattr(_df, 'columns'): return\n"
599
- " for c in list(_df.columns):\n"
600
- " s = _df[c]\n"
601
- " n = str(c).lower()\n"
602
- " if _pd.api.types.is_datetime64_any_dtype(s):\n"
603
- " continue\n"
604
- " if _pd.api.types.is_object_dtype(s) or ('date' in n or 'time' in n or 'timestamp' in n or n.endswith('_dt')):\n"
605
- " try:\n"
606
- " conv = _pd.to_datetime(s, errors='coerce', utc=True).dt.tz_localize(None)\n"
607
- " # accept only if at least 10% (min 3) parse as dates\n"
608
- " if getattr(conv, 'notna', lambda: _pd.Series([]))().sum() >= max(3, int(0.1*len(_df))):\n"
609
- " _df[c] = conv\n"
610
- " except Exception:\n"
611
- " pass\n"
612
- "\n"
613
- "def _SMX_autocoerce_numeric(_df, cols):\n"
614
- " if _df is None: return\n"
615
- " for c in cols:\n"
616
- " if c in getattr(_df, 'columns', []):\n"
617
- " try:\n"
618
- " _df[c] = _pd.to_numeric(_df[c], errors='coerce')\n"
619
- " except Exception:\n"
620
- " pass\n"
621
- "\n"
622
- "def show(obj, title=None):\n"
623
- " try:\n"
624
- " import pandas as pd, numbers\n"
625
- " cap = (title or _SMX_caption_from_ctx())\n"
626
- " # 1) DataFrame → Styler with caption\n"
627
- " if isinstance(obj, pd.DataFrame):\n"
628
- " try: return _SMX_base_show(obj.style.set_caption(cap))\n"
629
- " except Exception: pass\n"
630
- " # 2) dict of scalars → DataFrame with caption\n"
631
- " if isinstance(obj, dict) and all(isinstance(v, numbers.Number) for v in obj.values()):\n"
632
- " df_ = pd.DataFrame({'metric': list(obj.keys()), 'value': list(obj.values())})\n"
633
- " try: return _SMX_base_show(df_.style.set_caption(cap))\n"
634
- " except Exception: return _SMX_base_show(df_)\n"
635
- " except Exception:\n"
636
- " pass\n"
637
- " return _SMX_base_show(obj)\n"
638
- )
639
-
640
- PREFACE_IMPORT = "from syntaxmatrix.smx_preface import *\n"
641
- # if PREFACE not in code:
642
- # code = PREFACE_IMPORT + code
643
-
644
547
  fixed = code
645
548
 
646
549
  fixed = re.sub(
@@ -690,6 +593,24 @@ def harden_ai_code(code: str) -> str:
690
593
  fixed
691
594
  )
692
595
 
596
+ try:
597
+ ast.parse(fixed)
598
+ except (SyntaxError, IndentationError):
599
+ fixed = _fallback_snippet()
600
+
601
+ fixed = re.sub(
602
+ r"except\s+Exception\s+as\s+e:\s*\n\s*show\(\.\.\.\)",
603
+ "except Exception as e:\n show(f\"⚠ Block skipped due to: {type(e).__name__}: {e}\")",
604
+ fixed,
605
+ )
606
+
607
+ # Fix placeholder Ellipsis handlers from LLM
608
+ fixed = re.sub(
609
+ r"except\s+Exception\s+as\s+e:\s*\n\s*show\(\.\.\.\)",
610
+ "except Exception as e:\n show(f\"⚠ Block skipped due to: {type(e).__name__}: {e}\")",
611
+ fixed,
612
+ )
613
+
693
614
  try:
694
615
  class _SMXMatmulRewriter(ast.NodeTransformer):
695
616
  def visit_BinOp(self, node):
@@ -708,44 +629,16 @@ def harden_ai_code(code: str) -> str:
708
629
  # 6) Final safety wrapper
709
630
  fixed = fixed.replace("\t", " ")
710
631
  fixed = textwrap.dedent(fixed).strip("\n")
711
-
632
+ fixed = _ensure_metrics_imports(fixed)
712
633
  fixed = _strip_stray_backrefs(fixed)
713
634
  fixed = _wrap_metric_calls(fixed)
635
+ fixed = _fix_unexpected_indent(fixed)
636
+ fixed = _patch_feature_coef_dataframe(fixed)
637
+ fixed = _strip_file_io_ops(fixed)
714
638
 
715
- # If the transformed code is still not syntactically valid, fall back to a
716
- # very defensive generic snippet that depends only on `df`. This guarantees
717
- try:
718
- ast.parse(fixed)
719
- except (SyntaxError, IndentationError):
720
- fixed = (
721
- "import pandas as pd\n"
722
- "df = df.copy()\n"
723
- "_info = {\n"
724
- " 'rows': len(df),\n"
725
- " 'cols': len(df.columns),\n"
726
- " 'numeric_cols': len(df.select_dtypes(include=['number','bool']).columns),\n"
727
- " 'categorical_cols': len(df.select_dtypes(exclude=['number','bool']).columns),\n"
728
- "}\n"
729
- "show(df.head(), title='Sample of data')\n"
730
- "show(_info, title='Dataset summary')\n"
731
- "try:\n"
732
- " _num = df.select_dtypes(include=['number','bool']).columns.tolist()\n"
733
- " if _num:\n"
734
- " SB_histplot()\n"
735
- " _SMX_export_png()\n"
736
- "except Exception as e:\n"
737
- " show(f\"⚠ Fallback visualisation failed: {type(e).__name__}: {e}\")\n"
738
- )
739
-
740
- # Fix placeholder Ellipsis handlers from LLM
741
- fixed = re.sub(
742
- r"except\s+Exception\s+as\s+e:\s*\n\s*show\(\.\.\.\)",
743
- "except Exception as e:\n show(f\"⚠ Block skipped due to: {type(e).__name__}: {e}\")",
744
- fixed,
745
- )
746
-
747
- wrapped = PREFACE + "try:\n" + _indent(fixed) + "\nexcept Exception as e:\n show(...)\n"
748
- wrapped = wrapped.lstrip()
639
+ # Import shared preface helpers once and wrap the LLM body safely
640
+ header = "from syntaxmatrix.preface import *\n\n"
641
+ wrapped = header + wrap_llm_code_safe(fixed)
749
642
  return wrapped
750
643
 
751
644
 
@@ -754,17 +647,6 @@ def indent_code(code: str, spaces: int = 4) -> str:
754
647
  return "\n".join(pad + line for line in code.splitlines())
755
648
 
756
649
 
757
- def wrap_llm_code_safe(code: str) -> str:
758
- # Swallow any runtime error from the LLM block instead of crashing the run
759
- return (
760
- "# __SAFE_WRAPPED__\n"
761
- "try:\n" + indent_code(code) + "\n"
762
- "except Exception as e:\n"
763
- " from syntaxmatrix.display import show\n"
764
- " show(f\"⚠️ Skipped LLM block due to: {type(e).__name__}: {e}\")\n"
765
- )
766
-
767
-
768
650
  def fix_boxplot_placeholder(code: str) -> str:
769
651
  # Replace invalid 'sns.boxplot(boxplot)' with a safe call using df/group_label/m
770
652
  return re.sub(