syntaxmatrix 2.3.5__py3-none-any.whl → 2.5.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
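Note on usage: every function added in this module returns Python source text (via textwrap.dedent) rather than running anything itself. The generated snippets assume a driver elsewhere in syntaxmatrix injects a "preface" before executing them: a dataframe bound to df, a show(obj, title=...) display helper, and optional globals such as SMX_SAMPLE_CAP and _SMX_OHE (the templates reference all of these). A minimal sketch of such a driver follows; run_template, the lambda-based show, the cap value and the file name are illustrative assumptions, not part of the package's API.

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from syntaxmatrix import model_templates

def run_template(code: str, frame: pd.DataFrame) -> None:
    # Stand-in for the package's own preface: the templates expect these
    # names to already exist in the namespace the code is exec'd in.
    namespace = {
        "df": frame,
        "show": lambda obj, title=None: print(title or "", obj, sep="\n"),
        "SMX_SAMPLE_CAP": 5000,
        "_SMX_OHE": lambda: OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    }
    exec(code, namespace)

frame = pd.read_csv("data.csv")  # any tabular dataset
run_template(model_templates.classification(frame), frame)

The same driver pattern works for the other templates (regression, clustering, eda_overview, and so on), since they all share the df/show contract.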
@@ -0,0 +1,1790 @@
1
+ # syntaxmatrix/model_templates.py
2
+ from textwrap import dedent
3
+
4
+
5
+ def classification(df, target=None):
6
+ code = dedent("""
7
+ # ==== CLASSIFICATION BASELINE (titles + shared SMX_SAMPLE_CAP) ====
8
+ import numpy as np, pandas as pd
9
+ import matplotlib.pyplot as plt
10
+ from sklearn.model_selection import train_test_split
11
+ from sklearn.compose import ColumnTransformer
12
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
13
+ from sklearn.pipeline import Pipeline
14
+ from sklearn.linear_model import LogisticRegression
15
+ from sklearn.metrics import (
16
+ classification_report, confusion_matrix, roc_curve, auc,
17
+ precision_recall_curve, average_precision_score, accuracy_score,
18
+ f1_score, recall_score, precision_score
19
+ )
20
+
21
+ _work = df.copy()
22
+
23
+ # --- 0) Sample cap (from PREFACE) ---
24
+ try:
25
+ CAP = int(SMX_SAMPLE_CAP)
26
+ except Exception:
27
+ CAP = 5000
28
+ if len(_work) > CAP:
29
+ _work = _work.sample(n=CAP, random_state=42)
30
+
31
+ # --- 1) Choose target (use hint if valid, else heuristic) ---
32
+ _hint = __SMX_TARGET_HINT__
33
+ target = _hint if (_hint is not None and str(_hint) in _work.columns) else None
34
+ if target is None:
35
+ prefs = ['target','label','class','y','outcome','churn','default','is_fraud','clicked','purchased']
36
+ for c in prefs:
37
+ if c in _work.columns:
38
+ target = c; break
39
+ if target is None:
40
+ # choose low-cardinality column
41
+ cand = [(c, _work[c].nunique(dropna=True)) for c in _work.columns]
42
+ cand = [c for c, k in cand if k <= 20 and c.lower() not in ('id','uuid')]
43
+ target = cand[-1] if cand else None
44
+
45
+ if target is None:
46
+ show("No obvious classification target found.", title="Classification")
47
+ elif _work[target].nunique(dropna=True) < 2:
48
+ show(f"Target '{target}' has fewer than two classes.", title="Classification")
49
+ else:
50
+ X = _work.drop(columns=[target])
51
+ y = _work[target].astype(str)
52
+
53
+ num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
54
+ cat_cols = X.select_dtypes(include=["object","category","string","bool"]).columns.tolist()
55
+
56
+ # robust OneHot across sklearn versions (uses PREFACE helper if present)
57
+ try:
58
+ enc = _SMX_OHE()
59
+ except Exception:
60
+ enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
61
+
62
+ pre = ColumnTransformer(
63
+ transformers=[
64
+ ("num", Pipeline([("scaler", StandardScaler())]), num_cols) if num_cols else ("num","drop",[]),
65
+ ("cat", enc, cat_cols) if cat_cols else ("cat","drop",[]),
66
+ ],
67
+ remainder="drop"
68
+ )
69
+
70
+ X_train, X_test, y_train, y_test = train_test_split(
71
+ X, y, test_size=0.2, random_state=42, stratify=y if y.nunique() > 1 else None
72
+ )
73
+
74
+ clf = Pipeline([
75
+ ("pre", pre),
76
+ ("est", LogisticRegression(max_iter=1000, class_weight="balanced"))
77
+ ])
78
+ clf.fit(X_train, y_train)
79
+
80
+ y_pred = clf.predict(X_test)
81
+ try:
82
+ proba = clf.predict_proba(X_test)
83
+ y_score = proba.max(axis=1)
84
+ except Exception:
85
+ proba, y_score = None, None
86
+
87
+ # --- 2) Tables with explicit titles ---
88
+ cr = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
89
+ show(pd.DataFrame(cr).transpose(), title="Classification report")
90
+
91
+ # Confusion matrix (robust labels)
92
+ labels_list = sorted(list(pd.unique(y)))
93
+ cm = confusion_matrix(y_test, y_pred, labels=labels_list)
94
+ index = [f"true:{str(lbl)}" for lbl in labels_list]
95
+ columns = [f"pred:{str(lbl)}" for lbl in labels_list]
96
+ cm_df = pd.DataFrame(cm, index=index, columns=columns)
97
+ show(cm_df, title="Confusion matrix")
98
+
99
+ summary = {
100
+ "accuracy": float(accuracy_score(y_test, y_pred)),
101
+ "precision_macro": float(precision_score(y_test, y_pred, average="macro", zero_division=0)),
102
+ "recall_macro": float(recall_score(y_test, y_pred, average="macro", zero_division=0)),
103
+ "f1_macro": float(f1_score(y_test, y_pred, average="macro", zero_division=0)),
104
+ "classes": int(len(labels_list)),
105
+ "rows_used": int(len(_work))
106
+ }
107
+ show(summary, title="Metrics summary")
108
+
109
+ # --- 3) ROC / PR curves for binary (best-effort) ---
110
+ if proba is not None and len(labels_list) == 2:
111
+ pos = labels_list[1]
112
+ y_bin = (y_test == pos).astype(int)
113
+ y_prob = proba[:, 1]
114
+ fpr, tpr, _ = roc_curve(y_bin, y_prob)
115
+ roc_auc = auc(fpr, tpr)
116
+
117
+ fig, ax = plt.subplots(figsize=(6,5))
118
+ ax.plot(fpr, tpr)
119
+ ax.plot([0,1],[0,1], linestyle="--")
120
+ ax.set_title(f"ROC curve (AUC={roc_auc:.3f})")
121
+ ax.set_xlabel("FPR"); ax.set_ylabel("TPR")
122
+ plt.tight_layout(); plt.show()
123
+
124
+ prec, rec, _ = precision_recall_curve(y_bin, y_prob)
125
+ ap = average_precision_score(y_bin, y_prob)
126
+ fig2, ax2 = plt.subplots(figsize=(6,4))
127
+ ax2.plot(rec, prec)
128
+ ax2.set_title(f"Precision–Recall (AP={ap:.3f})")
129
+ ax2.set_xlabel("Recall"); ax2.set_ylabel("Precision")
130
+ plt.tight_layout(); plt.show()
131
+
132
+ # --- 4) Predictions sample (captioned) ---
133
+ out = X_test.copy()
134
+ out["_true"] = y_test.values
135
+ out["_pred"] = y_pred
136
+ if y_score is not None:
137
+ out["_score"] = y_score
138
+ show(out.head(20), title="Predictions (sample)")
139
+ """)
140
+ return code.replace("__SMX_TARGET_HINT__", repr(target))
141
+
142
+
143
+ def regression(df, target=None):
144
+ code = dedent("""
145
+ # ==== REGRESSION BASELINE (titles + shared SMX_SAMPLE_CAP) ====
146
+ import numpy as np, pandas as pd
147
+ import matplotlib.pyplot as plt
148
+ from sklearn.model_selection import train_test_split
149
+ from sklearn.compose import ColumnTransformer
150
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
151
+ from sklearn.pipeline import Pipeline
152
+ from sklearn.linear_model import Ridge
153
+ from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
154
+
155
+ _work = df.copy()
156
+ try:
157
+ CAP = int(SMX_SAMPLE_CAP)
158
+ except Exception:
159
+ CAP = 5000
160
+ if len(_work) > CAP:
161
+ _work = _work.sample(n=CAP, random_state=42)
162
+
163
+ # target pick (hint first)
164
+ _hint = __SMX_TARGET_HINT__
165
+ target = _hint if (_hint is not None and str(_hint) in _work.columns) else None
166
+
167
+ if target is None:
168
+ num_cols_all = _work.select_dtypes(include=[np.number]).columns.tolist()
169
+ for c in ['target','y','price','amount','value','score','sales','revenue']:
170
+ if c in num_cols_all:
171
+ target = c; break
172
+ if target is None and num_cols_all:
173
+ target = num_cols_all[-1]
174
+
175
+ if target is None:
176
+ show("No numeric target found for regression.", title="Regression")
177
+ else:
178
+ X = _work.drop(columns=[target]); y = _work[target].astype(float)
179
+
180
+ num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
181
+ cat_cols = X.select_dtypes(include=["object","category","string","bool"]).columns.tolist()
182
+
183
+ try:
184
+ enc = _SMX_OHE()
185
+ except Exception:
186
+ enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
187
+
188
+ pre = ColumnTransformer(
189
+ transformers=[
190
+ ("num", Pipeline([("scaler", StandardScaler())]), num_cols) if num_cols else ("num","drop",[]),
191
+ ("cat", enc, cat_cols) if cat_cols else ("cat","drop",[]),
192
+ ],
193
+ remainder="drop"
194
+ )
195
+
196
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
197
+
198
+ model = Pipeline([("pre", pre), ("est", Ridge(alpha=1.0, random_state=42))])
199
+ model.fit(X_train, y_train)
200
+
201
+ y_pred = model.predict(X_test)
202
+
203
+ mse = mean_squared_error(y_test, y_pred)
204
+ rmse = float(np.sqrt(mse))
205
+ mae = mean_absolute_error(y_test, y_pred)
206
+ r2 = r2_score(y_test, y_pred)
207
+
208
+ show({"MAE": float(mae), "MSE": float(mse), "RMSE": rmse, "R²": float(r2), "rows_used": int(len(_work))},
209
+ title="Regression metrics")
210
+
211
+ fig, ax = plt.subplots(figsize=(6,5))
212
+ ax.scatter(y_test, y_pred, s=18, alpha=0.7)
213
+ ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], linestyle="--")
214
+ ax.set_title("Parity plot (y vs ŷ)"); ax.set_xlabel("Actual"); ax.set_ylabel("Predicted")
215
+ plt.tight_layout(); plt.show()
216
+
217
+ resid = y_test - y_pred
218
+ fig2, ax2 = plt.subplots(figsize=(6,4))
219
+ ax2.scatter(y_pred, resid, s=16, alpha=0.7)
220
+ ax2.axhline(0.0, linestyle="--")
221
+ ax2.set_title("Residuals vs predicted"); ax2.set_xlabel("Predicted"); ax2.set_ylabel("Residual")
222
+ plt.tight_layout(); plt.show()
223
+
224
+ out = X_test.copy(); out["_actual"] = y_test.values; out["_pred"] = y_pred; out["_residual"] = resid
225
+ show(out.head(20), title="Predictions (sample)")
226
+ """)
227
+ return code.replace("__SMX_TARGET_HINT__", repr(target))
228
+
229
+
230
+ def multilabel_classification(df, label_cols):
231
+ """
232
+ Baseline multi-label pipeline:
233
+ - X: numeric features only (excludes label_cols)
234
+ - y: df[label_cols] (2D binary frame)
235
+ - Model: OneVsRest(LogisticRegression)
236
+ - Metrics: subset accuracy, hamming loss, micro/macro F1, per-label ROC AUC
237
+ - Confusion matrices: from_predictions per label (no estimator wrapper)
238
+ """
239
+ return dedent(f"""
240
+ import numpy as np
241
+ import pandas as pd
242
+ import matplotlib.pyplot as plt
243
+ from sklearn.preprocessing import StandardScaler
244
+ from sklearn.linear_model import LogisticRegression
245
+ from sklearn.multiclass import OneVsRestClassifier
246
+ from sklearn.pipeline import Pipeline
247
+ from sklearn.model_selection import train_test_split
248
+ from sklearn.metrics import (
249
+ accuracy_score, hamming_loss, f1_score, roc_auc_score,
250
+ classification_report, ConfusionMatrixDisplay
251
+ )
252
+
253
+ LABEL_COLS = {list(label_cols)}
254
+
255
+ # X = numeric features only, drop labels
256
+ X = df.drop(columns=LABEL_COLS).select_dtypes(include=['number','bool']).copy()
257
+ y = df[LABEL_COLS].astype(int).copy()
258
+
259
+ if X.empty:
260
+ raise ValueError("No numeric features available for multi-label classification.")
261
+ if y.shape[1] < 2:
262
+ raise ValueError("Need at least two label columns for multi-label classification.")
263
+
264
+ X_train, X_test, y_train, y_test = train_test_split(
265
+ X, y, test_size=0.2, random_state=42, stratify=y.sum(axis=1) if y.sum(axis=1).nunique()>1 else None
266
+ )
267
+
268
+ pipeline = Pipeline(steps=[
269
+ ("scaler", StandardScaler(with_mean=False)),
270
+ ("clf", OneVsRestClassifier(LogisticRegression(max_iter=200, n_jobs=None)))
271
+ ])
272
+
273
+ pipeline.fit(X_train, y_train)
274
+
275
+ y_pred = pipeline.predict(X_test)
276
+ # Probas for AUC (fallback to zeros if not available)
277
+ try:
278
+ y_proba = pipeline.predict_proba(X_test)
279
+ y_proba = np.column_stack([p[:, 1] if p.ndim == 2 else p for p in y_proba]) if isinstance(y_proba, list) else np.asarray(y_proba)
280
+ except Exception:
281
+ y_proba = np.zeros_like(y_pred, dtype=float)
282
+
283
+ # Aggregate metrics
284
+ metrics_row = {{
285
+ "accuracy": accuracy_score(y_test, y_pred),
286
+ "hamming_loss": hamming_loss(y_test, y_pred),
287
+ "f1_micro": f1_score(y_test, y_pred, average="micro", zero_division=0),
288
+ "f1_macro": f1_score(y_test, y_pred, average="macro", zero_division=0),
289
+ }}
290
+ # macro ROC AUC if we have probabilities
291
+ try:
292
+ metrics_row["roc_auc_macro"] = roc_auc_score(y_test, y_proba, average="macro")
293
+ except Exception:
294
+ metrics_row["roc_auc_macro"] = np.nan
295
+
296
+ show(pd.DataFrame([metrics_row]))
297
+
298
+ # Per-label report and ROC AUC
299
+ report_rows = []
300
+ for j, col in enumerate(LABEL_COLS):
301
+ try:
302
+ auc = roc_auc_score(y_test.iloc[:, j], y_proba[:, j]) if y_proba.size else np.nan
303
+ except Exception:
304
+ auc = np.nan
305
+ report = classification_report(
306
+ y_test.iloc[:, j], y_pred[:, j], output_dict=True, zero_division=0
307
+ )
308
+ report_rows.append({{"label": col, "roc_auc": auc, "precision": report["weighted avg"]["precision"], "recall": report["weighted avg"]["recall"], "f1": report["weighted avg"]["f1-score"]}})
309
+ show(pd.DataFrame(report_rows))
310
+
311
+ # Confusion matrices per label — use from_predictions (no estimator wrapper needed)
312
+ n = len(LABEL_COLS)
313
+ ncols = 3 if n >= 3 else n
314
+ nrows = int(np.ceil(n / ncols)) if ncols else 1
315
+ fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(4*ncols, 3*nrows))
316
+ axes = axes.ravel() if n > 1 else [axes]
317
+ for i, col in enumerate(LABEL_COLS[:len(axes)]):
318
+ ConfusionMatrixDisplay.from_predictions(
319
+ y_test.iloc[:, i], y_pred[:, i], ax=axes[i], cmap=plt.cm.Blues
320
+ )
321
+ axes[i].set_title(col)
322
+ plt.tight_layout()
323
+ plt.show()
324
+ """)
325
+
326
+
327
+ def eda_overview(df):
328
+ return dedent("""
329
+ # ── Auto-generated EDA overview ───────────────
330
+ import pandas as pd
331
+ import matplotlib.pyplot as plt
332
+ import seaborn as sns
333
+
334
+ _df = df.copy()
335
+ num_cols = _df.select_dtypes(include=['number', 'bool']).columns.tolist()
336
+
337
+ if num_cols:
338
+ summary = _df[num_cols].describe().T.reset_index().rename(columns={'index': 'feature'})
339
+ show(summary)
340
+
341
+ sample = _df[num_cols]
342
+ if len(sample) > 500:
343
+ sample = sample.sample(500, random_state=42)
344
+
345
+ sns.pairplot(sample)
346
+ plt.tight_layout()
347
+ plt.show()
348
+ else:
349
+ show("No numeric columns available for EDA overview.")
350
+ """)
351
+
352
+
353
+ def eda_correlation(df):
354
+ return dedent("""
355
+ # ── Auto-generated correlation analysis ───────────────
356
+ import pandas as pd
357
+ import matplotlib.pyplot as plt
358
+ import seaborn as sns
359
+
360
+ _df = df.copy()
361
+ num_cols = _df.select_dtypes(include=['number', 'bool']).columns.tolist()
362
+ if not num_cols:
363
+ raise ValueError("No numeric columns available for correlation analysis.")
364
+
365
+ corr = _df[num_cols].corr()
366
+ show(corr)
367
+
368
+ plt.figure(figsize=(8, 6))
369
+ sns.heatmap(corr, annot=False, cmap="coolwarm", center=0)
370
+ plt.title("Correlation heatmap (numeric features)")
371
+ plt.tight_layout()
372
+ plt.show()
373
+ """)
374
+
375
+
376
+ def anomaly_detection(df):
377
+ return dedent("""
378
+ # ── Auto-generated IsolationForest anomaly detection ─────────────
379
+ import numpy as np
380
+ import pandas as pd
381
+ from sklearn.ensemble import IsolationForest
382
+ from sklearn.preprocessing import OneHotEncoder
383
+ from sklearn.compose import ColumnTransformer
384
+ from sklearn.pipeline import Pipeline
385
+ from IPython.display import display, HTML
386
+
387
+ # Split numeric vs categorical for simple preprocessing
388
+ num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
389
+ cat_cols = [c for c in df.columns if c not in num_cols]
390
+
391
+ if len(num_cols) + len(cat_cols) == 0:
392
+ raise ValueError("No usable columns for anomaly detection.")
393
+
394
+ preproc = ColumnTransformer(
395
+ transformers=[
396
+ ("num", "passthrough", num_cols),
397
+ ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
398
+ ],
399
+ remainder="drop",
400
+ verbose_feature_names_out=False,
401
+ )
402
+
403
+ model = IsolationForest(
404
+ n_estimators=300,
405
+ contamination="auto",
406
+ random_state=42
407
+ )
408
+
409
+ pipe = Pipeline([
410
+ ("prep", preproc),
411
+ ("iso", model),
412
+ ])
413
+
414
+ X = df[num_cols + cat_cols].copy()
415
+ pipe.fit(X)
416
+
417
+ # More negative = more anomalous in sklearn's score_samples
418
+ scores = pipe.named_steps["iso"].score_samples(pipe.named_steps["prep"].transform(X))
419
+ out = df.copy()
420
+ out["anomaly_score"] = -scores
421
+
422
+ # Flag top 5% as anomalies (simple heuristic)
423
+ threshold = np.percentile(out["anomaly_score"], 95)
424
+ out["is_anomaly"] = out["anomaly_score"] >= threshold
425
+
426
+ # Show the most anomalous rows
427
+ top = out.sort_values("anomaly_score", ascending=False).head(20)
428
+ display(HTML(top.to_html(index=False)))
429
+ """)
430
+
431
+
432
+ def ts_anomaly_detection(df):
433
+ return dedent("""
434
+ # ==== TIME-SERIES ANOMALY DETECTION ====
435
+ # Prefers STL (statsmodels). If not available, falls back to rolling-MAD.
436
+ import numpy as np, pandas as pd
437
+ import matplotlib.pyplot as plt
438
+
439
+ _df = df.copy()
440
+
441
+ # --- 1) Find a datetime column (or use datetime index) ---
442
+ time_col = None
443
+ if isinstance(_df.index, pd.DatetimeIndex):
444
+ _df = _df.reset_index().rename(columns={"index": "timestamp"})
445
+ time_col = "timestamp"
446
+ else:
447
+ # try common names first, then dtype-based
448
+ preferred = [c for c in _df.columns if ("date" in c.lower() or "time" in c.lower())]
449
+ dt_candidates = preferred + _df.select_dtypes(include=["datetime64[ns]", "datetime64[ns, UTC]"]).columns.tolist()
450
+ for c in dt_candidates or _df.columns.tolist():
451
+ try:
452
+ converted = pd.to_datetime(_df[c], errors="coerce")  # avoid mutating columns that do not qualify
454
+ if converted.notna().sum() >= 3:
455
+ _df[c] = converted; time_col = c
455
+ break
456
+ except Exception:
457
+ pass
458
+
459
+ if time_col is None:
460
+ show("No timestamp/datetime column found. Provide a column like 'date' or 'timestamp'.")
461
+ else:
462
+ # --- 2) Pick a numeric value column ---
463
+ num_cols = _df.select_dtypes(include=[np.number]).columns.tolist()
464
+ preferred_vals = [c for c in num_cols if any(k in c.lower() for k in ["value","amount","count","y","target"])]
465
+ value_col = preferred_vals[0] if preferred_vals else (num_cols[0] if num_cols else None)
466
+
467
+ if value_col is None:
468
+ show("No numeric value column found for time-series analysis.")
469
+ else:
470
+ ts = _df[[time_col, value_col]].dropna().sort_values(time_col).set_index(time_col)
471
+
472
+ # --- 3) Infer resample rule (D/W/M) ---
473
+ def _choose_rule(idx):
474
+ if len(idx) < 3: return "D"
475
+ # median gap in seconds
476
+ arr = idx.view("i8")
477
+ diffs = np.diff(arr) / 1e9 if len(arr) > 1 else np.array([0.0])
478
+ med = np.median(diffs) if len(diffs) else 0.0
479
+ day = 86400.0
480
+ if med <= day: return "D"
481
+ if med <= 7 * day: return "W"
482
+ return "M"
483
+
484
+ rule = _choose_rule(ts.index.values)
485
+ period_map = {"D": 7, "W": 52, "M": 12}
486
+ period = period_map.get(rule, 7)
487
+
488
+ # --- 4) Resample & detect anomalies (STL or fallback) ---
489
+ ts_res = ts.resample(rule).mean().dropna()
490
+ used_statsmodels = False
491
+ try:
492
+ from statsmodels.tsa.seasonal import STL
493
+ used_statsmodels = True
494
+ stl = STL(ts_res[value_col], robust=True, period=period)
495
+ res = stl.fit()
496
+ trend = res.trend
497
+ resid = res.resid
498
+ seasonal = res.seasonal
499
+ # robust z-score
500
+ mad = np.median(np.abs(resid - np.median(resid))) or 1e-8
501
+ z = np.abs(resid) / (1.4826 * mad)
502
+ anomalies = z > 3.5
503
+ except Exception:
504
+ # --- Rolling-MAD fallback (no statsmodels required) ---
505
+ used_statsmodels = False
506
+ series = ts_res[value_col]
507
+ # choose an odd window scaled to series length
508
+ n = max(7, min(61, (len(series) // 10) * 2 + 1))
509
+ med = series.rolling(window=n, center=True, min_periods=max(3, n // 3)).median()
510
+ resid = series - med
511
+ mad = (np.abs(resid)).rolling(window=n, center=True, min_periods=max(3, n // 3)).median()
512
+ # robust scale; avoid zeros
513
+ scale = (1.4826 * mad).replace(0, np.nan)
514
+ scale = scale.fillna(scale.median() or 1e-8)
515
+ z = np.abs(resid) / scale
516
+ anomalies = z > 3.5
517
+ trend = med
518
+ seasonal = pd.Series(0.0, index=series.index)
519
+
520
+ out = ts_res.copy()
521
+ out["trend"] = trend.reindex(out.index)
522
+ out["resid"] = resid.reindex(out.index)
523
+ out["zscore"] = z.reindex(out.index)
524
+ out["anomaly"] = anomalies.reindex(out.index).astype(bool)
525
+
526
+ # --- 5) UI outputs (no prints) ---
527
+ mode_note = "STL (statsmodels)" if used_statsmodels else "Rolling-MAD fallback"
528
+ show({"method": mode_note, "frequency": rule, "period": period, "points": int(out.shape[0]), "anomalies": int(out["anomaly"].sum())})
529
+ show(out[out["anomaly"]].head(30))
530
+
531
+ # value + trend + anomalies
532
+ fig, ax = plt.subplots(figsize=(9, 5))
533
+ ax.plot(out.index, out[value_col], label="value")
534
+ ax.plot(out.index, out["trend"], label="trend")
535
+ ax.scatter(out.index[out["anomaly"]], out[value_col][out["anomaly"]], s=40, label="anomaly")
536
+ ax.set_title(f"Time-series anomalies ({mode_note})")
537
+ ax.set_xlabel("time"); ax.set_ylabel(value_col)
538
+ ax.legend(loc="best"); plt.tight_layout(); plt.show()
539
+
540
+ # robust z-scores
541
+ fig2, ax2 = plt.subplots(figsize=(9, 3))
542
+ ax2.plot(out.index, out["zscore"])
543
+ ax2.axhline(3.5, linestyle="--")
544
+ ax2.set_title("Robust z-score")
545
+ ax2.set_xlabel("time"); ax2.set_ylabel("z")
546
+ plt.tight_layout(); plt.show()
547
+
548
+ # sample of last periods for quick inspection
549
+ show(out.tail(12))
550
+ """)
551
+
552
+
553
+ def dimensionality_reduction(df):
554
+ return dedent("""
555
+ # ── Dimensionality Reduction (PCA + optional t-SNE) ───────────────
556
+ import numpy as np, pandas as pd
557
+ import matplotlib.pyplot as plt
558
+ from sklearn.preprocessing import StandardScaler
559
+ from sklearn.decomposition import PCA
560
+ try:
561
+ from sklearn.manifold import TSNE
562
+ _HAS_TSNE = True
563
+ except Exception:
564
+ _HAS_TSNE = False
565
+ from IPython.display import display, HTML
566
+
567
+ _df = df.copy()
568
+ num_cols = _df.select_dtypes(include=[np.number]).columns.tolist()
569
+ if len(num_cols) < 2:
570
+ raise ValueError("Need at least 2 numeric columns for PCA. Found: %d" % len(num_cols))
571
+
572
+ X = _df[num_cols].astype(float).copy()
573
+ scaler = StandardScaler()
574
+ Xs = scaler.fit_transform(X)
575
+
576
+ n_comp = int(min(10, Xs.shape[1]))
577
+ pca = PCA(n_components=n_comp)
578
+ Z = pca.fit_transform(Xs)
579
+
580
+ # Explained variance table
581
+ evr = pca.explained_variance_ratio_
582
+ cum = np.cumsum(evr)
583
+ stats = pd.DataFrame({
584
+ "component": [f"PC{i+1}" for i in range(n_comp)],
585
+ "explained_variance_ratio": evr,
586
+ "cumulative_variance": cum
587
+ })
588
+ display(HTML("<h4>PCA explained variance</h4>" + stats.to_html(index=False)))
589
+
590
+ # 2D scatter of PC1 vs PC2
591
+ fig, ax = plt.subplots(figsize=(8, 5))
592
+ ax.scatter(Z[:,0], Z[:,1], s=14, alpha=0.7)
593
+ ax.set_xlabel("PC1")
594
+ ax.set_ylabel("PC2")
595
+ ax.set_title("PCA: PC1 vs PC2")
596
+ plt.show()
597
+
598
+ # Top absolute loadings for PC1 & PC2
599
+ comps = pd.DataFrame(pca.components_[:2], columns=num_cols, index=["PC1","PC2"]).T
600
+ top1 = comps["PC1"].abs().sort_values(ascending=False).head(10)
601
+ top2 = comps["PC2"].abs().sort_values(ascending=False).head(10)
602
+ display(HTML("<h4>Top |loadings| for PC1</h4>" + top1.to_frame("abs_loading").to_html()))
603
+ display(HTML("<h4>Top |loadings| for PC2</h4>" + top2.to_frame("abs_loading").to_html()))
604
+
605
+ # Optional t-SNE (only if sample size reasonable)
606
+ if _HAS_TSNE and Xs.shape[0] >= 200:
607
+ tsne = TSNE(n_components=2, init="pca", learning_rate="auto", perplexity=min(30, max(5, Xs.shape[0]//50)), random_state=42)
608
+ Zt = tsne.fit_transform(Xs)
609
+ fig2, ax2 = plt.subplots(figsize=(8, 5))
610
+ ax2.scatter(Zt[:,0], Zt[:,1], s=8, alpha=0.7)
611
+ ax2.set_title("t-SNE (2D)")
612
+ plt.show()
613
+ """)
614
+
615
+
616
+ def feature_selection(df):
617
+ return dedent("""
618
+ # ── Feature Selection (mutual info + permutation importance) ──────
619
+ import numpy as np, pandas as pd
620
+ import matplotlib.pyplot as plt
621
+ from sklearn.model_selection import train_test_split
622
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
623
+ from sklearn.compose import ColumnTransformer
624
+ from sklearn.pipeline import Pipeline
625
+ from sklearn.linear_model import LogisticRegression, Ridge
626
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
627
+ from sklearn.inspection import permutation_importance
628
+ from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
629
+ from IPython.display import display, HTML
630
+ try:
631
+ from syntaxmatrix.display import show # UI-safe
632
+ except Exception:
633
+ pass
634
+
635
+ _df = df.copy()
636
+
637
+ # ---- 1) Pick target y (heuristics; fall back gracefully)
638
+ target_candidates = [
639
+ "target", "label", "y", "outcome", "class", "response", "target_var"
640
+ ]
641
+ ycol = None
642
+ for c in target_candidates:
643
+ if c in _df.columns:
644
+ ycol = c; break
645
+
646
+ _reason = None
647
+ if ycol is None:
648
+ # 1a) Prefer a low-cardinality non-ID column (classification)
649
+ low_card = []
650
+ for c in _df.columns:
651
+ try:
652
+ nun = _df[c].nunique(dropna=True)
653
+ if 2 <= nun <= 20 and str(c).lower() not in ("id","uuid","index"):
654
+ low_card.append(c)
655
+ except Exception:
656
+ pass
657
+ if low_card:
658
+ ycol = low_card[-1]
659
+ try:
660
+ show(f"Using provisional classification target: '{ycol}' (low-cardinality)", title="Feature Selection")
661
+ except Exception:
662
+ pass
663
+
664
+ if ycol is None:
665
+ # 1b) Else take a high-variance numeric (regression)
666
+ num = _df.select_dtypes(include=[np.number])
667
+ if not num.empty:
668
+ try:
669
+ ycol = num.var().sort_values(ascending=False).index[0]
670
+ _reason = "highest-variance numeric"
671
+ try:
672
+ show(f"Using provisional regression target: '{ycol}' ({_reason})", title="Feature Selection")
673
+ except Exception:
674
+ pass
675
+ except Exception:
676
+ ycol = None
677
+
678
+ _can_run = ycol is not None
679
+ if not _can_run:
680
+ # Friendly message and a proxy output so the block still yields value
681
+ try:
682
+ show("Feature selection needs a target. None detected and none could be inferred. Showing numeric variance as a proxy.", title="Feature Selection")
683
+ var_df = _df.select_dtypes(include=[np.number]).var().sort_values(ascending=False).to_frame('variance').reset_index().rename(columns={'index':'feature'})
684
+ show(var_df.head(15), title="Numeric variance (proxy)")
685
+ except Exception:
686
+ pass
687
+ else:
688
+ # ---- 2) Build X/y and simple preprocessing
689
+ X = _df.drop(columns=[ycol]).copy()
690
+ y = _df[ycol].copy()
691
+
692
+ num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
693
+ cat_cols = [c for c in X.columns if c not in num_cols]
694
+
695
+ # Robust encoder across sklearn versions / environments
696
+ try:
697
+ enc = _SMX_OHE()
698
+ except NameError:
699
+ enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
700
+
701
+ preproc = ColumnTransformer(
702
+ transformers=[
703
+ ("num", StandardScaler(with_mean=True, with_std=True), num_cols) if num_cols else ("num","drop",[]),
704
+ ("cat", enc, cat_cols) if cat_cols else ("cat","drop",[]),
705
+ ],
706
+ remainder="drop",
707
+ verbose_feature_names_out=False,
708
+ )
709
+
710
+ # classify vs regress
711
+ y_is_classification = (y.nunique() <= 20) and (y.dtype.kind in "biuO" or y.nunique() <= 10)
712
+
713
+ X_train, X_test, y_train, y_test = train_test_split(
714
+ X, y, test_size=0.25, random_state=42, stratify=y if y_is_classification else None
715
+ )
716
+
717
+ if y_is_classification:
718
+ base_est = LogisticRegression(max_iter=2000)
719
+ alt_est = RandomForestClassifier(n_estimators=200, random_state=42)
720
+ mi_func = mutual_info_classif
721
+ score_kw = {"scoring": "roc_auc"} if y.nunique()==2 else {"scoring": "balanced_accuracy"}
722
+ else:
723
+ try:
724
+ base_est = Ridge(random_state=42)
725
+ except TypeError:
726
+ base_est = Ridge()
727
+ alt_est = RandomForestRegressor(n_estimators=200, random_state=42)
728
+ mi_func = mutual_info_regression
729
+ score_kw = {"scoring": "r2"}
730
+
731
+ pipe = Pipeline([("prep", preproc), ("est", base_est)])
732
+ pipe.fit(X_train, y_train)
733
+
734
+ # ---- 3) Mutual information (on one-hot expanded X)
735
+ X_enc = pipe.named_steps["prep"].transform(X_train)
736
+ # Get feature names after OHE
737
+ try:
738
+ ohe = pipe.named_steps["prep"].named_transformers_["cat"]
739
+ if hasattr(ohe, 'get_feature_names_out'):
740
+ cat_feature_names = list(ohe.get_feature_names_out(cat_cols))
741
+ else:
742
+ cat_feature_names = []
743
+ except Exception:
744
+ cat_feature_names = []
745
+ feature_names = num_cols + cat_feature_names
746
+ if len(feature_names) != (X_enc.shape[1] if hasattr(X_enc, 'shape') else len(feature_names)):
747
+ # fallback if names length mismatch
748
+ feature_names = [f"f{i}" for i in range(X_enc.shape[1])]
749
+
750
+ # Mutual information scores
751
+ try:
752
+ mi = mi_func(X_enc, y_train) if callable(mi_func) else np.zeros(len(feature_names))
753
+ except Exception:
754
+ mi = np.zeros(len(feature_names))
755
+ mi_df = pd.DataFrame({"feature": feature_names, "mi": mi}).sort_values("mi", ascending=False)
756
+
757
+ # ---- 4) Permutation importance on alt estimator
758
+ pipe_alt = Pipeline([("prep", preproc), ("est", alt_est)])
759
+ pipe_alt.fit(X_train, y_train)
760
+ try:
761
+ pi = permutation_importance(pipe_alt.named_steps["est"], pipe_alt.named_steps["prep"].transform(X_test), y_test, n_repeats=5, random_state=42, **score_kw)
762
+ pi_df = pd.DataFrame({"feature": feature_names, "perm_importance_mean": pi.importances_mean}).sort_values("perm_importance_mean", ascending=False)
763
+ except Exception:
764
+ pi_df = pd.DataFrame({"feature": feature_names, "perm_importance_mean": np.zeros(len(feature_names))})
765
+
766
+ # ---- 5) Show results
767
+ show(mi_df.head(20), title="Mutual information (top features)")
768
+ show(pi_df.head(20), title="Permutation importance (top features)")
769
+
770
+ # Horizontal bars for permutation importance
771
+ top = pi_df.head(15)[::-1]
772
+ fig, ax = plt.subplots(figsize=(8, 6))
773
+ ax.barh(top["feature"], top["perm_importance_mean"])
774
+ ax.set_title("Top permutation importances")
775
+ ax.set_xlabel("Importance (mean over repeats)")
776
+ plt.tight_layout(); plt.show()
777
+ """)
778
+
779
+
780
+ def time_series_forecasting(df):
781
+ return dedent("""
782
+ # ── Auto-generated baseline time-series forecast ─────────
783
+ import numpy as np
784
+ import pandas as pd
785
+ import matplotlib.pyplot as plt
786
+ from sklearn.linear_model import LinearRegression
787
+ from sklearn.metrics import mean_absolute_error
788
+
789
+ _df = df.copy()
790
+
791
+ # 1) pick a datetime column
792
+ dt_cols = [c for c in _df.columns if np.issubdtype(_df[c].dtype, np.datetime64)]
793
+ if not dt_cols:
794
+ name_hits = [c for c in _df.columns if any(k in str(c).lower()
795
+ for k in ["date","time","timestamp","datetime","ds","period"])]
796
+ for c in name_hits:
797
+ try:
798
+ _df[c] = pd.to_datetime(_df[c], errors="raise")
799
+ dt_cols = [c]
800
+ break
801
+ except Exception:
802
+ continue
803
+
804
+ if not dt_cols:
805
+ raise ValueError("No datetime-like column found for time-series forecasting.")
806
+
807
+ time_col = dt_cols[0]
808
+
809
+ # 2) pick a numeric target column
810
+ num_cols = [c for c in _df.select_dtypes(include=['number', 'bool']).columns if c != time_col]
811
+ if not num_cols:
812
+ raise ValueError("No numeric target available for time-series forecasting.")
813
+
814
+ target = num_cols[0]
815
+
816
+ ts = _df[[time_col, target]].dropna().sort_values(time_col)
817
+ ts["time_idx"] = (ts[time_col] - ts[time_col].min()).dt.total_seconds() / 86400.0
818
+
819
+ if len(ts) < 10:
820
+ raise ValueError("Not enough data points for time-series forecasting (need >= 10 rows).")
821
+
822
+ split_idx = int(len(ts) * 0.8)
823
+ train, test = ts.iloc[:split_idx], ts.iloc[split_idx:]
824
+
825
+ X_train = train[["time_idx"]].values
826
+ y_train = train[target].values
827
+ X_test = test[["time_idx"]].values
828
+ y_test = test[target].values
829
+
830
+ reg = LinearRegression()
831
+ reg.fit(X_train, y_train)
832
+
833
+ y_pred = reg.predict(X_test)
834
+ mae = mean_absolute_error(y_test, y_pred)
835
+ show({{"MAE_forecast": mae}})
836
+
837
+ fig, ax = plt.subplots(figsize=(10, 4))
838
+ ax.plot(train[time_col], train[target], label="train")
839
+ ax.plot(test[time_col], y_test, label="test")
840
+ ax.plot(test[time_col], y_pred, label="forecast")
841
+ ax.legend()
842
+ ax.set_title(f"Baseline time-series forecast for {{target}}")
843
+ plt.tight_layout()
844
+ plt.show()
845
+ """)
846
+
847
+
848
+ def time_series_classification(df, entity_col, time_col, target_col):
849
+ return dedent(f"""
850
+ # ── Auto-generated time-series classification baseline ─────
851
+ import numpy as np
852
+ import pandas as pd
853
+ from sklearn.model_selection import train_test_split
854
+ from sklearn.ensemble import RandomForestClassifier
855
+ from sklearn.metrics import accuracy_score, classification_report
856
+
857
+ _df = df.copy()
858
+
859
+ # Drop rows missing key columns
860
+ _df = _df.dropna(subset=['{entity_col}', '{time_col}', '{target_col}'])
861
+
862
+ # Ensure datetime for the time column
863
+ _df['{time_col}'] = pd.to_datetime(_df['{time_col}'], errors="coerce")
864
+ _df = _df.dropna(subset=['{time_col}'])
865
+
866
+ # Sort by entity then time
867
+ _df = _df.sort_values(['{entity_col}', '{time_col}'])
868
+
869
+ # Numeric features only (excluding target, entity, time)
870
+ num_cols = _df.select_dtypes(include=['number', 'bool']).columns.tolist()
871
+ for c in ['{target_col}', '{entity_col}', '{time_col}']:
872
+ if c in num_cols:
873
+ num_cols.remove(c)
874
+
875
+ if not num_cols:
876
+ raise ValueError("No numeric features available for time-series classification template.")
877
+
878
+ # Aggregate sequence into per-entity features
879
+ agg_spec = {{}}
880
+ for c in num_cols:
881
+ agg_spec[c] = ['mean', 'std', 'min', 'max', 'last']
882
+
883
+ grouped = _df.groupby('{entity_col}').agg(agg_spec)
884
+
885
+ # Flatten MultiIndex columns
886
+ grouped.columns = [f"{{col}}_{{stat}}" for col, stat in grouped.columns]
887
+
888
+ # Target per entity: last observed label
889
+ y = _df.groupby('{entity_col}')['{target_col}'].last()
890
+
891
+ # Align X and y on the same entities
892
+ X, y = grouped.align(y, join="inner", axis=0)
893
+
894
+ if X.empty:
895
+ raise ValueError("No aggregated rows available for time-series classification.")
896
+
897
+ # Train/test split by entities
898
+ X_train, X_test, y_train, y_test = train_test_split(
899
+ X, y, test_size=0.2, stratify=y, random_state=42
900
+ )
901
+
902
+ clf = RandomForestClassifier(n_estimators=300, random_state=42)
903
+ clf.fit(X_train, y_train)
904
+
905
+ y_pred = clf.predict(X_test)
906
+ acc = accuracy_score(y_test, y_pred)
907
+ show({{"Accuracy": acc}})
908
+
909
+ report_df = pd.DataFrame(
910
+ classification_report(y_test, y_pred, output_dict=True)
911
+ ).T
912
+ show(report_df)
913
+ """)
914
+
915
+
916
+ def unknown_group_proxy_pack(df, group_col, unknown_tokens, numeric_cols, cat_cols, outcome_col=None):
917
+ return dedent(f"""
918
+ # ── Unknown Group: Proxy Insight Pack ──
919
+ import numpy as np
920
+ import pandas as pd
921
+ import matplotlib.pyplot as plt
922
+
923
+ _df = df.copy()
924
+
925
+ if '{group_col}' not in _df.columns:
926
+ show("Grouping column '{group_col}' not found; showing overall summary only.")
927
+ show(_df.head())
928
+ else:
929
+ s = _df['{group_col}']
930
+ s_norm = s.astype(str).str.strip().str.lower()
931
+ _tokens = set({list({"unknown","not reported","not_reported","not known","n/a","na","none","nan","missing","unreported","unspecified","null","-",""})})
932
+ _tokens.update({list(set(unknown_tokens))})
933
+ is_unknown = s.isna() | s_norm.isin(_tokens)
934
+ _df["_UnknownGroup"] = np.where(is_unknown, "Unknown/Not Reported", "Known")
935
+
936
+ # 1) Size table (never errors)
937
+ size_tbl = _df["_UnknownGroup"].value_counts(dropna=False).rename_axis("Group").reset_index(name="Count")
938
+ total = len(_df) if len(_df) else 1
939
+ size_tbl["Pct"] = (size_tbl["Count"] / total * 100).round(1)
940
+ show(size_tbl)
941
+
942
+ # 2) Numeric comparisons (auto-select; safe when empty)
943
+ num_cols = [c for c in {list(set(numeric_cols))} if c in _df.columns and pd.api.types.is_numeric_dtype(_df[c])]
944
+ if not num_cols:
945
+ num_cols = _df.select_dtypes(include=['number','bool']).columns.tolist()[:6]
946
+
947
+ if "_UnknownGroup" in _df.columns and num_cols:
948
+ blocks = []
949
+ for g, sub in _df.groupby("_UnknownGroup", dropna=False):
950
+ if sub.empty:
951
+ continue
952
+ desc = sub[num_cols].describe().T
953
+ desc.insert(0, "Group", g)
954
+ desc = desc.reset_index().rename(columns={{"index":"Variable","std":"Std","25%":"Q1","50%":"Median","75%":"Q3"}})
955
+ blocks.append(desc[["Variable","Group","count","mean","Median","Std","min","Q1","Q3","max"]])
956
+ numeric_summary = pd.concat(blocks, ignore_index=True) if blocks else pd.DataFrame(
957
+ columns=["Variable","Group","count","mean","Median","Std","min","Q1","Q3","max"]
958
+ )
959
+ show(numeric_summary)
960
+
961
+ # 3) Composition of categorical columns for Unknown group
962
+ cat_cols = [c for c in {list(set(cat_cols))} if c in _df.columns]
963
+ if "_UnknownGroup" in _df.columns and cat_cols:
964
+ unk = _df[_df["_UnknownGroup"]=="Unknown/Not Reported"]
965
+ comp_blocks = []
966
+ if not unk.empty:
967
+ for c in cat_cols:
968
+ vc = unk[c].astype(str).str.strip().replace({{"nan":"(missing)","":"(blank)"}}).value_counts(normalize=True, dropna=False)
969
+ comp = vc.mul(100).round(1).rename_axis("level").reset_index(name="Pct")
970
+ comp.insert(0, "Variable", c)
971
+ comp_blocks.append(comp)
972
+ lifestyle_comp = pd.concat(comp_blocks, ignore_index=True) if comp_blocks else pd.DataFrame(columns=["Variable","level","Pct"])
973
+ show(lifestyle_comp)
974
+
975
+ # 4) Visuals — guarded; fall back silently if plotting fails
976
+ try:
977
+ if "_UnknownGroup" in _df.columns:
978
+ ax = (size_tbl.set_index("Group")["Pct"]).plot(kind="bar", figsize=(5,3))
979
+ ax.set_ylabel("% of records")
980
+ ax.set_title(f"Known vs Unknown/Not Reported — {{'{group_col}'}}")
981
+ plt.tight_layout(); plt.show()
982
+ except Exception:
983
+ pass
984
+
985
+ # 5) Optional outcome prevalence
986
+ if {repr(outcome_col)} and {repr(outcome_col)} in _df.columns and pd.api.types.is_numeric_dtype(_df[{repr(outcome_col)}]):
987
+ try:
988
+ prev = _df.groupby("_UnknownGroup")[{repr(outcome_col)}].mean() * 100.0
989
+ show(prev.rename("Prevalence_%").reset_index())
990
+ except Exception:
991
+ pass
992
+
993
+ # 6) Note on data capture
994
+ note = (
995
+ "Data capture: reduce 'Unknown/Not Reported' via intake prompts, pre-fill known values, "
996
+ "audit repeated unknowns, and monitor Unknown rate over time and by site/channel."
997
+ )
998
+ show(note)
999
+ """)
1000
+
1001
+
1002
+ def viz_line(df, time_col=None, max_series=3, freq=None):
1003
+ """
1004
+ Plot up to `max_series` numeric columns against a detected datetime axis.
1005
+ - Detects a datetime/time-like column if `time_col` is None.
1006
+ - Optionally resamples to `freq` (e.g. 'D', 'W', 'M') when provided, for evenly spaced points.
1007
+ - Skips gracefully if no time or numeric columns are suitable.
1008
+ """
1009
+ return dedent(f"""
1010
+ import numpy as np
1011
+ import pandas as pd
1012
+ import matplotlib.pyplot as plt
1013
+
1014
+ _df = df.copy()
1015
+
1016
+ # 1) choose time column
1017
+ time_col = {repr(time_col)} # may be None
1018
+ if time_col is None:
1019
+ dt_cols = [c for c in _df.columns if np.issubdtype(_df[c].dtype, np.datetime64)]
1020
+ if not dt_cols:
1021
+ # name hints as fallback
1022
+ keys = ["date","time","timestamp","datetime","ds","period"]
1023
+ for c in _df.columns:
1024
+ n = str(c).lower()
1025
+ if any(k in n for k in keys):
1026
+ try:
1027
+ _df[c] = pd.to_datetime(_df[c], errors="coerce")
1028
+ if _df[c].notna().any():
1029
+ dt_cols = [c]
1030
+ break
1031
+ except Exception:
1032
+ pass
1033
+ time_col = dt_cols[0] if dt_cols else None
1034
+
1035
+ if not time_col or time_col not in _df.columns:
1036
+ show("⚠ No datetime-like column detected for a line chart; skipping.")
1037
+ else:
1038
+ _df = _df.dropna(subset=[time_col]).sort_values(time_col)
1039
+ # 2) pick up to `max_series` numeric columns (by variance)
1040
+ num_cols = [c for c in _df.select_dtypes(include=['number','bool']).columns if c != time_col]
1041
+ scored = []
1042
+ for c in num_cols:
1043
+ v = _df[c].dropna()
1044
+ scored.append((float(v.var()) if len(v) else 0.0, c))
1045
+ scored.sort(reverse=True)
1046
+ keep = [c for _, c in scored[:{max_series}]]
1047
+
1048
+ if not keep:
1049
+ show("⚠ No numeric columns available for a line chart; skipping.")
1050
+ else:
1051
+ plot_df = _df[[time_col] + keep].copy()
1052
+ # optional resample
1053
+ if {repr(freq)} and plot_df[time_col].notna().any():
1054
+ plot_df = plot_df.set_index(time_col).resample({repr(freq)}).mean().reset_index()
1055
+
1056
+ fig, ax = plt.subplots(figsize=(8, 4))
1057
+ for c in keep:
1058
+ ax.plot(plot_df[time_col], plot_df[c], label=str(c))
1059
+ ax.set_xlabel(str(time_col))
1060
+ ax.set_ylabel("Value")
1061
+ ax.legend(loc="best", frameon=False)
1062
+ ax.set_title("Line chart")
1063
+ plt.tight_layout()
1064
+ plt.show()
1065
+ """)
1066
+
1067
+
1068
+ def clustering(df):
1069
+ return dedent("""
1070
+ # ==== CLUSTERING BASELINE (KMeans + DBSCAN fallback) ====
1071
+ import numpy as np, pandas as pd
1072
+ from sklearn.pipeline import Pipeline
1073
+ from sklearn.impute import SimpleImputer
1074
+ from sklearn.preprocessing import StandardScaler
1075
+ from sklearn.cluster import KMeans, DBSCAN
1076
+ from sklearn.metrics import silhouette_score
1077
+ from sklearn.decomposition import PCA
1078
+ import matplotlib.pyplot as plt
1079
+
1080
+ _work = df.copy()
1081
+ num_cols = _work.select_dtypes(include=[np.number]).columns.tolist()
1082
+
1083
+ if len(num_cols) < 2:
1084
+ show(f"Clustering needs at least two numeric columns. Found: {num_cols}")
1085
+ else:
1086
+ X = _work[num_cols]
1087
+ pipe = Pipeline([
1088
+ ("imputer", SimpleImputer(strategy="median")),
1089
+ ("scaler", StandardScaler())
1090
+ ])
1091
+ Xp = pipe.fit_transform(X)
1092
+
1093
+ n = Xp.shape[0]
1094
+ k_max = max(2, min(12, n - 1))
1095
+ best_k, best_sil = None, -1
1096
+ inertias, ks = [], []
1097
+
1098
+ for k in range(2, k_max + 1):
1099
+ km = KMeans(n_clusters=k, n_init="auto", random_state=42)
1100
+ labels_k = km.fit_predict(Xp)
1101
+ if len(set(labels_k)) < 2:
1102
+ continue
1103
+ sil = silhouette_score(Xp, labels_k)
1104
+ inertias.append(km.inertia_); ks.append(k)
1105
+ if sil > best_sil:
1106
+ best_sil, best_k = sil, k
1107
+
1108
+ model_label = "KMeans"
1109
+ if best_k is not None:
1110
+ model = KMeans(n_clusters=best_k, n_init="auto", random_state=42).fit(Xp)
1111
+ labels = model.labels_
1112
+ show({"model": model_label, "k": best_k, "silhouette": round(best_sil, 3)})
1113
+ else:
1114
+ model = DBSCAN(eps=0.8, min_samples=10).fit(Xp)
1115
+ labels = model.labels_
1116
+ model_label = "DBSCAN"
1117
+ show({"model": model_label})
1118
+
1119
+ _work["cluster"] = labels
1120
+ show(_work["cluster"].value_counts().sort_index().rename("count").to_frame())
1121
+
1122
+ prof = _work.groupby("cluster")[num_cols].agg(["mean","median","std","min","max","count"])
1123
+ show(prof)
1124
+
1125
+ pca = PCA(n_components=2, random_state=42)
1126
+ comps = pca.fit_transform(Xp)
1127
+ fig, ax = plt.subplots(figsize=(7,5))
1128
+ for cl in sorted(set(labels)):
1129
+ mask = labels == cl
1130
+ ax.scatter(comps[mask,0], comps[mask,1], s=20, alpha=0.7, label=f"cluster {cl}")
1131
+ ax.set_title("PCA scatter of clusters"); ax.set_xlabel("PC1"); ax.set_ylabel("PC2")
1132
+ ax.legend(loc="best"); plt.tight_layout(); plt.show()
1133
+
1134
+ if ks:
1135
+ fig2, ax2 = plt.subplots(figsize=(7,4))
1136
+ ax2.plot(ks, inertias, marker="o")
1137
+ ax2.set_title("KMeans inertia by k"); ax2.set_xlabel("k"); ax2.set_ylabel("Inertia (SSE)")
1138
+ plt.tight_layout(); plt.show()
1139
+
1140
+ df["cluster"] = _work["cluster"]  # persist cluster labels back onto the caller's dataframe
1141
+ """)
1142
+
1143
+
1144
+ def recommendation(df):
1145
+ return dedent("""
1146
+ # ==== ITEM-ITEM RECOMMENDATION (Nearest Neighbours over mixed features) ====
1147
+ import numpy as np, pandas as pd
1148
+ from sklearn.compose import ColumnTransformer
1149
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
1150
+ from sklearn.pipeline import Pipeline
1151
+ from sklearn.neighbors import NearestNeighbors
1152
+
1153
+ _work = df.copy()
1154
+
1155
+ # --- 1) Identify features (numeric + categorical) ---
1156
+ num_cols = _work.select_dtypes(include=[np.number]).columns.tolist()
1157
+ cat_cols = _work.select_dtypes(include=["object", "category", "string"]).columns.tolist()
1158
+
1159
+ # Heuristic: drop obvious IDs from features
1160
+ id_like = [c for c in _work.columns if (c.lower() in ("id","uid","uuid","record_id","row_id") or c.lower().endswith("_id"))]
1161
+ num_cols = [c for c in num_cols if c not in id_like]
1162
+ cat_cols = [c for c in cat_cols if c not in id_like]
1163
+
1164
+ # Minimal guard
1165
+ if len(num_cols) + len(cat_cols) < 1:
1166
+ show("No usable feature columns for recommendation."); # caption comes from PREFACE
1167
+ else:
1168
+ # --- 2) Build preprocessing (robust across sklearn versions) ---
1169
+ try:
1170
+ enc = _SMX_OHE()
1171
+ except NameError:
1172
+ # Fallback if PREFACE wasn't injected for some reason
1173
+ enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
1174
+
1175
+ pre = ColumnTransformer(
1176
+ transformers=[
1177
+ ("num", Pipeline([("scaler", StandardScaler())]), num_cols) if num_cols else ("num", "drop", []),
1178
+ ("cat", enc, cat_cols) if cat_cols else ("cat", "drop", []),
1179
+ ],
1180
+ remainder="drop"
1181
+ )
1182
+
1183
+ # --- 3) Sample cap for safety on huge tables ---
1184
+ N = len(_work)
1185
+ cap = min(N, 5000)
1186
+ _sample = _work.sample(n=cap, random_state=42) if N > cap else _work
1187
+
1188
+ X = pre.fit_transform(_sample)
1189
+ if getattr(X, "shape", (0,0))[0] < 2:
1190
+ show("Not enough rows to compute neighbours.")
1191
+ else:
1192
+ # --- 4) Fit cosine NN and pick a few anchors ---
1193
+ k = min(6, X.shape[0]) # includes self; we'll drop it
1194
+ nn = NearestNeighbors(metric="cosine", n_neighbors=k)
1195
+ nn.fit(X)
1196
+
1197
+ # Anchor strategy: prefer rows with an id-like column; otherwise first few
1198
+ anchor_ids = None
1199
+ if id_like:
1200
+ anchor_ids = _sample[id_like[0]].head(min(5, len(_sample))).tolist()
1201
+ anchors = _sample.index[:len(anchor_ids)].tolist()
1202
+ else:
1203
+ anchors = _sample.index[:min(5, len(_sample))].tolist()
1204
+
1205
+ # For readability, pick up to 4 descriptive (non-numeric) columns
1206
+ desc_cols = [c for c in _sample.columns if c in cat_cols][:4]
1207
+ meta_cols = (id_like[:1] + desc_cols)[:5]
1208
+
1209
+ # --- 5) Build neighbour tables per anchor ---
1210
+ for pos, aidx in enumerate(anchors):
1211
+ # position of aidx inside _sample
1212
+ loc = list(_sample.index).index(aidx)
1213
+ dists, inds = nn.kneighbors(X[loc].reshape(1, -1), return_distance=True)
1214
+ dists, inds = dists[0].tolist(), inds[0].tolist()
1215
+
1216
+ rows = []
1217
+ for dist, i in zip(dists, inds):
1218
+ if i == loc:
1219
+ continue # drop self
1220
+ ridx = _sample.index[i]
1221
+ row = {"rank": len(rows)+1, "distance": float(dist), "_index": int(ridx)}
1222
+ for c in meta_cols:
1223
+ if c in _sample.columns:
1224
+ row[c] = _sample.loc[ridx, c]
1225
+ rows.append(row)
1226
+
1227
+ out = pd.DataFrame(rows)
1228
+ title = "Similar items" if not id_like else f"Similar to {id_like[0]}={_sample.loc[aidx, id_like[0]]}"
1229
+ show(out, title=title)
1230
+
1231
+ # Summary
1232
+ feats = len(num_cols) + len(cat_cols)
1233
+ show({"rows_used": X.shape[0], "features": feats}, title="Recommendation set-up summary")
1234
+ """)
1235
+
1236
+
1237
+ def topic_modelling(df):
1238
+
1239
+ return dedent("""
1240
+ # ==== TOPIC MODELLING (LDA with safe fallback) ====
1241
+ import numpy as np, pandas as pd, re
1242
+ import matplotlib.pyplot as plt
1243
+
1244
+ # --- 1) Pick a text column (or compose one) ---
1245
+ _df = df.copy()
1246
+ text_cols_named = [c for c in _df.columns if any(k in c.lower() for k in ["text","review","description","comment","notes","content","body","message","title"])]
1247
+ obj_cols = _df.select_dtypes(include=["object","string"]).columns.tolist()
1248
+ candidates = text_cols_named + [c for c in obj_cols if c not in text_cols_named]
1249
+
1250
+ def _choose_text_col(d):
1251
+ best, best_score = None, -1
1252
+ for c in candidates or []:
1253
+ s = d[c].astype(str).fillna("")
1254
+ # token score: average length and alphabetic ratio
1255
+ tokens = s.str.split()
1256
+ score = float(tokens.map(len).mean() or 0) + float((s.str.contains(r"[A-Za-z]", regex=True)).mean()) * 2.0
1257
+ if score > best_score:
1258
+ best, best_score = c, score
1259
+ return best
1260
+
1261
+ text_col = _choose_text_col(_df)
1262
+ if text_col is None:
1263
+ # build a composite text if nothing obvious
1264
+ parts = obj_cols[:4]
1265
+ if not parts:
1266
+ show("No suitable text columns found for topic modelling.")
1267
+ else:
1268
+ _df["_smx_text"] = _df[parts].astype(str).agg(" ".join, axis=1)
1269
+ text_col = "_smx_text"
1270
+
1271
+ if text_col is not None:
1272
+ docs = _df[text_col].astype(str).fillna("").tolist()
1273
+ n_docs = len(docs)
1274
+
1275
+ # --- 2) Choose topic count sensibly ---
1276
+ n_topics = int(np.clip(max(3, int(np.sqrt(max(1, n_docs/50)))) , 3, 12))
1277
+
1278
+ # --- 3) Try LDA; if it fails, fall back to n-gram frequencies ---
1279
+ used_lda = False
1280
+ try:
1281
+ from sklearn.feature_extraction.text import CountVectorizer
1282
+ from sklearn.decomposition import LatentDirichletAllocation
1283
+ vect = CountVectorizer(stop_words="english", max_features=5000, ngram_range=(1,2))
1284
+ X = vect.fit_transform(docs)
1285
+ if X.shape[0] < 5 or X.shape[1] < 10:
1286
+ raise RuntimeError("Too little text to fit LDA.")
1287
+ lda = LatentDirichletAllocation(n_components=n_topics, learning_method="batch", random_state=42)
1288
+ W = lda.fit_transform(X) # doc-topic
1289
+ H = lda.components_ # topic-term
1290
+ terms = np.array(vect.get_feature_names_out())
1291
+
1292
+ # --- topic → top words table ---
1293
+ rows = []
1294
+ for k in range(n_topics):
1295
+ inds = np.argsort(H[k])[::-1][:12]
1296
+ words = terms[inds]
1297
+ weights = H[k, inds]
1298
+ rows.append({"topic": k, "top_terms": ", ".join(words[:10])})
1299
+ top_words = pd.DataFrame(rows)
1300
+ show(top_words, title="Topics and top terms")
1301
+
1302
+ # --- doc dominant topic + prevalence ---
1303
+ dom = W.argmax(axis=1)
1304
+ strength = W.max(axis=1)
1305
+ _df["topic"] = dom
1306
+ _df["topic_score"] = strength
1307
+ # prevalence plot
1308
+ prev = pd.Series(dom).value_counts().sort_index()
1309
+ fig, ax = plt.subplots(figsize=(7,4))
1310
+ prev.plot(kind="bar", ax=ax)
1311
+ ax.set_title("Topic prevalence"); ax.set_xlabel("topic"); ax.set_ylabel("documents")
1312
+ plt.tight_layout(); plt.show()
1313
+
1314
+ show(_df[["topic","topic_score"]].head(20), title="Document-topic sample")
1315
+ used_lda = True
1316
+
1317
+ except Exception as e:
1318
+ # --- Fallback: simple n-gram frequency table ---
1319
+ try:
1320
+ from sklearn.feature_extraction.text import CountVectorizer
1321
+ vect = CountVectorizer(stop_words="english", max_features=3000, ngram_range=(1,2))
1322
+ X = vect.fit_transform(docs)
1323
+ counts = np.asarray(X.sum(axis=0)).ravel()
1324
+ terms = np.array(vect.get_feature_names_out())
1325
+ top = pd.DataFrame({"term": terms, "count": counts}).sort_values("count", ascending=False).head(30)
1326
+ show(top, title="Top terms (fallback)")
1327
+ except Exception:
1328
+ show("Text vectorisation unavailable; cannot compute topics.")
1329
+ used_lda = False
1330
+
1331
+ # Summary
1332
+ show({"docs": n_docs, "topics": (n_topics if used_lda else 0)}, title="Topic modelling summary")
1333
+ """)
1334
+
1335
+
1336
+ def viz_pie(df, category_col=None, top_k=8):
1337
+ """Generic pie chart of category shares."""
1338
+ return dedent("""
1339
+ import pandas as pd
1340
+ import matplotlib.pyplot as plt
1341
+ from syntaxmatrix.display import show
1342
+
1343
+ _df = df.copy()
1344
+
1345
+ # auto pick categorical column if not provided
1346
+ cat = __SMX_CAT_HINT__
1347
+ if cat is None or cat not in _df.columns:
1348
+ cat_cols = [c for c in _df.columns
1349
+ if (_df[c].dtype == 'object' or str(_df[c].dtype).startswith('category'))
1350
+ and _df[c].nunique(dropna=True) > 1]
1351
+ if not cat_cols:
1352
+ raise ValueError("No suitable categorical column for pie chart.")
1353
+ cat = cat_cols[0]
1354
+
1355
+ s = _df[cat].fillna("Missing").astype(str).value_counts()
1356
+ if len(s) > __SMX_TOPK__:
1357
+ s = pd.concat([s.iloc[:__SMX_TOPK__], pd.Series({"Other": s.iloc[__SMX_TOPK__:].sum()})])
1358
+
1359
+ pie_df = s.reset_index()
1360
+ pie_df.columns = [cat, "count"]
1361
+ pie_df["percent"] = (pie_df["count"] / pie_df["count"].sum() * 100).round(2)
1362
+ show(pie_df)
1363
+
1364
+ plt.figure(figsize=(5,5))
1365
+ plt.pie(pie_df["count"], labels=pie_df[cat], autopct='%1.1f%%', startangle=90)
1366
+ plt.title(f"Composition of {cat}")
1367
+ plt.tight_layout()
1368
+ plt.show()
1369
+ """.replace("__SMX_CAT_HINT__", repr(category_col))
1370
+ .replace("__SMX_TOPK__", str(top_k)))
1371
+
1372
+
1373
+ def viz_violin(df, x=None, y=None, hue=None, sample_n=2000):
1374
+ """Violin plot for numeric distribution across categories."""
1375
+ return dedent("""
1376
+ import numpy as np
1377
+ import pandas as pd
1378
+ import matplotlib.pyplot as plt
1379
+ from syntaxmatrix.display import show
1380
+
1381
+ _df = df.copy()
1382
+
1383
+ xcol = __SMX_X__
1384
+ ycol = __SMX_Y__
1385
+ hcol = __SMX_HUE__
1386
+
1387
+ if xcol is None or xcol not in _df.columns:
1388
+ cat_cols = [c for c in _df.columns
1389
+ if (_df[c].dtype == 'object' or str(_df[c].dtype).startswith('category'))
1390
+ and _df[c].nunique(dropna=True) > 1
1391
+ and _df[c].nunique(dropna=True) <= 20]
1392
+ xcol = cat_cols[0] if cat_cols else None
1393
+
1394
+ if ycol is None or ycol not in _df.columns:
1395
+ num_cols = _df.select_dtypes(include=['number','bool']).columns.tolist()
1396
+ ycol = num_cols[0] if num_cols else None
1397
+
1398
+ if xcol is None or ycol is None:
1399
+ raise ValueError("Need one categorical (x) and one numeric (y) column for violin plot.")
1400
+
1401
+ use_cols = [xcol, ycol]
1402
+ if hcol in _df.columns and hcol not in (xcol, ycol):
1403
+ use_cols.append(hcol)
1404
+
1405
+ _work = _df[use_cols].dropna()
1406
+ if len(_work) > __SMX_SAMPLE_N__:
1407
+ _work = _work.sample(__SMX_SAMPLE_N__, random_state=42)
1408
+
1409
+ # Use seaborn if available, else fallback to boxplot
1410
+ try:
1411
+ import seaborn as sns
1412
+ plt.figure(figsize=(7,4))
1413
+ sns.violinplot(
1414
+ data=_work,
1415
+ x=xcol, y=ycol,
1416
+ hue=hcol if hcol in _work.columns else None,
1417
+ cut=0
1418
+ )
1419
+ plt.title(f"{ycol} distribution by {xcol}")
1420
+ plt.tight_layout()
1421
+ plt.show()
1422
+ except Exception:
1423
+ plt.figure(figsize=(7,4))
1424
+ _work.boxplot(column=ycol, by=xcol, grid=False)
1425
+ plt.title(f"{ycol} by {xcol} (box fallback)")
1426
+ plt.suptitle("")
1427
+ plt.tight_layout()
1428
+ plt.show()
1429
+
1430
+ show(_work.groupby(xcol)[ycol].describe().round(2))
1431
+ """.replace("__SMX_X__", repr(x))
1432
+ .replace("__SMX_Y__", repr(y))
1433
+ .replace("__SMX_HUE__", repr(hue))
1434
+ .replace("__SMX_SAMPLE_N__", str(sample_n)))
1435
+
1436
+
1437
+ def viz_stacked_bar(df, x=None, hue=None, normalise=True, top_k=8):
1438
+ """Stacked (optionally % stacked) bar chart for two categoricals."""
1439
+ return dedent("""
1440
+ import pandas as pd
1441
+ import matplotlib.pyplot as plt
1442
+ from syntaxmatrix.display import show
1443
+
1444
+ _df = df.copy()
1445
+
1446
+ xcol = __SMX_X__
1447
+ hcol = __SMX_HUE__
1448
+
1449
+ cat_cols = [c for c in _df.columns
1450
+ if (_df[c].dtype == 'object' or str(_df[c].dtype).startswith('category'))
1451
+ and _df[c].nunique(dropna=True) > 1
1452
+ and _df[c].nunique(dropna=True) <= 30]
1453
+
1454
+ if xcol is None or xcol not in _df.columns:
1455
+ xcol = cat_cols[0] if cat_cols else None
1456
+ if hcol is None or hcol not in _df.columns:
1457
+ hcol = cat_cols[1] if len(cat_cols) > 1 else None
1458
+
1459
+ if xcol is None or hcol is None:
1460
+ raise ValueError("Need two categorical columns for stacked bar chart.")
1461
+
1462
+ _work = _df[[xcol, hcol]].dropna()
1463
+
1464
+ keep_h = _work[hcol].astype(str).value_counts().index[:__SMX_TOPK__]
1465
+ _work[hcol] = _work[hcol].astype(str).where(_work[hcol].astype(str).isin(keep_h), other="Other")
1466
+
1467
+ tab = pd.crosstab(_work[xcol].astype(str), _work[hcol].astype(str))
1468
+ show(tab)
1469
+
1470
+ plot_tab = tab.copy()
1471
+ if __SMX_NORM__:
1472
+ plot_tab = plot_tab.div(plot_tab.sum(axis=1), axis=0) * 100
1473
+
1474
+ ax = plot_tab.plot(kind="bar", stacked=True, figsize=(8,4))
1475
+ ax.set_title(
1476
+ f"{hcol} composition by {xcol}" + (" (%)" if __SMX_NORM__ else "")
1477
+ )
1478
+ ax.set_xlabel(xcol)
1479
+ ax.set_ylabel("Percent" if __SMX_NORM__ else "Count")
1480
+ plt.legend(title=hcol, bbox_to_anchor=(1.02, 1), loc="upper left")
1481
+ plt.tight_layout()
1482
+ plt.show()
1483
+ """.replace("__SMX_X__", repr(x))
1484
+ .replace("__SMX_HUE__", repr(hue))
1485
+ .replace("__SMX_NORM__", "True" if normalise else "False")
1486
+ .replace("__SMX_TOPK__", str(top_k)))
1487
+
1488
+
1489
+ def viz_distribution(df, col=None, by=None, bins=30, sample_n=5000):
1490
+ """Histogram distribution for a numeric column, optionally split by a category."""
1491
+ return dedent("""
1492
+ import numpy as np
1493
+ import pandas as pd
1494
+ import matplotlib.pyplot as plt
1495
+ from syntaxmatrix.display import show
1496
+
1497
+ _df = df.copy()
1498
+ ncol = __SMX_COL__
1499
+ bcol = __SMX_BY__
1500
+
1501
+ if ncol is None or ncol not in _df.columns:
1502
+ num_cols = _df.select_dtypes(include=['number','bool']).columns.tolist()
1503
+ ncol = num_cols[0] if num_cols else None
1504
+
1505
+ if ncol is None:
1506
+ raise ValueError("No numeric column available for distribution plot.")
1507
+
1508
+ if bcol is not None and bcol not in _df.columns:
1509
+ bcol = None
1510
+
1511
+ use_cols = [ncol] + ([bcol] if bcol else [])
1512
+ _work = _df[use_cols].dropna()
1513
+
1514
+ if len(_work) > __SMX_SAMPLE_N__:
1515
+ _work = _work.sample(__SMX_SAMPLE_N__, random_state=42)
1516
+
1517
+ plt.figure(figsize=(7,4))
1518
+ if bcol:
1519
+ try:
1520
+ import seaborn as sns
1521
+ sns.histplot(
1522
+ data=_work, x=ncol, hue=bcol,
1523
+ bins=__SMX_BINS__,
1524
+ stat="density",
1525
+ common_norm=False,
1526
+ element="step"
1527
+ )
1528
+ except Exception:
1529
+ for k, g in _work.groupby(bcol):
1530
+ plt.hist(g[ncol], bins=__SMX_BINS__, alpha=0.5, density=True, label=str(k))
1531
+ plt.legend(title=bcol)
1532
+ else:
1533
+ plt.hist(_work[ncol], bins=__SMX_BINS__, alpha=0.8)
1534
+
1535
+ plt.title(f"Distribution of {ncol}" + (f" by {bcol}" if bcol else ""))
1536
+ plt.xlabel(ncol)
1537
+ plt.ylabel("Density" if bcol else "Count")
1538
+ plt.tight_layout()
1539
+ plt.show()
1540
+
1541
+ show(_work[ncol].describe().round(2))
1542
+ """.replace("__SMX_COL__", repr(col))
1543
+ .replace("__SMX_BY__", repr(by))
1544
+ .replace("__SMX_BINS__", str(bins))
1545
+ .replace("__SMX_SAMPLE_N__", str(sample_n)))
1546
+
1547
+
1548
+ def viz_area(df, x=None, y=None, group=None, sample_n=3000):
1549
+ """Area plot of a numeric column, optionally split by a low-cardinality group (useful for trends)."""
+ return dedent("""
1551
+ import matplotlib.pyplot as plt
1552
+ from syntaxmatrix.display import show
1553
+
1554
+ _df = df.copy()
+
+ # columns and sample size are injected from the template arguments
+ x = __SMX_X__
+ y = __SMX_Y__
+ group = __SMX_GROUP__
+ sample_n = __SMX_SAMPLE_N__
+
+ # auto-pick numeric columns
1557
+ num_cols = _df.select_dtypes(include=['number','bool']).columns.tolist()
1558
+ cat_cols = [c for c in _df.columns if c not in num_cols and _df[c].nunique(dropna=True) <= 12]
1559
+
1560
+ if x is None or x not in _df.columns:
1561
+ x = None # area plot can be index-based
1562
+ if y is None or y not in _df.columns:
1563
+ y = num_cols[0] if num_cols else None
1564
+ if group is None or group not in _df.columns:
1565
+ group = cat_cols[0] if cat_cols else None
1566
+
1567
+ if y is None:
1568
+ show("⚠ No numeric column for area plot.")
1569
+ else:
1570
+ dplot = _df[[c for c in [x,y,group] if c]].dropna()
1571
+ if len(dplot) > sample_n:
1572
+ dplot = dplot.sample(sample_n, random_state=42)
1573
+
1574
+ if x:
1575
+ dplot = dplot.sort_values(x)
1576
+
1577
+ plt.figure(figsize=(7,3.5))
1578
+ if group is None:
1579
+ plt.fill_between(range(len(dplot)), dplot[y].values, alpha=0.6)
1580
+ plt.title(f"Area plot of {y}")
1581
+ else:
1582
+ for k, g in dplot.groupby(group):
1583
+ plt.fill_between(range(len(g)), g[y].values, alpha=0.4, label=str(k))
1584
+ plt.legend()
1585
+ plt.title(f"{y} area plot by {group}")
1586
+ plt.tight_layout()
1587
+ plt.show()
1588
+ """)
1589
+
1590
+
1591
+ def viz_kde(df, col=None, by=None, sample_n=5000):
+ """Kernel density plot for a numeric column, optionally split by a category."""
+ return dedent("""
1593
+ import matplotlib.pyplot as plt
1594
+ from syntaxmatrix.display import show
1595
+
1596
+ _df = df.copy()
+
+ # column choices and sample size are injected from the template arguments
+ col = __SMX_COL__
+ by = __SMX_BY__
+ sample_n = __SMX_SAMPLE_N__
+
+ num_cols = _df.select_dtypes(include=['number','bool']).columns.tolist()
+ cat_cols = [c for c in _df.columns if c not in num_cols and _df[c].nunique(dropna=True) <= 12]
1599
+
1600
+ if col is None or col not in _df.columns:
1601
+ col = num_cols[0] if num_cols else None
1602
+ if by is None or by not in _df.columns:
1603
+ by = cat_cols[0] if cat_cols else None
1604
+
1605
+ if col is None:
1606
+ show("⚠ No numeric column for density plot.")
1607
+ else:
1608
+ dplot = _df[[c for c in [col,by] if c]].dropna()
1609
+ if len(dplot) > sample_n:
1610
+ dplot = dplot.sample(sample_n, random_state=42)
1611
+
1612
+ plt.figure(figsize=(6,3.5))
1613
+ try:
+ import seaborn as sns  # optional dependency; fall back to pandas KDE below
+ if by is None:
1615
+ sns.kdeplot(data=dplot, x=col, fill=True)
1616
+ plt.title(f"Density of {col}")
1617
+ else:
1618
+ sns.kdeplot(data=dplot, x=col, hue=by, fill=True, common_norm=False)
1619
+ plt.title(f"Density of {col} by {by}")
1620
+ except Exception:
1621
+ # matplotlib fallback
1622
+ if by is None:
1623
+ dplot[col].plot(kind="kde")
1624
+ else:
1625
+ for k, g in dplot.groupby(by):
1626
+ g[col].plot(kind="kde", label=str(k))
1627
+ plt.legend()
1628
+ plt.tight_layout()
1629
+ plt.show()
1630
+ """)
1631
+
1632
+
1633
+ def viz_count_bar(df, category_col=None, top_k=12):
1634
+ return dedent("""
1635
+ import matplotlib.pyplot as plt
1636
+ from syntaxmatrix.display import show
1637
+
1638
+ _df = df.copy()
+
+ # category hint and top-k are injected from the template arguments
+ category_col = __SMX_CAT_HINT__
+ top_k = __SMX_TOPK__
+
+ # Auto-pick a sensible categorical column if none provided
1641
+ if category_col is None or category_col not in _df.columns:
1642
+ num_cols = _df.select_dtypes(include=['number','bool']).columns.tolist()
1643
+ cat_cols = [
1644
+ c for c in _df.columns
1645
+ if c not in num_cols
1646
+ and (_df[c].dtype == 'object' or str(_df[c].dtype).startswith('category') or _df[c].nunique(dropna=True) <= 25)
1647
+ ]
1648
+ # Prefer low-cardinality cols
1649
+ cat_cols = [c for c in cat_cols if 2 <= _df[c].nunique(dropna=True) <= 25]
1650
+ category_col = cat_cols[0] if cat_cols else None
1651
+
1652
+ if category_col is None:
1653
+ show("⚠ No categorical column available for count bar chart.")
1654
+ else:
1655
+ s = _df[category_col].astype(str)
1656
+ vc = s.value_counts()
1657
+
1658
+ # Trim long tails so the bar stays readable
1659
+ if len(vc) > top_k:
1660
+ head = vc.head(top_k)
1661
+ tail_sum = vc.iloc[top_k:].sum()
1662
+ vc = head.copy()
1663
+ if tail_sum > 0:
1664
+ vc.loc["Other"] = tail_sum
1665
+
1666
+ plt.figure(figsize=(7, 3.8))
1667
+ plt.bar(vc.index.astype(str), vc.values)
1668
+ plt.xticks(rotation=0, ha="center")
1669
+ plt.title(f"Counts by {category_col}")
1670
+ plt.ylabel("Count")
1671
+ plt.tight_layout()
1672
+ plt.show()
1673
+
1674
+ show(vc.rename("count").reset_index().rename(columns={"index": category_col}))
1675
+ """)
1676
+
1677
+
1678
+ def viz_scatter(df, x=None, y=None, hue=None, sample_n=2000):
1681
+ return dedent("""
1682
+ import matplotlib.pyplot as plt
1683
+ from syntaxmatrix.display import show
1684
+
1685
+ _df = df.copy()
+
+ # column choices and sample size are injected from the template arguments
+ x = __SMX_X__
+ y = __SMX_Y__
+ hue = __SMX_HUE__
+ sample_n = __SMX_SAMPLE_N__
+
+ num_cols = _df.select_dtypes(include=['number','bool']).columns.tolist()
1688
+ cat_cols = [
1689
+ c for c in _df.columns
1690
+ if c not in num_cols
1691
+ and (_df[c].dtype == 'object' or str(_df[c].dtype).startswith('category') or _df[c].nunique(dropna=True) <= 20)
1692
+ ]
1693
+ cat_cols = [c for c in cat_cols if 2 <= _df[c].nunique(dropna=True) <= 20]
1694
+
1695
+ if x is None or x not in _df.columns:
1696
+ x = num_cols[0] if len(num_cols) > 0 else None
1697
+ if y is None or y not in _df.columns:
1698
+ y = num_cols[1] if len(num_cols) > 1 else None
1699
+ if hue is None or hue not in _df.columns:
1700
+ hue = cat_cols[0] if cat_cols else None
1701
+
1702
+ if x is None or y is None:
1703
+ show("⚠ Not enough numeric columns for scatter plot.")
1704
+ else:
1705
+ cols = [c for c in [x, y, hue] if c is not None]
1706
+ dplot = _df[cols].dropna()
1707
+
1708
+ if len(dplot) > sample_n:
1709
+ dplot = dplot.sample(sample_n, random_state=42)
1710
+
1711
+ plt.figure(figsize=(6, 4))
1712
+ if hue is None:
1713
+ plt.scatter(dplot[x], dplot[y], alpha=0.6)
1714
+ plt.title(f"{y} vs {x}")
1715
+ plt.xlabel(x); plt.ylabel(y)
1716
+ else:
1717
+ try:
+ import seaborn as sns  # optional dependency; fall back to matplotlib below
+ ax = sns.scatterplot(data=dplot, x=x, y=y, hue=hue)
1719
+ ax.set_title(f"{y} vs {x} by {hue}")
1720
+ except Exception:
1721
+ for k, g in dplot.groupby(hue):
1722
+ plt.scatter(g[x], g[y], label=str(k), alpha=0.6)
1723
+ plt.legend()
1724
+ plt.title(f"{y} vs {x} by {hue}")
1725
+ plt.xlabel(x); plt.ylabel(y)
1726
+
1727
+ plt.tight_layout()
1728
+ plt.show()
1729
+ """)
1730
+
1731
+
1732
+ def viz_box(df, x=None, y=None, sample_n=3000):
1733
+ return dedent("""
1734
+ import matplotlib.pyplot as plt
1735
+ from syntaxmatrix.display import show
1736
+
1737
+ _df = df.copy()
+
+ # column choices and sample size are injected from the template arguments
+ x = __SMX_X__
+ y = __SMX_Y__
+ sample_n = __SMX_SAMPLE_N__
+
+ # Identify numeric and categorical candidates
+ num_cols = _df.select_dtypes(include=['number','bool']).columns.tolist()
1741
+ cat_cols = [
1742
+ c for c in _df.columns
1743
+ if c not in num_cols
1744
+ and (_df[c].dtype == 'object' or str(_df[c].dtype).startswith('category') or _df[c].nunique(dropna=True) <= 25)
1745
+ ]
1746
+ cat_cols = [c for c in cat_cols if 2 <= _df[c].nunique(dropna=True) <= 25]
1747
+
1748
+ # Auto-pick y (numeric) and x (categorical) if not provided
1749
+ if y is None or y not in _df.columns:
1750
+ y = num_cols[0] if num_cols else None
1751
+ if x is None or x not in _df.columns:
1752
+ x = cat_cols[0] if cat_cols else None
1753
+
1754
+ if y is None:
1755
+ show("⚠ No numeric column available for box plot.")
1756
+ else:
1757
+ cols = [c for c in [x, y] if c is not None]
1758
+ dplot = _df[cols].dropna()
1759
+
1760
+ if len(dplot) > sample_n:
1761
+ dplot = dplot.sample(sample_n, random_state=42)
1762
+
1763
+ if x is None:
1764
+ plt.figure(figsize=(5.5, 3.8))
1765
+ plt.boxplot(dplot[y])
1766
+ plt.title(f"Distribution of {y}")
1767
+ plt.ylabel(y)
1768
+ else:
1769
+ # seaborn if available, else matplotlib grouped box
1770
+ try:
+ import seaborn as sns  # optional dependency; fall back to matplotlib below
+ ax = sns.boxplot(data=dplot, x=x, y=y)
1772
+ ax.set_title(f"{y} by {x}")
1773
+ ax.set_xlabel(x); ax.set_ylabel(y)
1774
+ except Exception:
1775
+ groups = [g[y].values for _, g in dplot.groupby(x)]
1776
+ labels = [str(k) for k in dplot.groupby(x).groups.keys()]
1777
+ plt.figure(figsize=(7.5, 3.8))
1778
+ plt.boxplot(groups, labels=labels)
1779
+ plt.title(f"{y} by {x}")
1780
+ plt.xlabel(x); plt.ylabel(y)
1781
+
1782
+ plt.tight_layout()
1783
+ plt.show()
1784
+
1785
+ # Show a quick summary table too
1786
+ if x is None:
1787
+ show(dplot[y].describe())
1788
+ else:
1789
+ show(dplot.groupby(x)[y].describe())
1790
+ """)