syntaxmatrix-2.3.5-py3-none-any.whl → syntaxmatrix-2.5.5.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- syntaxmatrix/agentic/__init__.py +0 -0
- syntaxmatrix/agentic/agent_tools.py +24 -0
- syntaxmatrix/agentic/agents.py +810 -0
- syntaxmatrix/agentic/code_tools_registry.py +37 -0
- syntaxmatrix/agentic/model_templates.py +1790 -0
- syntaxmatrix/commentary.py +134 -112
- syntaxmatrix/core.py +385 -245
- syntaxmatrix/dataset_preprocessing.py +218 -0
- syntaxmatrix/display.py +89 -37
- syntaxmatrix/gpt_models_latest.py +5 -4
- syntaxmatrix/profiles.py +19 -4
- syntaxmatrix/routes.py +947 -141
- syntaxmatrix/settings/model_map.py +38 -30
- syntaxmatrix/static/icons/hero_bg.jpg +0 -0
- syntaxmatrix/templates/dashboard.html +248 -54
- syntaxmatrix/utils.py +2254 -84
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.4.dist-info}/METADATA +16 -17
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.4.dist-info}/RECORD +21 -15
- syntaxmatrix/model_templates.py +0 -29
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.4.dist-info}/WHEEL +0 -0
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.4.dist-info}/licenses/LICENSE.txt +0 -0
- {syntaxmatrix-2.3.5.dist-info → syntaxmatrix-2.5.5.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1790 @@
# syntaxmatrix/model_templates.py
from textwrap import dedent

def classification(df, target=None):
    code = dedent("""
        # ==== CLASSIFICATION BASELINE (titles + shared SMX_SAMPLE_CAP) ====
        import numpy as np, pandas as pd
        import matplotlib.pyplot as plt
        from sklearn.model_selection import train_test_split
        from sklearn.compose import ColumnTransformer
        from sklearn.preprocessing import StandardScaler, OneHotEncoder
        from sklearn.pipeline import Pipeline
        from sklearn.linear_model import LogisticRegression
        from sklearn.metrics import (
            classification_report, confusion_matrix, roc_curve, auc,
            precision_recall_curve, average_precision_score, accuracy_score,
            f1_score, recall_score, precision_score
        )

        _work = df.copy()

        # --- 0) Sample cap (from PREFACE) ---
        try:
            CAP = int(SMX_SAMPLE_CAP)
        except Exception:
            CAP = 5000
        if len(_work) > CAP:
            _work = _work.sample(n=CAP, random_state=42)

        # --- 1) Choose target (use hint if valid, else heuristic) ---
        _hint = __SMX_TARGET_HINT__
        target = _hint if (_hint is not None and str(_hint) in _work.columns) else None
        if target is None:
            prefs = ['target','label','class','y','outcome','churn','default','is_fraud','clicked','purchased']
            for c in prefs:
                if c in _work.columns:
                    target = c; break
            if target is None:
                # choose low-cardinality column (str() guards non-string column names)
                cand = [(c, _work[c].nunique(dropna=True)) for c in _work.columns]
                cand = [c for c, k in cand if k <= 20 and str(c).lower() not in ('id','uuid')]
                target = cand[-1] if cand else None

        if target is None:
            show("No obvious classification target found.", title="Classification")
        elif _work[target].nunique(dropna=True) < 2:
            show(f"Target '{target}' has fewer than two classes.", title="Classification")
        else:
            X = _work.drop(columns=[target])
            y = _work[target].astype(str)

            num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
            cat_cols = X.select_dtypes(include=["object","category","string","bool"]).columns.tolist()

            # robust OneHot across sklearn versions (uses PREFACE helper if present)
            try:
                enc = _SMX_OHE()
            except Exception:
                enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

            pre = ColumnTransformer(
                transformers=[
                    ("num", Pipeline([("scaler", StandardScaler())]), num_cols) if num_cols else ("num","drop",[]),
                    ("cat", enc, cat_cols) if cat_cols else ("cat","drop",[]),
                ],
                remainder="drop"
            )

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y if y.nunique() > 1 else None
            )

            clf = Pipeline([
                ("pre", pre),
                ("est", LogisticRegression(max_iter=1000, class_weight="balanced"))
            ])
            clf.fit(X_train, y_train)

            y_pred = clf.predict(X_test)
            try:
                proba = clf.predict_proba(X_test)
                y_score = proba.max(axis=1)
            except Exception:
                proba, y_score = None, None

            # --- 2) Tables with explicit titles ---
            cr = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
            show(pd.DataFrame(cr).transpose(), title="Classification report")

            # Confusion matrix (robust labels)
            labels_list = sorted(list(pd.unique(y)))
            cm = confusion_matrix(y_test, y_pred, labels=labels_list)
            index = [f"true:{str(lbl)}" for lbl in labels_list]
            columns = [f"pred:{str(lbl)}" for lbl in labels_list]
            cm_df = pd.DataFrame(cm, index=index, columns=columns)
            show(cm_df, title="Confusion matrix")

            summary = {
                "accuracy": float(accuracy_score(y_test, y_pred)),
                "precision_macro": float(precision_score(y_test, y_pred, average="macro", zero_division=0)),
                "recall_macro": float(recall_score(y_test, y_pred, average="macro", zero_division=0)),
                "f1_macro": float(f1_score(y_test, y_pred, average="macro", zero_division=0)),
                "classes": int(len(labels_list)),
                "rows_used": int(len(_work))
            }
            show(summary, title="Metrics summary")

            # --- 3) ROC / PR curves for binary (best-effort) ---
            if proba is not None and len(labels_list) == 2:
                pos = labels_list[1]
                y_bin = (y_test == pos).astype(int)
                y_prob = proba[:, 1]
                fpr, tpr, _ = roc_curve(y_bin, y_prob)
                roc_auc = auc(fpr, tpr)

                fig, ax = plt.subplots(figsize=(6,5))
                ax.plot(fpr, tpr)
                ax.plot([0,1],[0,1], linestyle="--")
                ax.set_title(f"ROC curve (AUC={roc_auc:.3f})")
                ax.set_xlabel("FPR"); ax.set_ylabel("TPR")
                plt.tight_layout(); plt.show()

                prec, rec, _ = precision_recall_curve(y_bin, y_prob)
                ap = average_precision_score(y_bin, y_prob)
                fig2, ax2 = plt.subplots(figsize=(6,4))
                ax2.plot(rec, prec)
                ax2.set_title(f"Precision–Recall (AP={ap:.3f})")
                ax2.set_xlabel("Recall"); ax2.set_ylabel("Precision")
                plt.tight_layout(); plt.show()

            # --- 4) Predictions sample (captioned) ---
            out = X_test.copy()
            out["_true"] = y_test.values
            out["_pred"] = y_pred
            if y_score is not None:
                out["_score"] = y_score
            show(out.head(20), title="Predictions (sample)")
    """)
    return code.replace("__SMX_TARGET_HINT__", repr(target))

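These template functions only render source strings; nothing in this module executes the generated code itself. Below is a minimal sketch of the host-side wiring, assuming the caller injects the `df`, `show`, `SMX_SAMPLE_CAP`, and `_SMX_OHE` names that the generated code references (the comments above attribute them to a "PREFACE" block; the exact wiring inside syntaxmatrix is not shown in this diff):

# Hypothetical harness, not taken from the package: exec the rendered template
# with the globals its own comments say the PREFACE provides.
import pandas as pd
from sklearn.preprocessing import OneHotEncoder  # sparse_output= needs sklearn >= 1.2

def run_template(template_fn, df, **kwargs):
    code = template_fn(df, **kwargs)  # e.g. classification(df, target="churn")
    env = {
        "df": df,
        "show": lambda obj, title=None: print(title or "", obj, sep="\n"),  # stand-in UI sink
        "SMX_SAMPLE_CAP": 5000,  # shared sample cap read by the templates
        "_SMX_OHE": lambda: OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    }
    exec(code, env)  # generated code resolves only these names plus its own imports
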
def regression(df, target=None):
    code = dedent("""
        # ==== REGRESSION BASELINE (titles + shared SMX_SAMPLE_CAP) ====
        import numpy as np, pandas as pd
        import matplotlib.pyplot as plt
        from sklearn.model_selection import train_test_split
        from sklearn.compose import ColumnTransformer
        from sklearn.preprocessing import StandardScaler, OneHotEncoder
        from sklearn.pipeline import Pipeline
        from sklearn.linear_model import Ridge
        from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

        _work = df.copy()
        try:
            CAP = int(SMX_SAMPLE_CAP)
        except Exception:
            CAP = 5000
        if len(_work) > CAP:
            _work = _work.sample(n=CAP, random_state=42)

        # target pick (hint first)
        _hint = __SMX_TARGET_HINT__
        target = _hint if (_hint is not None and str(_hint) in _work.columns) else None

        if target is None:
            num_cols_all = _work.select_dtypes(include=[np.number]).columns.tolist()
            for c in ['target','y','price','amount','value','score','sales','revenue']:
                if c in num_cols_all:
                    target = c; break
            if target is None and num_cols_all:
                target = num_cols_all[-1]

        if target is None:
            show("No numeric target found for regression.", title="Regression")
        else:
            X = _work.drop(columns=[target]); y = _work[target].astype(float)

            num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
            cat_cols = X.select_dtypes(include=["object","category","string","bool"]).columns.tolist()

            try:
                enc = _SMX_OHE()
            except Exception:
                enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

            pre = ColumnTransformer(
                transformers=[
                    ("num", Pipeline([("scaler", StandardScaler())]), num_cols) if num_cols else ("num","drop",[]),
                    ("cat", enc, cat_cols) if cat_cols else ("cat","drop",[]),
                ],
                remainder="drop"
            )

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            model = Pipeline([("pre", pre), ("est", Ridge(alpha=1.0, random_state=42))])
            model.fit(X_train, y_train)

            y_pred = model.predict(X_test)

            mse = mean_squared_error(y_test, y_pred)
            rmse = float(np.sqrt(mse))
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            show({"MAE": float(mae), "MSE": float(mse), "RMSE": rmse, "R²": float(r2), "rows_used": int(len(_work))},
                 title="Regression metrics")

            fig, ax = plt.subplots(figsize=(6,5))
            ax.scatter(y_test, y_pred, s=18, alpha=0.7)
            ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], linestyle="--")
            ax.set_title("Parity plot (y vs ŷ)"); ax.set_xlabel("Actual"); ax.set_ylabel("Predicted")
            plt.tight_layout(); plt.show()

            resid = y_test - y_pred
            fig2, ax2 = plt.subplots(figsize=(6,4))
            ax2.scatter(y_pred, resid, s=16, alpha=0.7)
            ax2.axhline(0.0, linestyle="--")
            ax2.set_title("Residuals vs predicted"); ax2.set_xlabel("Predicted"); ax2.set_ylabel("Residual")
            plt.tight_layout(); plt.show()

            out = X_test.copy(); out["_actual"] = y_test.values; out["_pred"] = y_pred; out["_residual"] = resid
            show(out.head(20), title="Predictions (sample)")
    """)
    return code.replace("__SMX_TARGET_HINT__", repr(target))

def multilabel_classification(df, label_cols):
    """
    Baseline multi-label pipeline:
    - X: numeric features only (excludes label_cols)
    - y: df[label_cols] (2D binary frame)
    - Model: OneVsRest(LogisticRegression)
    - Metrics: subset accuracy, hamming loss, micro/macro F1, per-label ROC AUC
    - Confusion matrices: from_predictions per label (no estimator wrapper)
    """
    return dedent(f"""
        import numpy as np
        import pandas as pd
        import matplotlib.pyplot as plt
        from sklearn.preprocessing import StandardScaler
        from sklearn.linear_model import LogisticRegression
        from sklearn.multiclass import OneVsRestClassifier
        from sklearn.pipeline import Pipeline
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import (
            accuracy_score, hamming_loss, f1_score, roc_auc_score,
            classification_report, ConfusionMatrixDisplay
        )

        LABEL_COLS = {list(label_cols)}

        # X = numeric features only, drop labels
        X = df.drop(columns=LABEL_COLS).select_dtypes(include=['number','bool']).copy()
        y = df[LABEL_COLS].astype(int).copy()

        if X.empty:
            raise ValueError("No numeric features available for multi-label classification.")
        if y.shape[1] < 2:
            raise ValueError("Need at least two label columns for multi-label classification.")

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y.sum(axis=1) if y.sum(axis=1).nunique()>1 else None
        )

        pipeline = Pipeline(steps=[
            ("scaler", StandardScaler(with_mean=False)),
            ("clf", OneVsRestClassifier(LogisticRegression(max_iter=200, n_jobs=None)))
        ])

        pipeline.fit(X_train, y_train)

        y_pred = pipeline.predict(X_test)
        # Probas for AUC (fallback to zeros if not available)
        try:
            y_proba = pipeline.predict_proba(X_test)
            y_proba = np.column_stack([p[:,1] if p.ndim==2 else p for p in y_proba])
        except Exception:
            y_proba = np.zeros_like(y_pred, dtype=float)

        # Aggregate metrics
        metrics_row = {{
            "accuracy": accuracy_score(y_test, y_pred),
            "hamming_loss": hamming_loss(y_test, y_pred),
            "f1_micro": f1_score(y_test, y_pred, average="micro", zero_division=0),
            "f1_macro": f1_score(y_test, y_pred, average="macro", zero_division=0),
        }}
        # macro ROC AUC if we have probabilities
        try:
            metrics_row["roc_auc_macro"] = roc_auc_score(y_test, y_proba, average="macro")
        except Exception:
            metrics_row["roc_auc_macro"] = np.nan

        show(pd.DataFrame([metrics_row]))

        # Per-label report and ROC AUC
        report_rows = []
        for j, col in enumerate(LABEL_COLS):
            try:
                auc = roc_auc_score(y_test.iloc[:, j], y_proba[:, j]) if y_proba.size else np.nan
            except Exception:
                auc = np.nan
            report = classification_report(
                y_test.iloc[:, j], y_pred[:, j], output_dict=True, zero_division=0
            )
            report_rows.append({{"label": col, "roc_auc": auc}})
        show(pd.DataFrame(report_rows))

        # Confusion matrices per label — use from_predictions (no estimator wrapper needed)
        n = len(LABEL_COLS)
        ncols = 3 if n >= 3 else n
        nrows = int(np.ceil(n / ncols)) if ncols else 1
        fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(4*ncols, 3*nrows))
        axes = axes.ravel() if n > 1 else [axes]
        for i, col in enumerate(LABEL_COLS[:len(axes)]):
            ConfusionMatrixDisplay.from_predictions(
                y_test.iloc[:, i], y_pred[:, i], ax=axes[i], cmap=plt.cm.Blues
            )
            axes[i].set_title(col)
        plt.tight_layout()
        plt.show()
    """)

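A note on the escaping above: `multilabel_classification` renders with `dedent(f"""...""")`, so single-brace expressions such as `{list(label_cols)}` are substituted when the template is rendered, while doubled braces such as `{{}}` survive into the generated code as literal dict braces. In miniature:

from textwrap import dedent

label_cols = ["a", "b"]
print(dedent(f"""
    LABEL_COLS = {list(label_cols)}  # filled in at render time
    metrics = {{}}  # doubled braces emit a literal empty dict
"""))
# -> LABEL_COLS = ['a', 'b']
# -> metrics = {}
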
def eda_overview(df):
    return dedent("""
        # ── Auto-generated EDA overview ───────────────
        import pandas as pd
        import matplotlib.pyplot as plt
        import seaborn as sns

        _df = df.copy()
        num_cols = _df.select_dtypes(include=['number', 'bool']).columns.tolist()

        if num_cols:
            summary = _df[num_cols].describe().T.reset_index().rename(columns={'index': 'feature'})
            show(summary)

            sample = _df[num_cols]
            if len(sample) > 500:
                sample = sample.sample(500, random_state=42)

            sns.pairplot(sample)
            plt.tight_layout()
            plt.show()
        else:
            show("No numeric columns available for EDA overview.")
    """)


def eda_correlation(df):
    return dedent("""
        # ── Auto-generated correlation analysis ───────────────
        import pandas as pd
        import matplotlib.pyplot as plt
        import seaborn as sns

        _df = df.copy()
        num_cols = _df.select_dtypes(include=['number', 'bool']).columns.tolist()
        if not num_cols:
            raise ValueError("No numeric columns available for correlation analysis.")

        corr = _df[num_cols].corr()
        show(corr)

        plt.figure(figsize=(8, 6))
        sns.heatmap(corr, annot=False, cmap="coolwarm", center=0)
        plt.title("Correlation heatmap (numeric features)")
        plt.tight_layout()
        plt.show()
    """)

def anomaly_detection(df):
    return dedent("""
        # ── Auto-generated IsolationForest anomaly detection ─────────────
        import numpy as np
        import pandas as pd
        from sklearn.ensemble import IsolationForest
        from sklearn.preprocessing import OneHotEncoder
        from sklearn.compose import ColumnTransformer
        from sklearn.pipeline import Pipeline
        from IPython.display import display, HTML

        # Split numeric vs categorical for simple preprocessing
        num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        cat_cols = [c for c in df.columns if c not in num_cols]

        if len(num_cols) + len(cat_cols) == 0:
            raise ValueError("No usable columns for anomaly detection.")

        preproc = ColumnTransformer(
            transformers=[
                ("num", "passthrough", num_cols),
                ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
            ],
            remainder="drop",
            verbose_feature_names_out=False,
        )

        model = IsolationForest(
            n_estimators=300,
            contamination="auto",
            random_state=42
        )

        pipe = Pipeline([
            ("prep", preproc),
            ("iso", model),
        ])

        X = df[num_cols + cat_cols].copy()
        pipe.fit(X)

        # More negative = more anomalous in sklearn's score_samples
        scores = pipe.named_steps["iso"].score_samples(pipe.named_steps["prep"].transform(X))
        out = df.copy()
        out["anomaly_score"] = -scores

        # Flag top 5% as anomalies (simple heuristic)
        threshold = np.percentile(out["anomaly_score"], 95)
        out["is_anomaly"] = out["anomaly_score"] >= threshold

        # Show the most anomalous rows
        top = out.sort_values("anomaly_score", ascending=False).head(20)
        display(HTML(top.to_html(index=False)))
    """)

def ts_anomaly_detection(df):
    return dedent("""
        # ==== TIME-SERIES ANOMALY DETECTION ====
        # Prefers STL (statsmodels). If not available, falls back to rolling-MAD.
        import numpy as np, pandas as pd
        import matplotlib.pyplot as plt

        _df = df.copy()

        # --- 1) Find a datetime column (or use datetime index) ---
        time_col = None
        if isinstance(_df.index, pd.DatetimeIndex):
            _df = _df.reset_index()
            # the restored index column keeps its own name (or "index"), so rename by position
            _df = _df.rename(columns={_df.columns[0]: "timestamp"})
            time_col = "timestamp"
        else:
            # try common names first, then dtype-based
            preferred = [c for c in _df.columns if ("date" in str(c).lower() or "time" in str(c).lower())]
            dt_candidates = preferred + _df.select_dtypes(include=["datetime64[ns]", "datetime64[ns, UTC]"]).columns.tolist()
            for c in dt_candidates or _df.columns.tolist():
                try:
                    _df[c] = pd.to_datetime(_df[c], errors="coerce")
                    if _df[c].notna().sum() >= 3:
                        time_col = c
                        break
                except Exception:
                    pass

        if time_col is None:
            show("No timestamp/datetime column found. Provide a column like 'date' or 'timestamp'.")
        else:
            # --- 2) Pick a numeric value column ---
            num_cols = _df.select_dtypes(include=[np.number]).columns.tolist()
            preferred_vals = [c for c in num_cols if any(k in str(c).lower() for k in ["value","amount","count","y","target"])]
            value_col = preferred_vals[0] if preferred_vals else (num_cols[0] if num_cols else None)

            if value_col is None:
                show("No numeric value column found for time-series analysis.")
            else:
                ts = _df[[time_col, value_col]].dropna().sort_values(time_col).set_index(time_col)

                # --- 3) Infer resample rule (D/W/M) ---
                def _choose_rule(idx):
                    if len(idx) < 3: return "D"
                    # median gap in seconds
                    arr = idx.view("i8")
                    diffs = np.diff(arr) / 1e9 if len(arr) > 1 else np.array([0.0])
                    med = np.median(diffs) if len(diffs) else 0.0
                    day = 86400.0
                    if med <= day: return "D"
                    if med <= 7 * day: return "W"
                    return "M"

                rule = _choose_rule(ts.index.values)
                period_map = {"D": 7, "W": 52, "M": 12}
                period = period_map.get(rule, 7)

                # --- 4) Resample & detect anomalies (STL or fallback) ---
                ts_res = ts.resample(rule).mean().dropna()
                used_statsmodels = False
                try:
                    from statsmodels.tsa.seasonal import STL
                    used_statsmodels = True
                    stl = STL(ts_res[value_col], robust=True, period=period)
                    res = stl.fit()
                    trend = res.trend
                    resid = res.resid
                    seasonal = res.seasonal
                    # robust z-score
                    mad = np.median(np.abs(resid - np.median(resid))) or 1e-8
                    z = np.abs(resid) / (1.4826 * mad)
                    anomalies = z > 3.5
                except Exception:
                    # --- Rolling-MAD fallback (no statsmodels required) ---
                    used_statsmodels = False
                    series = ts_res[value_col]
                    # choose an odd window scaled to series length
                    n = max(7, min(61, (len(series) // 10) * 2 + 1))
                    med = series.rolling(window=n, center=True, min_periods=max(3, n // 3)).median()
                    resid = series - med
                    mad = (np.abs(resid)).rolling(window=n, center=True, min_periods=max(3, n // 3)).median()
                    # robust scale; avoid zeros
                    scale = (1.4826 * mad).replace(0, np.nan)
                    scale = scale.fillna(scale.median() or 1e-8)
                    z = np.abs(resid) / scale
                    anomalies = z > 3.5
                    trend = med
                    seasonal = pd.Series(0.0, index=series.index)

                out = ts_res.copy()
                out["trend"] = trend.reindex(out.index)
                out["resid"] = resid.reindex(out.index)
                out["zscore"] = z.reindex(out.index)
                out["anomaly"] = anomalies.reindex(out.index).astype(bool)

                # --- 5) UI outputs (no prints) ---
                mode_note = "STL (statsmodels)" if used_statsmodels else "Rolling-MAD fallback"
                show({"method": mode_note, "frequency": rule, "period": period, "points": int(out.shape[0]), "anomalies": int(out["anomaly"].sum())})
                show(out[out["anomaly"]].head(30))

                # value + trend + anomalies
                fig, ax = plt.subplots(figsize=(9, 5))
                ax.plot(out.index, out[value_col], label="value")
                ax.plot(out.index, out["trend"], label="trend")
                ax.scatter(out.index[out["anomaly"]], out[value_col][out["anomaly"]], s=40, label="anomaly")
                ax.set_title(f"Time-series anomalies ({mode_note})")
                ax.set_xlabel("time"); ax.set_ylabel(value_col)
                ax.legend(loc="best"); plt.tight_layout(); plt.show()

                # robust z-scores
                fig2, ax2 = plt.subplots(figsize=(9, 3))
                ax2.plot(out.index, out["zscore"])
                ax2.axhline(3.5, linestyle="--")
                ax2.set_title("Robust z-score")
                ax2.set_xlabel("time"); ax2.set_ylabel("z")
                plt.tight_layout(); plt.show()

                # sample of last periods for quick inspection
                show(out.tail(12))
    """)

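Both the STL branch and the rolling fallback above flag points where |resid| / (1.4826 * MAD) exceeds 3.5. The 1.4826 factor rescales the median absolute deviation into a consistent estimate of the standard deviation for Gaussian residuals, so the ratio behaves like a z-score while staying insensitive to the very outliers being hunted; 3.5 is the cut-off conventionally quoted for modified z-scores. A toy check of the scaling:

import numpy as np

rng = np.random.default_rng(0)
resid = rng.normal(0.0, 2.0, 100_000)           # synthetic residuals with known sigma = 2
mad = np.median(np.abs(resid - np.median(resid)))
print(1.4826 * mad)                             # ~2.0: rescaled MAD recovers sigma
flagged = np.abs(resid) / (1.4826 * mad) > 3.5  # the template's robust-z rule
print(flagged.mean())                           # ~0.0005 false-positive rate on pure noise
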
def dimensionality_reduction(df):
    return dedent("""
        # ── Dimensionality Reduction (PCA + optional t-SNE) ───────────────
        import numpy as np, pandas as pd
        import matplotlib.pyplot as plt
        from sklearn.preprocessing import StandardScaler
        from sklearn.decomposition import PCA
        try:
            from sklearn.manifold import TSNE
            _HAS_TSNE = True
        except Exception:
            _HAS_TSNE = False
        from IPython.display import display, HTML

        _df = df.copy()
        num_cols = _df.select_dtypes(include=[np.number]).columns.tolist()
        if len(num_cols) < 2:
            raise ValueError("Need at least 2 numeric columns for PCA. Found: %d" % len(num_cols))

        X = _df[num_cols].astype(float).copy()
        scaler = StandardScaler()
        Xs = scaler.fit_transform(X)

        n_comp = int(min(10, Xs.shape[1]))
        pca = PCA(n_components=n_comp)
        Z = pca.fit_transform(Xs)

        # Explained variance table
        evr = pca.explained_variance_ratio_
        cum = np.cumsum(evr)
        stats = pd.DataFrame({
            "component": [f"PC{i+1}" for i in range(n_comp)],
            "explained_variance_ratio": evr,
            "cumulative_variance": cum
        })
        display(HTML("<h4>PCA explained variance</h4>" + stats.to_html(index=False)))

        # 2D scatter of PC1 vs PC2
        fig, ax = plt.subplots(figsize=(8, 5))
        ax.scatter(Z[:,0], Z[:,1], s=14, alpha=0.7)
        ax.set_xlabel("PC1")
        ax.set_ylabel("PC2")
        ax.set_title("PCA: PC1 vs PC2")
        plt.show()

        # Top absolute loadings for PC1 & PC2
        comps = pd.DataFrame(pca.components_[:2], columns=num_cols, index=["PC1","PC2"]).T
        top1 = comps["PC1"].abs().sort_values(ascending=False).head(10)
        top2 = comps["PC2"].abs().sort_values(ascending=False).head(10)
        display(HTML("<h4>Top |loadings| for PC1</h4>" + top1.to_frame("abs_loading").to_html()))
        display(HTML("<h4>Top |loadings| for PC2</h4>" + top2.to_frame("abs_loading").to_html()))

        # Optional t-SNE (only if sample size reasonable)
        if _HAS_TSNE and Xs.shape[0] >= 200:
            tsne = TSNE(n_components=2, init="pca", learning_rate="auto", perplexity=min(30, max(5, Xs.shape[0]//50)), random_state=42)
            Zt = tsne.fit_transform(Xs)
            fig2, ax2 = plt.subplots(figsize=(8, 5))
            ax2.scatter(Zt[:,0], Zt[:,1], s=8, alpha=0.7)
            ax2.set_title("t-SNE (2D)")
            plt.show()
    """)

def feature_selection(df):
    return dedent("""
        # ── Feature Selection (mutual info + permutation importance) ──────
        import numpy as np, pandas as pd
        import matplotlib.pyplot as plt
        from sklearn.model_selection import train_test_split
        from sklearn.preprocessing import StandardScaler, OneHotEncoder
        from sklearn.compose import ColumnTransformer
        from sklearn.pipeline import Pipeline
        from sklearn.linear_model import LogisticRegression, Ridge
        from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
        from sklearn.inspection import permutation_importance
        from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
        from IPython.display import display, HTML
        try:
            from syntaxmatrix.display import show  # UI-safe
        except Exception:
            pass

        _df = df.copy()

        # ---- 1) Pick target y (heuristics; fall back gracefully)
        target_candidates = [
            "target", "label", "y", "outcome", "class", "response", "target_var"
        ]
        ycol = None
        for c in target_candidates:
            if c in _df.columns:
                ycol = c; break

        _reason = None
        if ycol is None:
            # 1a) Prefer a low-cardinality non-ID column (classification)
            low_card = []
            for c in _df.columns:
                try:
                    nun = _df[c].nunique(dropna=True)
                    if 2 <= nun <= 20 and str(c).lower() not in ("id","uuid","index"):
                        low_card.append(c)
                except Exception:
                    pass
            if low_card:
                ycol = low_card[-1]
                try:
                    show(f"Using provisional classification target: '{ycol}' (low-cardinality)", title="Feature Selection")
                except Exception:
                    pass

        if ycol is None:
            # 1b) Else take a high-variance numeric (regression)
            num = _df.select_dtypes(include=[np.number])
            if not num.empty:
                try:
                    ycol = num.var().sort_values(ascending=False).index[0]
                    _reason = "highest-variance numeric"
                    try:
                        show(f"Using provisional regression target: '{ycol}' ({_reason})", title="Feature Selection")
                    except Exception:
                        pass
                except Exception:
                    ycol = None

        _can_run = ycol is not None
        if not _can_run:
            # Friendly message and a proxy output so the block still yields value
            try:
                show("Feature selection needs a target. None detected and none could be inferred. Showing numeric variance as a proxy.", title="Feature Selection")
                var_df = _df.select_dtypes(include=[np.number]).var().sort_values(ascending=False).to_frame('variance').reset_index().rename(columns={'index':'feature'})
                show(var_df.head(15), title="Numeric variance (proxy)")
            except Exception:
                pass
        else:
            # ---- 2) Build X/y and simple preprocessing
            X = _df.drop(columns=[ycol]).copy()
            y = _df[ycol].copy()

            num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
            cat_cols = [c for c in X.columns if c not in num_cols]

            # Robust encoder across sklearn versions / environments
            try:
                enc = _SMX_OHE()
            except NameError:
                enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

            preproc = ColumnTransformer(
                transformers=[
                    ("num", StandardScaler(with_mean=True, with_std=True), num_cols) if num_cols else ("num","drop",[]),
                    ("cat", enc, cat_cols) if cat_cols else ("cat","drop",[]),
                ],
                remainder="drop",
                verbose_feature_names_out=False,
            )

            # classify vs regress
            y_is_classification = (y.nunique() <= 20) and (y.dtype.kind in "biuO" or y.nunique() <= 10)

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.25, random_state=42, stratify=y if y_is_classification else None
            )

            if y_is_classification:
                base_est = LogisticRegression(max_iter=2000, n_jobs=None) if hasattr(LogisticRegression(), "n_jobs") else LogisticRegression(max_iter=2000)
                alt_est = RandomForestClassifier(n_estimators=200, random_state=42)
                mi_func = mutual_info_classif
                score_kw = {"scoring": "roc_auc"} if y.nunique()==2 else {"scoring": "balanced_accuracy"}
            else:
                try:
                    base_est = Ridge(random_state=42)
                except TypeError:
                    base_est = Ridge()
                alt_est = RandomForestRegressor(n_estimators=200, random_state=42)
                mi_func = mutual_info_regression
                score_kw = {"scoring": "r2"}

            pipe = Pipeline([("prep", preproc), ("est", base_est)])
            pipe.fit(X_train, y_train)

            # ---- 3) Mutual information (on one-hot expanded X)
            X_enc = pipe.named_steps["prep"].transform(X_train)
            # Get feature names after OHE
            try:
                ohe = pipe.named_steps["prep"].named_transformers_["cat"]
                if hasattr(ohe, 'get_feature_names_out'):
                    cat_feature_names = list(ohe.get_feature_names_out(cat_cols))
                else:
                    cat_feature_names = []
            except Exception:
                cat_feature_names = []
            feature_names = num_cols + cat_feature_names
            if len(feature_names) != (X_enc.shape[1] if hasattr(X_enc, 'shape') else len(feature_names)):
                # fallback if names length mismatch
                feature_names = [f"f{i}" for i in range(X_enc.shape[1])]

            # Mutual information scores (features first, then target)
            try:
                mi = mi_func(X_enc, y_train) if callable(mi_func) else np.zeros(len(feature_names))
            except Exception:
                mi = np.zeros(len(feature_names))
            mi_df = pd.DataFrame({"feature": feature_names, "mi": mi}).sort_values("mi", ascending=False)

            # ---- 4) Permutation importance on alt estimator
            pipe_alt = Pipeline([("prep", preproc), ("est", alt_est)])
            pipe_alt.fit(X_train, y_train)
            try:
                # score the fitted estimator on already-transformed features so the
                # importances line up with `feature_names`
                pi = permutation_importance(pipe_alt.named_steps["est"], pipe_alt.named_steps["prep"].transform(X_test), y_test, n_repeats=5, random_state=42, **score_kw)
                pi_df = pd.DataFrame({"feature": feature_names, "perm_importance_mean": pi.importances_mean}).sort_values("perm_importance_mean", ascending=False)
            except Exception:
                pi_df = pd.DataFrame({"feature": feature_names, "perm_importance_mean": np.zeros(len(feature_names))})

            # ---- 5) Show results
            show(mi_df.head(20), title="Mutual information (top features)")
            show(pi_df.head(20), title="Permutation importance (top features)")

            # Horizontal bars for permutation importance
            top = pi_df.head(15)[::-1]
            fig, ax = plt.subplots(figsize=(8, 6))
            ax.barh(top["feature"], top["perm_importance_mean"])
            ax.set_title("Top permutation importances")
            ax.set_xlabel("Importance (mean over repeats)")
            plt.tight_layout(); plt.show()
    """)

def time_series_forecasting(df):
    return dedent("""
        # ── Auto-generated baseline time-series forecast ─────────
        import numpy as np
        import pandas as pd
        import matplotlib.pyplot as plt
        from sklearn.linear_model import LinearRegression
        from sklearn.metrics import mean_absolute_error

        _df = df.copy()

        # 1) pick a datetime column
        dt_cols = [c for c in _df.columns if np.issubdtype(_df[c].dtype, np.datetime64)]
        if not dt_cols:
            name_hits = [c for c in _df.columns if any(k in str(c).lower()
                         for k in ["date","time","timestamp","datetime","ds","period"])]
            for c in name_hits:
                try:
                    _df[c] = pd.to_datetime(_df[c], errors="raise")
                    dt_cols = [c]
                    break
                except Exception:
                    continue

        if not dt_cols:
            raise ValueError("No datetime-like column found for time-series forecasting.")

        time_col = dt_cols[0]

        # 2) pick a numeric target column
        num_cols = [c for c in _df.select_dtypes(include=['number', 'bool']).columns if c != time_col]
        if not num_cols:
            raise ValueError("No numeric target available for time-series forecasting.")

        target = num_cols[0]

        ts = _df[[time_col, target]].dropna().sort_values(time_col)
        ts["time_idx"] = (ts[time_col] - ts[time_col].min()).dt.total_seconds() / 86400.0

        if len(ts) < 10:
            raise ValueError("Not enough data points for time-series forecasting (need >= 10 rows).")

        split_idx = int(len(ts) * 0.8)
        train, test = ts.iloc[:split_idx], ts.iloc[split_idx:]

        X_train = train[["time_idx"]].values
        y_train = train[target].values
        X_test = test[["time_idx"]].values
        y_test = test[target].values

        reg = LinearRegression()
        reg.fit(X_train, y_train)

        y_pred = reg.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        show({"MAE_forecast": mae})

        fig, ax = plt.subplots(figsize=(10, 4))
        ax.plot(train[time_col], train[target], label="train")
        ax.plot(test[time_col], y_test, label="test")
        ax.plot(test[time_col], y_pred, label="forecast")
        ax.legend()
        ax.set_title(f"Baseline time-series forecast for {target}")
        plt.tight_layout()
        plt.show()
    """)

def time_series_classification(df, entity_col, time_col, target_col):
    return dedent(f"""
        # ── Auto-generated time-series classification baseline ─────
        import numpy as np
        import pandas as pd
        from sklearn.model_selection import train_test_split
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.metrics import accuracy_score, classification_report

        _df = df.copy()

        # Drop rows missing key columns
        _df = _df.dropna(subset=['{entity_col}', '{time_col}', '{target_col}'])

        # Ensure datetime for the time column
        _df['{time_col}'] = pd.to_datetime(_df['{time_col}'], errors="coerce")
        _df = _df.dropna(subset=['{time_col}'])

        # Sort by entity then time
        _df = _df.sort_values(['{entity_col}', '{time_col}'])

        # Numeric features only (excluding target, entity, time)
        num_cols = _df.select_dtypes(include=['number', 'bool']).columns.tolist()
        for c in ['{target_col}', '{entity_col}', '{time_col}']:
            if c in num_cols:
                num_cols.remove(c)

        if not num_cols:
            raise ValueError("No numeric features available for time-series classification template.")

        # Aggregate sequence into per-entity features
        agg_spec = {{}}
        for c in num_cols:
            agg_spec[c] = ['mean', 'std', 'min', 'max', 'last']

        grouped = _df.groupby('{entity_col}').agg(agg_spec)

        # Flatten MultiIndex columns
        grouped.columns = [f"{{col}}_{{stat}}" for col, stat in grouped.columns]

        # Target per entity: last observed label
        y = _df.groupby('{entity_col}')['{target_col}'].last()

        # Align X and y on the same entities
        X, y = grouped.align(y, join="inner", axis=0)

        if X.empty:
            raise ValueError("No aggregated rows available for time-series classification.")

        # Train/test split by entities
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=42
        )

        clf = RandomForestClassifier(n_estimators=300, random_state=42)
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        show({{"Accuracy": acc}})

        report_df = pd.DataFrame(
            classification_report(y_test, y_pred, output_dict=True)
        ).T
        show(report_df)
    """)

def unknown_group_proxy_pack(df, group_col, unknown_tokens, numeric_cols, cat_cols, outcome_col=None):
    return dedent(f"""
        # ── Unknown Group: Proxy Insight Pack ──
        import numpy as np
        import pandas as pd
        import matplotlib.pyplot as plt

        _df = df.copy()

        if '{group_col}' not in _df.columns:
            show("Grouping column '{group_col}' not found; showing overall summary only.")
            show(_df.head())
        else:
            s = _df['{group_col}']
            s_norm = s.astype(str).str.strip().str.lower()
            _tokens = set({list({"unknown","not reported","not_reported","not known","n/a","na","none","nan","missing","unreported","unspecified","null","-",""})})
            _tokens.update({list(set(unknown_tokens))})
            is_unknown = s.isna() | s_norm.isin(_tokens)
            _df["_UnknownGroup"] = np.where(is_unknown, "Unknown/Not Reported", "Known")

            # 1) Size table (never errors)
            size_tbl = _df["_UnknownGroup"].value_counts(dropna=False).rename_axis("Group").reset_index(name="Count")
            total = len(_df) if len(_df) else 1
            size_tbl["Pct"] = (size_tbl["Count"] / total * 100).round(1)
            show(size_tbl)

            # 2) Numeric comparisons (auto-select; safe when empty)
            num_cols = [c for c in {list(set(numeric_cols))} if c in _df.columns and pd.api.types.is_numeric_dtype(_df[c])]
            if not num_cols:
                num_cols = _df.select_dtypes(include=['number','bool']).columns.tolist()[:6]

            if "_UnknownGroup" in _df.columns and num_cols:
                blocks = []
                for g, sub in _df.groupby("_UnknownGroup", dropna=False):
                    if sub.empty:
                        continue
                    desc = sub[num_cols].describe().T
                    desc.insert(0, "Group", g)
                    desc = desc.reset_index().rename(columns={{"index":"Variable","std":"Std","25%":"Q1","50%":"Median","75%":"Q3"}})
                    blocks.append(desc[["Variable","Group","count","mean","Median","Std","min","Q1","Q3","max"]])
                numeric_summary = pd.concat(blocks, ignore_index=True) if blocks else pd.DataFrame(
                    columns=["Variable","Group","count","mean","Median","Std","min","Q1","Q3","max"]
                )
                show(numeric_summary)

            # 3) Composition of categorical columns for Unknown group
            cat_cols = [c for c in {list(set(cat_cols))} if c in _df.columns]
            if "_UnknownGroup" in _df.columns and cat_cols:
                unk = _df[_df["_UnknownGroup"]=="Unknown/Not Reported"]
                comp_blocks = []
                if not unk.empty:
                    for c in cat_cols:
                        vc = unk[c].astype(str).str.strip().replace({{"nan":"(missing)","":"(blank)"}}).value_counts(normalize=True, dropna=False)
                        comp = vc.mul(100).round(1).rename_axis("level").reset_index(name="Pct")
                        comp.insert(0, "Variable", c)
                        comp_blocks.append(comp)
                lifestyle_comp = pd.concat(comp_blocks, ignore_index=True) if comp_blocks else pd.DataFrame(columns=["Variable","level","Pct"])
                show(lifestyle_comp)

            # 4) Visuals — guarded; fall back silently if plotting fails
            try:
                if "_UnknownGroup" in _df.columns:
                    ax = (size_tbl.set_index("Group")["Pct"]).plot(kind="bar", figsize=(5,3))
                    ax.set_ylabel("% of records")
                    ax.set_title(f"Known vs Unknown/Not Reported — {{'{group_col}'}}")
                    plt.tight_layout(); plt.show()
            except Exception:
                pass

            # 5) Optional outcome prevalence
            if {repr(outcome_col)} and {repr(outcome_col)} in _df.columns and pd.api.types.is_numeric_dtype(_df[{repr(outcome_col)}]):
                try:
                    prev = _df.groupby("_UnknownGroup")[{repr(outcome_col)}].mean() * 100.0
                    show(prev.rename("Prevalence_%").reset_index())
                except Exception:
                    pass

            # 6) Note on data capture
            note = (
                "Data capture: reduce 'Unknown/Not Reported' via intake prompts, pre-fill known values, "
                "audit repeated unknowns, and monitor Unknown rate over time and by site/channel."
            )
            show(note)
    """)

def viz_line(df, time_col=None, max_series=3, freq=None):
    """
    Plot up to `max_series` numeric columns against a detected datetime axis.
    - Detects a datetime/time-like column if `time_col` is None.
    - Optionally resamples to `freq` (e.g. 'D','W','M') if provided and evenly spaced lines are wanted.
    - Skips gracefully if no time or numeric columns are suitable.
    """
    return dedent(f"""
        import numpy as np
        import pandas as pd
        import matplotlib.pyplot as plt

        _df = df.copy()

        # 1) choose time column
        time_col = {repr(time_col)}  # may be None
        if time_col is None:
            dt_cols = [c for c in _df.columns if np.issubdtype(_df[c].dtype, np.datetime64)]
            if not dt_cols:
                # name hints as fallback
                keys = ["date","time","timestamp","datetime","ds","period"]
                for c in _df.columns:
                    n = str(c).lower()
                    if any(k in n for k in keys):
                        try:
                            _df[c] = pd.to_datetime(_df[c], errors="coerce")
                            if _df[c].notna().any():
                                dt_cols = [c]
                                break
                        except Exception:
                            pass
            time_col = dt_cols[0] if dt_cols else None

        if not time_col or time_col not in _df.columns:
            show("⚠ No datetime-like column detected for a line chart; skipping.")
        else:
            _df = _df.dropna(subset=[time_col]).sort_values(time_col)
            # 2) pick up to `max_series` numeric columns (by variance)
            num_cols = [c for c in _df.select_dtypes(include=['number','bool']).columns if c != time_col]
            scored = []
            for c in num_cols:
                v = _df[c].dropna()
                scored.append((float(v.var()) if len(v) else 0.0, c))
            scored.sort(reverse=True)
            keep = [c for _, c in scored[:{max_series}]]

            if not keep:
                show("⚠ No numeric columns available for a line chart; skipping.")
            else:
                plot_df = _df[[time_col] + keep].copy()
                # optional resample
                if {repr(freq)} and plot_df[time_col].notna().any():
                    plot_df = plot_df.set_index(time_col).resample({repr(freq)}).mean().reset_index()

                fig, ax = plt.subplots(figsize=(8, 4))
                for c in keep:
                    ax.plot(plot_df[time_col], plot_df[c], label=str(c))
                ax.set_xlabel(str(time_col))
                ax.set_ylabel("Value")
                ax.legend(loc="best", frameon=False)
                ax.set_title("Line chart")
                plt.tight_layout()
                plt.show()
    """)

def clustering(df):
    return dedent("""
        # ==== CLUSTERING BASELINE (KMeans + DBSCAN fallback) ====
        import numpy as np, pandas as pd
        from sklearn.pipeline import Pipeline
        from sklearn.impute import SimpleImputer
        from sklearn.preprocessing import StandardScaler
        from sklearn.cluster import KMeans, DBSCAN
        from sklearn.metrics import silhouette_score
        from sklearn.decomposition import PCA
        import matplotlib.pyplot as plt

        _work = df.copy()
        num_cols = _work.select_dtypes(include=[np.number]).columns.tolist()

        if len(num_cols) < 2:
            show(f"Clustering needs at least two numeric columns. Found: {num_cols}")
        else:
            X = _work[num_cols]
            pipe = Pipeline([
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler())
            ])
            Xp = pipe.fit_transform(X)

            n = Xp.shape[0]
            k_max = max(2, min(12, n - 1))
            best_k, best_sil = None, -1
            inertias, ks = [], []

            for k in range(2, k_max + 1):
                km = KMeans(n_clusters=k, n_init="auto", random_state=42)
                labels_k = km.fit_predict(Xp)
                if len(set(labels_k)) < 2:
                    continue
                sil = silhouette_score(Xp, labels_k)
                inertias.append(km.inertia_); ks.append(k)
                if sil > best_sil:
                    best_sil, best_k = sil, k

            model_label = "KMeans"
            if best_k is not None:
                model = KMeans(n_clusters=best_k, n_init="auto", random_state=42).fit(Xp)
                labels = model.labels_
                show({"model": model_label, "k": best_k, "silhouette": round(best_sil, 3)})
            else:
                model = DBSCAN(eps=0.8, min_samples=10).fit(Xp)
                labels = model.labels_
                model_label = "DBSCAN"
                show({"model": model_label})

            _work["cluster"] = labels
            show(_work["cluster"].value_counts().sort_index().rename("count").to_frame())

            prof = _work.groupby("cluster")[num_cols].agg(["mean","median","std","min","max","count"])
            show(prof)

            pca = PCA(n_components=2, random_state=42)
            comps = pca.fit_transform(Xp)
            fig, ax = plt.subplots(figsize=(7,5))
            for cl in sorted(set(labels)):
                mask = labels == cl
                ax.scatter(comps[mask,0], comps[mask,1], s=20, alpha=0.7, label=f"cluster {cl}")
            ax.set_title("PCA scatter of clusters"); ax.set_xlabel("PC1"); ax.set_ylabel("PC2")
            ax.legend(loc="best"); plt.tight_layout(); plt.show()

            if ks:
                fig2, ax2 = plt.subplots(figsize=(7,4))
                ax2.plot(ks, inertias, marker="o")
                ax2.set_title("KMeans inertia by k"); ax2.set_xlabel("k"); ax2.set_ylabel("Inertia (SSE)")
                plt.tight_layout(); plt.show()

            df[:] = _work
    """)

def recommendation(df):
|
|
1145
|
+
return dedent("""
|
|
1146
|
+
# ==== ITEM-ITEM RECOMMENDATION (Nearest Neighbours over mixed features) ====
|
|
1147
|
+
import numpy as np, pandas as pd
|
|
1148
|
+
from sklearn.compose import ColumnTransformer
|
|
1149
|
+
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
|
1150
|
+
from sklearn.pipeline import Pipeline
|
|
1151
|
+
from sklearn.neighbors import NearestNeighbors
|
|
1152
|
+
|
|
1153
|
+
_work = df.copy()
|
|
1154
|
+
|
|
1155
|
+
# --- 1) Identify features (numeric + categorical) ---
|
|
1156
|
+
num_cols = _work.select_dtypes(include=[np.number]).columns.tolist()
|
|
1157
|
+
cat_cols = _work.select_dtypes(include=["object", "category", "string"]).columns.tolist()
|
|
1158
|
+
|
|
1159
|
+
    # Heuristic: drop obvious IDs from features
    id_like = [c for c in _work.columns if (c.lower() in ("id","uid","uuid","record_id","row_id") or c.lower().endswith("_id"))]
    num_cols = [c for c in num_cols if c not in id_like]
    cat_cols = [c for c in cat_cols if c not in id_like]

    # Minimal guard
    if len(num_cols) + len(cat_cols) < 1:
        show("No usable feature columns for recommendation.")  # caption comes from PREFACE
    else:
        # --- 2) Build preprocessing (robust across sklearn versions) ---
        try:
            enc = _SMX_OHE()
        except NameError:
            # Fallback if PREFACE wasn't injected for some reason
            enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

        pre = ColumnTransformer(
            transformers=[
                ("num", Pipeline([("scaler", StandardScaler())]), num_cols) if num_cols else ("num", "drop", []),
                ("cat", enc, cat_cols) if cat_cols else ("cat", "drop", []),
            ],
            remainder="drop"
        )

        # --- 3) Sample cap for safety on huge tables ---
        N = len(_work)
        cap = min(N, 5000)
        _sample = _work.sample(n=cap, random_state=42) if N > cap else _work

        X = pre.fit_transform(_sample)
        if getattr(X, "shape", (0, 0))[0] < 2:
            show("Not enough rows to compute neighbours.")
        else:
            # --- 4) Fit cosine NN and pick a few anchors ---
            k = min(6, X.shape[0])  # includes self; we'll drop it
            nn = NearestNeighbors(metric="cosine", n_neighbors=k)
            nn.fit(X)

            # Anchor strategy: prefer rows with an id-like column; otherwise first few
            anchor_ids = None
            if id_like:
                anchor_ids = _sample[id_like[0]].head(min(5, len(_sample))).tolist()
                anchors = _sample.index[:len(anchor_ids)].tolist()
            else:
                anchors = _sample.index[:min(5, len(_sample))].tolist()

            # For readability, pick up to 4 descriptive (non-numeric) columns
            desc_cols = [c for c in _sample.columns if c in cat_cols][:4]
            meta_cols = (id_like[:1] + desc_cols)[:5]

            # --- 5) Build neighbour tables per anchor ---
            for pos, aidx in enumerate(anchors):
                # position of aidx inside _sample
                loc = list(_sample.index).index(aidx)
                dists, inds = nn.kneighbors(X[loc].reshape(1, -1), return_distance=True)
                dists, inds = dists[0].tolist(), inds[0].tolist()

                rows = []
                for dist, i in zip(dists, inds):
                    if i == loc:
                        continue  # drop self
                    ridx = _sample.index[i]
                    row = {"rank": len(rows) + 1, "distance": float(dist), "_index": int(ridx)}
                    for c in meta_cols:
                        if c in _sample.columns:
                            row[c] = _sample.loc[ridx, c]
                    rows.append(row)

                out = pd.DataFrame(rows)
                title = "Similar items" if not id_like else f"Similar to {id_like[0]}={_sample.loc[aidx, id_like[0]]}"
                show(out, title=title)

            # Summary
            feats = len(num_cols) + len(cat_cols)
            show({"rows_used": X.shape[0], "features": feats}, title="Recommendation set-up summary")
    """)


def topic_modelling(df):
    return dedent("""
    # ==== TOPIC MODELLING (LDA with safe fallback) ====
    import numpy as np, pandas as pd, re
    import matplotlib.pyplot as plt

    # --- 1) Pick a text column (or compose one) ---
    _df = df.copy()
    text_cols_named = [c for c in _df.columns if any(k in c.lower() for k in ["text","review","description","comment","notes","content","body","message","title"])]
    obj_cols = _df.select_dtypes(include=["object","string"]).columns.tolist()
    candidates = text_cols_named + [c for c in obj_cols if c not in text_cols_named]

    def _choose_text_col(d):
        best, best_score = None, -1
        for c in candidates or []:
            s = d[c].astype(str).fillna("")
            # token score: average length and alphabetic ratio
            tokens = s.str.split()
            score = float(tokens.map(len).mean() or 0) + float((s.str.contains(r"[A-Za-z]", regex=True)).mean()) * 2.0
            if score > best_score:
                best, best_score = c, score
        return best

    text_col = _choose_text_col(_df)
    if text_col is None:
        # build a composite text if nothing obvious
        parts = obj_cols[:4]
        if not parts:
            show("No suitable text columns found for topic modelling.")
        else:
            _df["_smx_text"] = _df[parts].astype(str).agg(" ".join, axis=1)
            text_col = "_smx_text"

    if text_col is not None:
        docs = _df[text_col].astype(str).fillna("").tolist()
        n_docs = len(docs)

        # --- 2) Choose topic count sensibly ---
        n_topics = int(np.clip(max(3, int(np.sqrt(max(1, n_docs / 50)))), 3, 12))

        # --- 3) Try LDA; if it fails, fall back to n-gram frequencies ---
        used_lda = False
        try:
            from sklearn.feature_extraction.text import CountVectorizer
            from sklearn.decomposition import LatentDirichletAllocation
            vect = CountVectorizer(stop_words="english", max_features=5000, ngram_range=(1,2))
            X = vect.fit_transform(docs)
            if X.shape[0] < 5 or X.shape[1] < 10:
                raise RuntimeError("Too little text to fit LDA.")
            lda = LatentDirichletAllocation(n_components=n_topics, learning_method="batch", random_state=42)
            W = lda.fit_transform(X)  # doc-topic
            H = lda.components_      # topic-term
            terms = np.array(vect.get_feature_names_out())

            # --- topic → top words table ---
            rows = []
            for k in range(n_topics):
                inds = np.argsort(H[k])[::-1][:12]
                words = terms[inds]
                weights = H[k, inds]
                rows.append({"topic": k, "top_terms": ", ".join(words[:10])})
            top_words = pd.DataFrame(rows)
            show(top_words, title="Topics and top terms")

            # --- doc dominant topic + prevalence ---
            dom = W.argmax(axis=1)
            strength = W.max(axis=1)
            _df["topic"] = dom
            _df["topic_score"] = strength
            # prevalence plot
            prev = pd.Series(dom).value_counts().sort_index()
            fig, ax = plt.subplots(figsize=(7,4))
            prev.plot(kind="bar", ax=ax)
            ax.set_title("Topic prevalence"); ax.set_xlabel("topic"); ax.set_ylabel("documents")
            plt.tight_layout(); plt.show()

            show(_df[["topic","topic_score"]].head(20), title="Document-topic sample")
            used_lda = True

        except Exception:
            # --- Fallback: simple n-gram frequency table ---
            try:
                from sklearn.feature_extraction.text import CountVectorizer
                vect = CountVectorizer(stop_words="english", max_features=3000, ngram_range=(1,2))
                X = vect.fit_transform(docs)
                counts = np.asarray(X.sum(axis=0)).ravel()
                terms = np.array(vect.get_feature_names_out())
                top = pd.DataFrame({"term": terms, "count": counts}).sort_values("count", ascending=False).head(30)
                show(top, title="Top terms (fallback)")
            except Exception:
                show("Text vectorisation unavailable; cannot compute topics.")
            used_lda = False

        # Summary
        show({"docs": n_docs, "topics": (n_topics if used_lda else 0)}, title="Topic modelling summary")
    """)


def viz_pie(df, category_col=None, top_k=8):
    """Generic pie chart of category shares."""
    return dedent("""
    import pandas as pd
    import matplotlib.pyplot as plt
    from syntaxmatrix.display import show

    _df = df.copy()

    # auto pick categorical column if not provided
    cat = __SMX_CAT_HINT__
    if cat is None or cat not in _df.columns:
        cat_cols = [c for c in _df.columns
                    if (_df[c].dtype == 'object' or str(_df[c].dtype).startswith('category'))
                    and _df[c].nunique(dropna=True) > 1]
        if not cat_cols:
            raise ValueError("No suitable categorical column for pie chart.")
        cat = cat_cols[0]

    s = _df[cat].astype(str).fillna("Missing").value_counts()
    if len(s) > __SMX_TOPK__:
        s = pd.concat([s.iloc[:__SMX_TOPK__], pd.Series({"Other": s.iloc[__SMX_TOPK__:].sum()})])

    pie_df = s.reset_index()
    pie_df.columns = [cat, "count"]
    pie_df["percent"] = (pie_df["count"] / pie_df["count"].sum() * 100).round(2)
    show(pie_df)

    plt.figure(figsize=(5,5))
    plt.pie(pie_df["count"], labels=pie_df[cat], autopct='%1.1f%%', startangle=90)
    plt.title(f"Composition of {cat}")
    plt.tight_layout()
    plt.show()
    """.replace("__SMX_CAT_HINT__", repr(category_col))
        .replace("__SMX_TOPK__", str(top_k)))


def viz_violin(df, x=None, y=None, hue=None, sample_n=2000):
    """Violin plot for numeric distribution across categories."""
    return dedent("""
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from syntaxmatrix.display import show

    _df = df.copy()

    xcol = __SMX_X__
    ycol = __SMX_Y__
    hcol = __SMX_HUE__

    if xcol is None or xcol not in _df.columns:
        cat_cols = [c for c in _df.columns
                    if (_df[c].dtype == 'object' or str(_df[c].dtype).startswith('category'))
                    and _df[c].nunique(dropna=True) > 1
                    and _df[c].nunique(dropna=True) <= 20]
        xcol = cat_cols[0] if cat_cols else None

    if ycol is None or ycol not in _df.columns:
        num_cols = _df.select_dtypes(include=['number','bool']).columns.tolist()
        ycol = num_cols[0] if num_cols else None

    if xcol is None or ycol is None:
        raise ValueError("Need one categorical (x) and one numeric (y) column for violin plot.")

    use_cols = [xcol, ycol]
    if hcol in _df.columns and hcol not in (xcol, ycol):
        use_cols.append(hcol)

    _work = _df[use_cols].dropna()
    if len(_work) > __SMX_SAMPLE_N__:
        _work = _work.sample(__SMX_SAMPLE_N__, random_state=42)

    # Use seaborn if available, else fallback to boxplot
    try:
        import seaborn as sns
        plt.figure(figsize=(7,4))
        sns.violinplot(
            data=_work,
            x=xcol, y=ycol,
            hue=hcol if hcol in _work.columns else None,
            cut=0
        )
        plt.title(f"{ycol} distribution by {xcol}")
        plt.tight_layout()
        plt.show()
    except Exception:
        plt.figure(figsize=(7,4))
        _work.boxplot(column=ycol, by=xcol, grid=False)
        plt.title(f"{ycol} by {xcol} (box fallback)")
        plt.suptitle("")
        plt.tight_layout()
        plt.show()

    show(_work.groupby(xcol)[ycol].describe().round(2))
    """.replace("__SMX_X__", repr(x))
        .replace("__SMX_Y__", repr(y))
        .replace("__SMX_HUE__", repr(hue))
        .replace("__SMX_SAMPLE_N__", str(sample_n)))


def viz_stacked_bar(df, x=None, hue=None, normalise=True, top_k=8):
    """Stacked (optionally % stacked) bar chart for two categoricals."""
    return dedent("""
    import pandas as pd
    import matplotlib.pyplot as plt
    from syntaxmatrix.display import show

    _df = df.copy()

    xcol = __SMX_X__
    hcol = __SMX_HUE__

    cat_cols = [c for c in _df.columns
                if (_df[c].dtype == 'object' or str(_df[c].dtype).startswith('category'))
                and _df[c].nunique(dropna=True) > 1
                and _df[c].nunique(dropna=True) <= 30]

    if xcol is None or xcol not in _df.columns:
        xcol = cat_cols[0] if cat_cols else None
    if hcol is None or hcol not in _df.columns:
        hcol = cat_cols[1] if len(cat_cols) > 1 else None

    if xcol is None or hcol is None:
        raise ValueError("Need two categorical columns for stacked bar chart.")

    _work = _df[[xcol, hcol]].dropna()

    keep_h = _work[hcol].astype(str).value_counts().index[:__SMX_TOPK__]
    _work[hcol] = _work[hcol].astype(str).where(_work[hcol].astype(str).isin(keep_h), other="Other")

    tab = pd.crosstab(_work[xcol].astype(str), _work[hcol].astype(str))
    show(tab)

    plot_tab = tab.copy()
    if __SMX_NORM__:
        plot_tab = plot_tab.div(plot_tab.sum(axis=1), axis=0) * 100

    ax = plot_tab.plot(kind="bar", stacked=True, figsize=(8,4))
    ax.set_title(
        f"{hcol} composition by {xcol}" + (" (%)" if __SMX_NORM__ else "")
    )
    ax.set_xlabel(xcol)
    ax.set_ylabel("Percent" if __SMX_NORM__ else "Count")
    plt.legend(title=hcol, bbox_to_anchor=(1.02, 1), loc="upper left")
    plt.tight_layout()
    plt.show()
    """.replace("__SMX_X__", repr(x))
        .replace("__SMX_HUE__", repr(hue))
        .replace("__SMX_NORM__", "True" if normalise else "False")
        .replace("__SMX_TOPK__", str(top_k)))


def viz_distribution(df, col=None, by=None, bins=30, sample_n=5000):
    """Histogram distribution for a numeric column, optionally split by a category."""
    return dedent("""
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from syntaxmatrix.display import show

    _df = df.copy()
    ncol = __SMX_COL__
    bcol = __SMX_BY__

    if ncol is None or ncol not in _df.columns:
        num_cols = _df.select_dtypes(include=['number','bool']).columns.tolist()
        ncol = num_cols[0] if num_cols else None

    if ncol is None:
        raise ValueError("No numeric column available for distribution plot.")

    if bcol is not None and bcol not in _df.columns:
        bcol = None

    use_cols = [ncol] + ([bcol] if bcol else [])
    _work = _df[use_cols].dropna()

    if len(_work) > __SMX_SAMPLE_N__:
        _work = _work.sample(__SMX_SAMPLE_N__, random_state=42)

    plt.figure(figsize=(7,4))
    if bcol:
        try:
            import seaborn as sns
            sns.histplot(
                data=_work, x=ncol, hue=bcol,
                bins=__SMX_BINS__,
                stat="density",
                common_norm=False,
                element="step"
            )
        except Exception:
            for k, g in _work.groupby(bcol):
                plt.hist(g[ncol], bins=__SMX_BINS__, alpha=0.5, density=True, label=str(k))
            plt.legend(title=bcol)
    else:
        plt.hist(_work[ncol], bins=__SMX_BINS__, alpha=0.8)

    plt.title(f"Distribution of {ncol}" + (f" by {bcol}" if bcol else ""))
    plt.xlabel(ncol)
    plt.ylabel("Density" if bcol else "Count")
    plt.tight_layout()
    plt.show()

    show(_work[ncol].describe().round(2))
    """.replace("__SMX_COL__", repr(col))
        .replace("__SMX_BY__", repr(by))
        .replace("__SMX_BINS__", str(bins))
        .replace("__SMX_SAMPLE_N__", str(sample_n)))


def viz_area(df, x=None, y=None, group=None, sample_n=3000):
    # richer time/area plot (useful for trends)
    return dedent("""
    import matplotlib.pyplot as plt
    from syntaxmatrix.display import show

    _df = df.copy()

    x = __SMX_X__
    y = __SMX_Y__
    group = __SMX_GROUP__

    # auto-pick numeric columns
    num_cols = _df.select_dtypes(include=['number','bool']).columns.tolist()
    cat_cols = [c for c in _df.columns if c not in num_cols and _df[c].nunique(dropna=True) <= 12]

    if x is None or x not in _df.columns:
        x = None  # area plot can be index-based
    if y is None or y not in _df.columns:
        y = num_cols[0] if num_cols else None
    if group is None or group not in _df.columns:
        group = cat_cols[0] if cat_cols else None

    if y is None:
        show("⚠ No numeric column for area plot.")
    else:
        dplot = _df[[c for c in [x, y, group] if c]].dropna()
        if len(dplot) > __SMX_SAMPLE_N__:
            dplot = dplot.sample(__SMX_SAMPLE_N__, random_state=42)

        if x:
            dplot = dplot.sort_values(x)

        plt.figure(figsize=(7,3.5))
        if group is None:
            plt.fill_between(range(len(dplot)), dplot[y].values, alpha=0.6)
            plt.title(f"Area plot of {y}")
        else:
            for k, g in dplot.groupby(group):
                plt.fill_between(range(len(g)), g[y].values, alpha=0.4, label=str(k))
            plt.legend()
            plt.title(f"{y} area plot by {group}")
        plt.tight_layout()
        plt.show()
    """.replace("__SMX_X__", repr(x))
        .replace("__SMX_Y__", repr(y))
        .replace("__SMX_GROUP__", repr(group))
        .replace("__SMX_SAMPLE_N__", str(sample_n)))


def viz_kde(df, col=None, by=None, sample_n=5000):
    return dedent("""
    import matplotlib.pyplot as plt
    from syntaxmatrix.display import show

    _df = df.copy()

    col = __SMX_COL__
    by = __SMX_BY__

    num_cols = _df.select_dtypes(include=['number','bool']).columns.tolist()
    cat_cols = [c for c in _df.columns if c not in num_cols and _df[c].nunique(dropna=True) <= 12]

    if col is None or col not in _df.columns:
        col = num_cols[0] if num_cols else None
    if by is None or by not in _df.columns:
        by = cat_cols[0] if cat_cols else None

    if col is None:
        show("⚠ No numeric column for density plot.")
    else:
        dplot = _df[[c for c in [col, by] if c]].dropna()
        if len(dplot) > __SMX_SAMPLE_N__:
            dplot = dplot.sample(__SMX_SAMPLE_N__, random_state=42)

        plt.figure(figsize=(6,3.5))
        try:
            import seaborn as sns
            if by is None:
                sns.kdeplot(data=dplot, x=col, fill=True)
                plt.title(f"Density of {col}")
            else:
                sns.kdeplot(data=dplot, x=col, hue=by, fill=True, common_norm=False)
                plt.title(f"Density of {col} by {by}")
        except Exception:
            # matplotlib fallback
            if by is None:
                dplot[col].plot(kind="kde")
            else:
                for k, g in dplot.groupby(by):
                    g[col].plot(kind="kde", label=str(k))
                plt.legend()
        plt.tight_layout()
        plt.show()
    """.replace("__SMX_COL__", repr(col))
        .replace("__SMX_BY__", repr(by))
        .replace("__SMX_SAMPLE_N__", str(sample_n)))


def viz_count_bar(df, category_col=None, top_k=12):
    return dedent("""
    import matplotlib.pyplot as plt
    from syntaxmatrix.display import show

    _df = df.copy()

    category_col = __SMX_CAT_HINT__

    # Auto-pick a sensible categorical column if none provided
    if category_col is None or category_col not in _df.columns:
        num_cols = _df.select_dtypes(include=['number','bool']).columns.tolist()
        cat_cols = [
            c for c in _df.columns
            if c not in num_cols
            and (_df[c].dtype == 'object' or str(_df[c].dtype).startswith('category') or _df[c].nunique(dropna=True) <= 25)
        ]
        # Prefer low-cardinality cols
        cat_cols = [c for c in cat_cols if 2 <= _df[c].nunique(dropna=True) <= 25]
        category_col = cat_cols[0] if cat_cols else None

    if category_col is None:
        show("⚠ No categorical column available for count bar chart.")
    else:
        s = _df[category_col].astype(str)
        vc = s.value_counts()

        # Trim long tails so the bar stays readable
        if len(vc) > __SMX_TOPK__:
            head = vc.head(__SMX_TOPK__)
            tail_sum = vc.iloc[__SMX_TOPK__:].sum()
            vc = head.copy()
            if tail_sum > 0:
                vc.loc["Other"] = tail_sum

        plt.figure(figsize=(7, 3.8))
        plt.bar(vc.index.astype(str), vc.values)
        plt.xticks(rotation=0, ha="center")
        plt.title(f"Counts by {category_col}")
        plt.ylabel("Count")
        plt.tight_layout()
        plt.show()

        show(vc.rename("count").reset_index().rename(columns={"index": category_col}))
    """.replace("__SMX_CAT_HINT__", repr(category_col))
        .replace("__SMX_TOPK__", str(top_k)))


def viz_scatter(df, x=None, y=None, hue=None, sample_n=2000):
    return dedent("""
    import matplotlib.pyplot as plt
    from syntaxmatrix.display import show

    _df = df.copy()

    x = __SMX_X__
    y = __SMX_Y__
    hue = __SMX_HUE__

    num_cols = _df.select_dtypes(include=['number','bool']).columns.tolist()
    cat_cols = [
        c for c in _df.columns
        if c not in num_cols
        and (_df[c].dtype == 'object' or str(_df[c].dtype).startswith('category') or _df[c].nunique(dropna=True) <= 20)
    ]
    cat_cols = [c for c in cat_cols if 2 <= _df[c].nunique(dropna=True) <= 20]

    if x is None or x not in _df.columns:
        x = num_cols[0] if len(num_cols) > 0 else None
    if y is None or y not in _df.columns:
        y = num_cols[1] if len(num_cols) > 1 else None
    if hue is None or hue not in _df.columns:
        hue = cat_cols[0] if cat_cols else None

    if x is None or y is None:
        show("⚠ Not enough numeric columns for scatter plot.")
    else:
        cols = [c for c in [x, y, hue] if c is not None]
        dplot = _df[cols].dropna()

        if len(dplot) > __SMX_SAMPLE_N__:
            dplot = dplot.sample(__SMX_SAMPLE_N__, random_state=42)

        plt.figure(figsize=(6, 4))
        if hue is None:
            plt.scatter(dplot[x], dplot[y], alpha=0.6)
            plt.title(f"{y} vs {x}")
            plt.xlabel(x); plt.ylabel(y)
        else:
            try:
                import seaborn as sns
                ax = sns.scatterplot(data=dplot, x=x, y=y, hue=hue)
                ax.set_title(f"{y} vs {x} by {hue}")
            except Exception:
                for k, g in dplot.groupby(hue):
                    plt.scatter(g[x], g[y], label=str(k), alpha=0.6)
                plt.legend()
                plt.title(f"{y} vs {x} by {hue}")
                plt.xlabel(x); plt.ylabel(y)

        plt.tight_layout()
        plt.show()
    """.replace("__SMX_X__", repr(x))
        .replace("__SMX_Y__", repr(y))
        .replace("__SMX_HUE__", repr(hue))
        .replace("__SMX_SAMPLE_N__", str(sample_n)))


def viz_box(df, x=None, y=None, sample_n=3000):
    return dedent("""
    import matplotlib.pyplot as plt
    from syntaxmatrix.display import show

    _df = df.copy()

    x = __SMX_X__
    y = __SMX_Y__

    # Identify numeric and categorical candidates
    num_cols = _df.select_dtypes(include=['number','bool']).columns.tolist()
    cat_cols = [
        c for c in _df.columns
        if c not in num_cols
        and (_df[c].dtype == 'object' or str(_df[c].dtype).startswith('category') or _df[c].nunique(dropna=True) <= 25)
    ]
    cat_cols = [c for c in cat_cols if 2 <= _df[c].nunique(dropna=True) <= 25]

    # Auto-pick y (numeric) and x (categorical) if not provided
    if y is None or y not in _df.columns:
        y = num_cols[0] if num_cols else None
    if x is None or x not in _df.columns:
        x = cat_cols[0] if cat_cols else None

    if y is None:
        show("⚠ No numeric column available for box plot.")
    else:
        cols = [c for c in [x, y] if c is not None]
        dplot = _df[cols].dropna()

        if len(dplot) > __SMX_SAMPLE_N__:
            dplot = dplot.sample(__SMX_SAMPLE_N__, random_state=42)

        if x is None:
            plt.figure(figsize=(5.5, 3.8))
            plt.boxplot(dplot[y])
            plt.title(f"Distribution of {y}")
            plt.ylabel(y)
        else:
            # seaborn if available, else matplotlib grouped box
            try:
                import seaborn as sns
                ax = sns.boxplot(data=dplot, x=x, y=y)
                ax.set_title(f"{y} by {x}")
                ax.set_xlabel(x); ax.set_ylabel(y)
            except Exception:
                groups = [g[y].values for _, g in dplot.groupby(x)]
                labels = [str(k) for k in dplot.groupby(x).groups.keys()]
                plt.figure(figsize=(7.5, 3.8))
                plt.boxplot(groups, labels=labels)
                plt.title(f"{y} by {x}")
                plt.xlabel(x); plt.ylabel(y)

        plt.tight_layout()
        plt.show()

        # Show a quick summary table too
        if x is None:
            show(dplot[y].describe())
        else:
            show(dplot.groupby(x)[y].describe())
    """.replace("__SMX_X__", repr(x))
        .replace("__SMX_Y__", repr(y))
        .replace("__SMX_SAMPLE_N__", str(sample_n)))
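

# Minimal sketch (hypothetical usage; the exec() pattern and column names are
# assumptions, not confirmed package behaviour): each viz_* helper returns
# Python source as a string, so a caller presumably executes it in a namespace
# that provides `df` -- the generated code imports matplotlib and `show` itself.
def _smx_example_run_template(df):
    code = viz_box(df, x="category", y="price")  # hypothetical column names
    exec(code, {"df": df})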