m8flow 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
@@ -1,745 +1,2907 @@
1
- """Pre-built node templates. Each template is just Python source defining `def run(...)`."""
2
-
3
- # ── Data ──────────────────────────────────────────────────────────────────────
4
-
5
- CSV_LOADER = '''import pandas as pd
6
- from typing import Annotated
7
-
8
- def run(file_path: Annotated[str, "file"] = "data.csv") -> dict:
9
- df = pd.read_csv(file_path)
10
- return {"data": df}
11
- '''
12
-
13
- CSV_EXPORTER = '''import pandas as pd
14
- import numpy as np
15
- import os
16
- from typing import Annotated
17
-
18
- def run(
19
- data=None,
20
- y_pred=None,
21
- y_test=None,
22
- file_path: Annotated[str, "file"] = "output.csv",
23
- ) -> dict:
24
- """Export any data to CSV. Accepts a DataFrame, or y_pred/y_test arrays from a model node."""
25
- if data is None:
26
- if y_pred is not None and y_test is not None:
27
- data = pd.DataFrame({"y_test": np.asarray(y_test).ravel(), "y_pred": np.asarray(y_pred).ravel()})
28
- elif y_pred is not None:
29
- data = pd.DataFrame({"predictions": np.asarray(y_pred).ravel()})
30
- else:
31
- raise ValueError("Provide 'data' (DataFrame) or connect a model node (y_pred/y_test)")
32
- if isinstance(data, (np.ndarray, list)):
33
- data = pd.DataFrame({"value": np.asarray(data).ravel()})
34
- if not isinstance(data, pd.DataFrame):
35
- data = pd.DataFrame(data)
36
- data.to_csv(file_path, index=False)
37
- return {"status": "saved", "path": os.path.abspath(file_path), "rows": len(data)}
38
- '''
39
-
40
- # ── EDA ───────────────────────────────────────────────────────────────────────
41
-
42
- EDA = '''import pandas as pd
43
- import numpy as np
44
-
45
- def run(data) -> dict:
46
- """Exploratory Data Analysis. Always passes 'data' through so it can be chained."""
47
- df = data.copy() if hasattr(data, "copy") else pd.DataFrame(data)
48
-
49
- shape = list(df.shape)
50
- dtypes = {col: str(dt) for col, dt in df.dtypes.items()}
51
-
52
- # Missing values
53
- miss = df.isnull().sum()
54
- missing = {
55
- col: {"count": int(miss[col]), "pct": round(float(miss[col] / len(df) * 100), 2)}
56
- for col in df.columns if miss[col] > 0
57
- }
58
-
59
- # Numeric summary
60
- num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
61
- numeric_summary = {}
62
- for col in num_cols:
63
- s = df[col]
64
- numeric_summary[col] = {
65
- "mean": round(float(s.mean()), 4),
66
- "median": round(float(s.median()), 4),
67
- "std": round(float(s.std()), 4),
68
- "min": round(float(s.min()), 4),
69
- "max": round(float(s.max()), 4),
70
- "skew": round(float(s.skew()), 4),
71
- }
72
-
73
- # Categorical summary
74
- cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
75
- categorical_summary = {}
76
- for col in cat_cols[:8]:
77
- vc = df[col].value_counts()
78
- categorical_summary[col] = {
79
- "unique": int(df[col].nunique()),
80
- "top": str(vc.index[0]) if len(vc) else "",
81
- "top_count": int(vc.iloc[0]) if len(vc) else 0,
82
- }
83
-
84
- # Correlations (top 10 pairs by absolute value)
85
- correlations = {}
86
- if len(num_cols) >= 2:
87
- corr = df[num_cols].corr()
88
- pairs = []
89
- for i in range(len(num_cols)):
90
- for j in range(i + 1, len(num_cols)):
91
- a, b = num_cols[i], num_cols[j]
92
- val = float(corr.loc[a, b])
93
- if not (val != val): # skip NaN
94
- pairs.append((abs(val), a, b, val))
95
- pairs.sort(reverse=True)
96
- correlations = {f"{a}—{b}": round(v, 4) for _, a, b, v in pairs[:10]}
97
-
98
- return {
99
- "data": df, # pass-through so chained nodes get the DataFrame
100
- "shape": shape,
101
- "dtypes": dtypes,
102
- "missing": missing,
103
- "numeric_summary": numeric_summary,
104
- "categorical_summary": categorical_summary,
105
- "correlations": correlations,
106
- }
107
- '''
108
-
109
- # ── Preprocessing ─────────────────────────────────────────────────────────────
110
-
111
- DATA_CLEANING = '''import pandas as pd
112
-
113
- def run(data, strategy: str = "drop", fill_value: float = 0.0) -> dict:
114
- df = data.copy()
115
- if strategy == "fill":
116
- df = df.fillna(fill_value)
117
- else:
118
- df = df.dropna()
119
- return {"data": df}
120
- '''
121
-
122
- LABEL_ENCODER = '''from sklearn.preprocessing import LabelEncoder
123
- import pandas as pd
124
-
125
- def run(data, columns: str = "") -> dict:
126
- """Encode categorical columns. Leave columns blank to encode all object columns."""
127
- df = data.copy()
128
- encoders = {}
129
- cols = [c.strip() for c in columns.split(",") if c.strip()] if columns.strip() else \
130
- df.select_dtypes(include=["object", "category"]).columns.tolist()
131
- for col in cols:
132
- if col in df.columns:
133
- le = LabelEncoder()
134
- df[col] = le.fit_transform(df[col].astype(str))
135
- encoders[col] = list(le.classes_)
136
- return {"data": df, "encoders": encoders}
137
- '''
138
-
139
- ONE_HOT_ENCODER = '''import pandas as pd
140
-
141
- def run(data, columns: str = "") -> dict:
142
- """One-hot encode categorical columns. Leave columns blank to encode all object/category columns."""
143
- df = data.copy()
144
- cols = [c.strip() for c in columns.split(",") if c.strip()] if columns.strip() else \
145
- df.select_dtypes(include=["object", "category"]).columns.tolist()
146
- if cols:
147
- df = pd.get_dummies(df, columns=cols, drop_first=False)
148
- # Convert boolean columns produced by get_dummies to int (0/1) for sklearn
149
- bool_cols = df.select_dtypes(include=["bool"]).columns.tolist()
150
- if bool_cols:
151
- df[bool_cols] = df[bool_cols].astype(int)
152
- return {"data": df}
153
- '''
154
-
155
- TRAIN_TEST_SPLIT = '''from sklearn.model_selection import train_test_split as _split
156
- from typing import Annotated
157
-
158
- def run(
159
- data,
160
- target_column: Annotated[str, "column"] = "target",
161
- test_size: float = 0.2,
162
- random_state: int = 42,
163
- ) -> dict:
164
- X = data.drop(columns=[target_column])
165
- y = data[target_column]
166
- X_train, X_test, y_train, y_test = _split(
167
- X, y, test_size=test_size, random_state=random_state
168
- )
169
- return {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test}
170
- '''
171
-
172
- STANDARD_SCALER = '''from sklearn.preprocessing import StandardScaler
173
-
174
- def run(X_train, X_test, y_train, y_test) -> dict:
175
- scaler = StandardScaler()
176
- X_train_s = scaler.fit_transform(X_train)
177
- X_test_s = scaler.transform(X_test)
178
- return {"X_train": X_train_s, "X_test": X_test_s, "y_train": y_train, "y_test": y_test}
179
- '''
180
-
181
- MIN_MAX_SCALER = '''from sklearn.preprocessing import MinMaxScaler
182
-
183
- def run(X_train, X_test, y_train, y_test) -> dict:
184
- scaler = MinMaxScaler()
185
- X_train_s = scaler.fit_transform(X_train)
186
- X_test_s = scaler.transform(X_test)
187
- return {"X_train": X_train_s, "X_test": X_test_s, "y_train": y_train, "y_test": y_test}
188
- '''
189
-
190
- PCA = '''from sklearn.decomposition import PCA
191
-
192
- def run(X_train, X_test, y_train, y_test, n_components: int = 2) -> dict:
193
- pca = PCA(n_components=n_components)
194
- X_train_p = pca.fit_transform(X_train)
195
- X_test_p = pca.transform(X_test)
196
- explained = float(sum(pca.explained_variance_ratio_))
197
- return {"X_train": X_train_p, "X_test": X_test_p, "y_train": y_train, "y_test": y_test,
198
- "explained_variance": round(explained, 4)}
199
- '''
200
-
201
- # ── Shared CV helper (inlined into each template to stay self-contained) ────────
202
- #
203
- # def _run_cv(model, X_train, y_train, cv_folds, scoring):
204
- # scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring=scoring)
205
- # return {"cv_mean": round(float(scores.mean()), 4),
206
- # "cv_std": round(float(scores.std()), 4),
207
- # "cv_scores": [round(float(s), 4) for s in scores]}
208
- #
209
- # ── Classifier models ─────────────────────────────────────────────────────────
210
-
211
- LOGISTIC_REGRESSION = '''from sklearn.linear_model import LogisticRegression
212
- from sklearn.model_selection import cross_val_score
213
-
214
- def run(
215
- X_train, X_test, y_train,
216
- C: float = 1.0, max_iter: int = 200, random_state: int = 42,
217
- cross_validation: bool = False, cv_folds: int = 5,
218
- ) -> dict:
219
- model = LogisticRegression(C=C, max_iter=max_iter, random_state=random_state)
220
- if cross_validation:
221
- scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="accuracy")
222
- model.fit(X_train, y_train)
223
- y_pred = model.predict(X_test)
224
- return {"model": model, "y_pred": y_pred,
225
- "cv_mean": round(float(scores.mean()), 4),
226
- "cv_std": round(float(scores.std()), 4),
227
- "cv_scores": [round(float(s), 4) for s in scores]}
228
- model.fit(X_train, y_train)
229
- y_pred = model.predict(X_test)
230
- return {"model": model, "y_pred": y_pred}
231
- '''
232
-
233
- RANDOM_FOREST_CLASSIFIER = '''from sklearn.ensemble import RandomForestClassifier
234
- from sklearn.model_selection import cross_val_score
235
-
236
- def run(
237
- X_train, X_test, y_train,
238
- n_estimators: int = 100, max_depth: int = 0, random_state: int = 42,
239
- cross_validation: bool = False, cv_folds: int = 5,
240
- ) -> dict:
241
- md = max_depth if max_depth > 0 else None
242
- model = RandomForestClassifier(n_estimators=n_estimators, max_depth=md, random_state=random_state)
243
- if cross_validation:
244
- scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="accuracy")
245
- model.fit(X_train, y_train)
246
- y_pred = model.predict(X_test)
247
- return {"model": model, "y_pred": y_pred,
248
- "cv_mean": round(float(scores.mean()), 4),
249
- "cv_std": round(float(scores.std()), 4),
250
- "cv_scores": [round(float(s), 4) for s in scores]}
251
- model.fit(X_train, y_train)
252
- y_pred = model.predict(X_test)
253
- return {"model": model, "y_pred": y_pred}
254
- '''
255
-
256
- GRADIENT_BOOSTING_CLASSIFIER = '''from sklearn.ensemble import GradientBoostingClassifier
257
- from sklearn.model_selection import cross_val_score
258
-
259
- def run(
260
- X_train, X_test, y_train,
261
- n_estimators: int = 100, learning_rate: float = 0.1, random_state: int = 42,
262
- cross_validation: bool = False, cv_folds: int = 5,
263
- ) -> dict:
264
- model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=random_state)
265
- if cross_validation:
266
- scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="accuracy")
267
- model.fit(X_train, y_train)
268
- y_pred = model.predict(X_test)
269
- return {"model": model, "y_pred": y_pred,
270
- "cv_mean": round(float(scores.mean()), 4),
271
- "cv_std": round(float(scores.std()), 4),
272
- "cv_scores": [round(float(s), 4) for s in scores]}
273
- model.fit(X_train, y_train)
274
- y_pred = model.predict(X_test)
275
- return {"model": model, "y_pred": y_pred}
276
- '''
277
-
278
- DECISION_TREE_CLASSIFIER = '''from sklearn.tree import DecisionTreeClassifier
279
- from sklearn.model_selection import cross_val_score
280
-
281
- def run(
282
- X_train, X_test, y_train,
283
- max_depth: int = 0, random_state: int = 42,
284
- cross_validation: bool = False, cv_folds: int = 5,
285
- ) -> dict:
286
- md = max_depth if max_depth > 0 else None
287
- model = DecisionTreeClassifier(max_depth=md, random_state=random_state)
288
- if cross_validation:
289
- scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="accuracy")
290
- model.fit(X_train, y_train)
291
- y_pred = model.predict(X_test)
292
- return {"model": model, "y_pred": y_pred,
293
- "cv_mean": round(float(scores.mean()), 4),
294
- "cv_std": round(float(scores.std()), 4),
295
- "cv_scores": [round(float(s), 4) for s in scores]}
296
- model.fit(X_train, y_train)
297
- y_pred = model.predict(X_test)
298
- return {"model": model, "y_pred": y_pred}
299
- '''
300
-
301
- SVM_CLASSIFIER = '''from sklearn.svm import SVC
302
- from sklearn.model_selection import cross_val_score
303
-
304
- def run(
305
- X_train, X_test, y_train,
306
- C: float = 1.0, kernel: str = "rbf",
307
- cross_validation: bool = False, cv_folds: int = 5,
308
- ) -> dict:
309
- model = SVC(C=C, kernel=kernel)
310
- if cross_validation:
311
- scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="accuracy")
312
- model.fit(X_train, y_train)
313
- y_pred = model.predict(X_test)
314
- return {"model": model, "y_pred": y_pred,
315
- "cv_mean": round(float(scores.mean()), 4),
316
- "cv_std": round(float(scores.std()), 4),
317
- "cv_scores": [round(float(s), 4) for s in scores]}
318
- model.fit(X_train, y_train)
319
- y_pred = model.predict(X_test)
320
- return {"model": model, "y_pred": y_pred}
321
- '''
322
-
323
- KNN_CLASSIFIER = '''from sklearn.neighbors import KNeighborsClassifier
324
- from sklearn.model_selection import cross_val_score
325
-
326
- def run(
327
- X_train, X_test, y_train,
328
- n_neighbors: int = 5,
329
- cross_validation: bool = False, cv_folds: int = 5,
330
- ) -> dict:
331
- model = KNeighborsClassifier(n_neighbors=n_neighbors)
332
- if cross_validation:
333
- scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="accuracy")
334
- model.fit(X_train, y_train)
335
- y_pred = model.predict(X_test)
336
- return {"model": model, "y_pred": y_pred,
337
- "cv_mean": round(float(scores.mean()), 4),
338
- "cv_std": round(float(scores.std()), 4),
339
- "cv_scores": [round(float(s), 4) for s in scores]}
340
- model.fit(X_train, y_train)
341
- y_pred = model.predict(X_test)
342
- return {"model": model, "y_pred": y_pred}
343
- '''
344
-
345
- # ── Regressor models ──────────────────────────────────────────────────────────
346
-
347
- LINEAR_REGRESSION = '''from sklearn.linear_model import LinearRegression
348
- from sklearn.model_selection import cross_val_score
349
-
350
- def run(
351
- X_train, X_test, y_train,
352
- cross_validation: bool = False, cv_folds: int = 5,
353
- ) -> dict:
354
- model = LinearRegression()
355
- if cross_validation:
356
- scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="r2")
357
- model.fit(X_train, y_train)
358
- y_pred = model.predict(X_test)
359
- return {"model": model, "y_pred": y_pred,
360
- "cv_mean": round(float(scores.mean()), 4),
361
- "cv_std": round(float(scores.std()), 4),
362
- "cv_scores": [round(float(s), 4) for s in scores]}
363
- model.fit(X_train, y_train)
364
- y_pred = model.predict(X_test)
365
- return {"model": model, "y_pred": y_pred}
366
- '''
367
-
368
- RANDOM_FOREST_REGRESSOR = '''from sklearn.ensemble import RandomForestRegressor
369
- from sklearn.model_selection import cross_val_score
370
-
371
- def run(
372
- X_train, X_test, y_train,
373
- n_estimators: int = 100, max_depth: int = 0, random_state: int = 42,
374
- cross_validation: bool = False, cv_folds: int = 5,
375
- ) -> dict:
376
- md = max_depth if max_depth > 0 else None
377
- model = RandomForestRegressor(n_estimators=n_estimators, max_depth=md, random_state=random_state)
378
- if cross_validation:
379
- scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="r2")
380
- model.fit(X_train, y_train)
381
- y_pred = model.predict(X_test)
382
- return {"model": model, "y_pred": y_pred,
383
- "cv_mean": round(float(scores.mean()), 4),
384
- "cv_std": round(float(scores.std()), 4),
385
- "cv_scores": [round(float(s), 4) for s in scores]}
386
- model.fit(X_train, y_train)
387
- y_pred = model.predict(X_test)
388
- return {"model": model, "y_pred": y_pred}
389
- '''
390
-
391
- GRADIENT_BOOSTING_REGRESSOR = '''from sklearn.ensemble import GradientBoostingRegressor
392
- from sklearn.model_selection import cross_val_score
393
-
394
- def run(
395
- X_train, X_test, y_train,
396
- n_estimators: int = 100, learning_rate: float = 0.1, random_state: int = 42,
397
- cross_validation: bool = False, cv_folds: int = 5,
398
- ) -> dict:
399
- model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate, random_state=random_state)
400
- if cross_validation:
401
- scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="r2")
402
- model.fit(X_train, y_train)
403
- y_pred = model.predict(X_test)
404
- return {"model": model, "y_pred": y_pred,
405
- "cv_mean": round(float(scores.mean()), 4),
406
- "cv_std": round(float(scores.std()), 4),
407
- "cv_scores": [round(float(s), 4) for s in scores]}
408
- model.fit(X_train, y_train)
409
- y_pred = model.predict(X_test)
410
- return {"model": model, "y_pred": y_pred}
411
- '''
412
-
413
- DECISION_TREE_REGRESSOR = '''from sklearn.tree import DecisionTreeRegressor
414
- from sklearn.model_selection import cross_val_score
415
-
416
- def run(
417
- X_train, X_test, y_train,
418
- max_depth: int = 0, random_state: int = 42,
419
- cross_validation: bool = False, cv_folds: int = 5,
420
- ) -> dict:
421
- md = max_depth if max_depth > 0 else None
422
- model = DecisionTreeRegressor(max_depth=md, random_state=random_state)
423
- if cross_validation:
424
- scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="r2")
425
- model.fit(X_train, y_train)
426
- y_pred = model.predict(X_test)
427
- return {"model": model, "y_pred": y_pred,
428
- "cv_mean": round(float(scores.mean()), 4),
429
- "cv_std": round(float(scores.std()), 4),
430
- "cv_scores": [round(float(s), 4) for s in scores]}
431
- model.fit(X_train, y_train)
432
- y_pred = model.predict(X_test)
433
- return {"model": model, "y_pred": y_pred}
434
- '''
435
-
436
- SVM_REGRESSOR = '''from sklearn.svm import SVR
437
- from sklearn.model_selection import cross_val_score
438
-
439
- def run(
440
- X_train, X_test, y_train,
441
- C: float = 1.0, kernel: str = "rbf",
442
- cross_validation: bool = False, cv_folds: int = 5,
443
- ) -> dict:
444
- model = SVR(C=C, kernel=kernel)
445
- if cross_validation:
446
- scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="r2")
447
- model.fit(X_train, y_train)
448
- y_pred = model.predict(X_test)
449
- return {"model": model, "y_pred": y_pred,
450
- "cv_mean": round(float(scores.mean()), 4),
451
- "cv_std": round(float(scores.std()), 4),
452
- "cv_scores": [round(float(s), 4) for s in scores]}
453
- model.fit(X_train, y_train)
454
- y_pred = model.predict(X_test)
455
- return {"model": model, "y_pred": y_pred}
456
- '''
457
-
458
- KNN_REGRESSOR = '''from sklearn.neighbors import KNeighborsRegressor
459
- from sklearn.model_selection import cross_val_score
460
-
461
- def run(
462
- X_train, X_test, y_train,
463
- n_neighbors: int = 5,
464
- cross_validation: bool = False, cv_folds: int = 5,
465
- ) -> dict:
466
- model = KNeighborsRegressor(n_neighbors=n_neighbors)
467
- if cross_validation:
468
- scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="r2")
469
- model.fit(X_train, y_train)
470
- y_pred = model.predict(X_test)
471
- return {"model": model, "y_pred": y_pred,
472
- "cv_mean": round(float(scores.mean()), 4),
473
- "cv_std": round(float(scores.std()), 4),
474
- "cv_scores": [round(float(s), 4) for s in scores]}
475
- model.fit(X_train, y_train)
476
- y_pred = model.predict(X_test)
477
- return {"model": model, "y_pred": y_pred}
478
- '''
479
-
480
- # ── Evaluation ────────────────────────────────────────────────────────────────
481
-
482
- ACCURACY = '''from sklearn.metrics import accuracy_score
483
-
484
- def run(y_test, y_pred) -> dict:
485
- return {"accuracy": float(accuracy_score(y_test, y_pred))}
486
- '''
487
-
488
- CLASSIFICATION_REPORT = '''from sklearn.metrics import (
489
- accuracy_score, f1_score, precision_score, recall_score,
490
- confusion_matrix, classification_report,
491
- )
492
-
493
- def run(y_test, y_pred) -> dict:
494
- return {
495
- "accuracy": round(float(accuracy_score(y_test, y_pred)), 4),
496
- "f1_score": round(float(f1_score(y_test, y_pred, average="weighted", zero_division=0)), 4),
497
- "precision": round(float(precision_score(y_test, y_pred, average="weighted", zero_division=0)), 4),
498
- "recall": round(float(recall_score(y_test, y_pred, average="weighted", zero_division=0)), 4),
499
- "confusion_matrix": confusion_matrix(y_test, y_pred).tolist(),
500
- "class_report": classification_report(y_test, y_pred, output_dict=True, zero_division=0),
501
- }
502
- '''
503
-
504
- VALIDATION_REPORT = '''from sklearn.model_selection import cross_validate
505
- import numpy as np
506
-
507
- def run(model, X_train, y_train, cv_folds: int = 5) -> dict:
508
- """Run K-Fold cross validation and generate a robust validation report."""
509
- # Using macro averages for multiclass compatibility
510
- scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
511
-
512
- # Catching cases where model might not be a classifier (though typically it will be here)
513
- try:
514
- scores = cross_validate(model, X_train, y_train, cv=cv_folds, scoring=scoring)
515
- return {
516
- "cv_folds": cv_folds,
517
- "mean_accuracy": round(float(np.mean(scores['test_accuracy'])), 4),
518
- "std_accuracy": round(float(np.std(scores['test_accuracy'])), 4),
519
- "mean_precision": round(float(np.mean(scores['test_precision_macro'])), 4),
520
- "mean_recall": round(float(np.mean(scores['test_recall_macro'])), 4),
521
- "mean_f1": round(float(np.mean(scores['test_f1_macro'])), 4),
522
- }
523
- except Exception as e:
524
- # Fallback for regressors
525
- scores = cross_validate(model, X_train, y_train, cv=cv_folds, scoring=['r2', 'neg_mean_squared_error'])
526
- return {
527
- "cv_folds": cv_folds,
528
- "mean_r2": round(float(np.mean(scores['test_r2'])), 4),
529
- "mean_mse": round(float(np.mean(-scores['test_neg_mean_squared_error'])), 4),
530
- }
531
- '''
532
-
533
- REGRESSION_METRICS = '''from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
534
- import numpy as np
535
-
536
- def run(y_test, y_pred) -> dict:
537
- mse = float(mean_squared_error(y_test, y_pred))
538
- return {
539
- "mse": round(mse, 4),
540
- "rmse": round(float(np.sqrt(mse)), 4),
541
- "mae": round(float(mean_absolute_error(y_test, y_pred)), 4),
542
- "r2": round(float(r2_score(y_test, y_pred)), 4),
543
- }
544
- '''
545
-
546
- FEATURE_IMPORTANCE = '''import numpy as np
547
-
548
- def run(model, X_train) -> dict:
549
- """Extract feature importances from any tree-based model."""
550
- if not hasattr(model, "feature_importances_"):
551
- raise ValueError(f"{type(model).__name__} has no feature_importances_. Use RF, GBM, or DT.")
552
- imps = model.feature_importances_
553
- names = list(X_train.columns) if hasattr(X_train, "columns") else [f"f{i}" for i in range(len(imps))]
554
- pairs = sorted(zip(imps, names), reverse=True)
555
- top = min(20, len(pairs))
556
- return {
557
- "feature_importances": {name: round(float(imp), 6) for imp, name in pairs[:top]},
558
- "top_feature": pairs[0][1],
559
- "top_importance": round(float(pairs[0][0]), 6),
560
- }
561
- '''
562
-
563
- AUTO_ML = '''from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
564
- from sklearn.linear_model import LogisticRegression
565
- from sklearn.metrics import accuracy_score
566
-
567
- def run(X_train, X_test, y_train, y_test) -> dict:
568
- candidates = {
569
- "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
570
- "Gradient Boosting": GradientBoostingClassifier(random_state=42),
571
- "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
572
- }
573
- scores = {}
574
- best_name, best_score, best_model, best_pred = "", -1.0, None, None
575
- for name, mdl in candidates.items():
576
- mdl.fit(X_train, y_train)
577
- preds = mdl.predict(X_test)
578
- score = float(accuracy_score(y_test, preds))
579
- scores[name] = round(score, 4)
580
- if score > best_score:
581
- best_score, best_name, best_model, best_pred = score, name, mdl, preds
582
- return {
583
- "model": best_model,
584
- "y_pred": best_pred,
585
- "best_algorithm": best_name,
586
- "accuracy": round(best_score, 4),
587
- "all_scores": scores,
588
- }
589
- '''
590
-
591
-
592
- EDA_HISTOGRAM = '''import pandas as pd
593
- import numpy as np
594
- from typing import Annotated
595
-
596
- def run(data, column: Annotated[str, "column"] = "price", bins: int = 20) -> dict:
597
- """Distribution histogram for a single numeric column."""
598
- series = data[column].dropna()
599
- counts, edges = np.histogram(series, bins=bins)
600
- return {
601
- "histogram": {
602
- "column": column,
603
- "counts": counts.tolist(),
604
- "bin_edges": [round(float(e), 2) for e in edges.tolist()],
605
- "mean": round(float(series.mean()), 2),
606
- "median": round(float(series.median()), 2),
607
- "std": round(float(series.std()), 2),
608
- "min": round(float(series.min()), 2),
609
- "max": round(float(series.max()), 2),
610
- }
611
- }
612
- '''
613
-
614
- EDA_CORRELATION = '''import pandas as pd
615
- import numpy as np
616
-
617
- def run(data) -> dict:
618
- """Full correlation matrix for all numeric columns."""
619
- num_df = data.select_dtypes(include=[np.number])
620
- corr = num_df.corr().round(3)
621
- matrix = []
622
- for row in corr.values.tolist():
623
- matrix.append([None if (v != v) else round(v, 3) for v in row])
624
- return {
625
- "correlation_matrix": {
626
- "columns": list(corr.columns),
627
- "matrix": matrix,
628
- }
629
- }
630
- '''
631
-
632
- EDA_VALUE_COUNTS = '''import pandas as pd
633
- from typing import Annotated
634
-
635
- def run(data, column: Annotated[str, "column"] = "mainroad", top_n: int = 10) -> dict:
636
- """Value counts for a categorical column."""
637
- vc = data[column].value_counts().head(top_n)
638
- return {
639
- "value_counts": {
640
- "column": column,
641
- "labels": [str(l) for l in vc.index.tolist()],
642
- "counts": vc.values.tolist(),
643
- "total": int(len(data)),
644
- }
645
- }
646
- '''
647
-
648
- EDA_BOX_PLOT = '''import pandas as pd
649
- import numpy as np
650
- from typing import Annotated
651
-
652
- def run(data, column: Annotated[str, "column"] = "price") -> dict:
653
- """Box plot statistics (min, Q1, median, Q3, max, outliers) for a column."""
654
- s = data[column].dropna()
655
- q1, q3 = float(s.quantile(0.25)), float(s.quantile(0.75))
656
- iqr = q3 - q1
657
- outliers = s[(s < q1 - 1.5*iqr) | (s > q3 + 1.5*iqr)]
658
- return {
659
- "box_plot": {
660
- "column": column,
661
- "min": round(float(s.min()), 2),
662
- "q1": round(q1, 2),
663
- "median": round(float(s.median()), 2),
664
- "q3": round(q3, 2),
665
- "max": round(float(s.max()), 2),
666
- "mean": round(float(s.mean()), 2),
667
- "iqr": round(iqr, 2),
668
- "outlier_count": int(len(outliers)),
669
- "whisker_low": round(float(max(s.min(), q1 - 1.5*iqr)), 2),
670
- "whisker_high": round(float(min(s.max(), q3 + 1.5*iqr)), 2),
671
- }
672
- }
673
- '''
674
-
675
- PREDICT = '''import pandas as pd
676
- import json
677
-
678
- def run(model, feature_json: str = "{}") -> dict:
679
- """Predict from a trained model. Enter feature values as a JSON string.
680
- Example: {"area": 7000, "bedrooms": 3, "bathrooms": 2, "stories": 2,
681
- "mainroad": 1, "guestroom": 0, "basement": 0,
682
- "hotwaterheating": 0, "airconditioning": 1, "parking": 2,
683
- "prefarea": 0, "furnishingstatus": 1}
684
- """
685
- features = json.loads(feature_json) if feature_json.strip() else {}
686
- if not features:
687
- raise ValueError("Enter feature values as JSON in the feature_json field")
688
- df = pd.DataFrame([features])
689
- prediction = model.predict(df)
690
- result = prediction[0]
691
- scalar = float(result) if hasattr(result, "__float__") else str(result)
692
- return {
693
- "prediction": scalar,
694
- "features_used": features,
695
- }
696
- '''
697
-
698
-
699
- TEMPLATES: list[dict] = [
700
- # Data
701
- {"id": "csv_loader", "label": "CSV Loader", "category": "Data", "code": CSV_LOADER},
702
- {"id": "csv_exporter", "label": "CSV Exporter", "category": "Data", "code": CSV_EXPORTER},
703
- {"id": "eda", "label": "EDA", "category": "Data", "code": EDA},
704
- {"id": "eda_histogram", "label": "Histogram", "category": "Visualize", "code": EDA_HISTOGRAM},
705
- {"id": "eda_correlation", "label": "Correlation Matrix", "category": "Visualize", "code": EDA_CORRELATION},
706
- {"id": "eda_value_counts", "label": "Value Counts", "category": "Visualize", "code": EDA_VALUE_COUNTS},
707
- {"id": "eda_box_plot", "label": "Box Plot", "category": "Visualize", "code": EDA_BOX_PLOT},
708
- {"id": "predict", "label": "Predict", "category": "Visualize", "code": PREDICT},
709
- # Preprocessing
710
- {"id": "data_cleaning", "label": "Data Cleaning", "category": "Preprocessing", "code": DATA_CLEANING},
711
- {"id": "label_encoder", "label": "Label Encoder", "category": "Preprocessing", "code": LABEL_ENCODER},
712
- {"id": "one_hot_encoder", "label": "One-Hot Encoder", "category": "Preprocessing", "code": ONE_HOT_ENCODER},
713
- {"id": "train_test_split", "label": "Train/Test Split", "category": "Preprocessing", "code": TRAIN_TEST_SPLIT},
714
- {"id": "standard_scaler", "label": "Standard Scaler", "category": "Preprocessing", "code": STANDARD_SCALER},
715
- {"id": "min_max_scaler", "label": "Min-Max Scaler", "category": "Preprocessing", "code": MIN_MAX_SCALER},
716
- {"id": "pca", "label": "PCA", "category": "Preprocessing", "code": PCA},
717
- # Classifiers
718
- {"id": "logistic_regression", "label": "Logistic Regression", "category": "Classifiers", "code": LOGISTIC_REGRESSION},
719
- {"id": "random_forest_classifier", "label": "RF Classifier", "category": "Classifiers", "code": RANDOM_FOREST_CLASSIFIER},
720
- {"id": "gradient_boosting_classifier","label": "GB Classifier", "category": "Classifiers", "code": GRADIENT_BOOSTING_CLASSIFIER},
721
- {"id": "decision_tree_classifier", "label": "DT Classifier", "category": "Classifiers", "code": DECISION_TREE_CLASSIFIER},
722
- {"id": "svm_classifier", "label": "SVM Classifier", "category": "Classifiers", "code": SVM_CLASSIFIER},
723
- {"id": "knn_classifier", "label": "KNN Classifier", "category": "Classifiers", "code": KNN_CLASSIFIER},
724
- # Regressors
725
- {"id": "linear_regression", "label": "Linear Regression", "category": "Regressors", "code": LINEAR_REGRESSION},
726
- {"id": "random_forest_regressor", "label": "RF Regressor", "category": "Regressors", "code": RANDOM_FOREST_REGRESSOR},
727
- {"id": "gradient_boosting_regressor", "label": "GB Regressor", "category": "Regressors", "code": GRADIENT_BOOSTING_REGRESSOR},
728
- {"id": "decision_tree_regressor", "label": "DT Regressor", "category": "Regressors", "code": DECISION_TREE_REGRESSOR},
729
- {"id": "svm_regressor", "label": "SVM Regressor", "category": "Regressors", "code": SVM_REGRESSOR},
730
- {"id": "knn_regressor", "label": "KNN Regressor", "category": "Regressors", "code": KNN_REGRESSOR},
731
- # Evaluation
732
- {"id": "accuracy", "label": "Accuracy", "category": "Evaluation", "code": ACCURACY},
733
- {"id": "classification_report", "label": "Classification Report", "category": "Evaluation", "code": CLASSIFICATION_REPORT},
734
- {"id": "validation_report", "label": "Validation Report", "category": "Evaluation", "code": VALIDATION_REPORT},
735
- {"id": "regression_metrics", "label": "Regression Metrics", "category": "Evaluation", "code": REGRESSION_METRICS},
736
- {"id": "feature_importance", "label": "Feature Importance", "category": "Evaluation", "code": FEATURE_IMPORTANCE},
737
- {"id": "auto_ml", "label": "AutoML", "category": "Evaluation", "code": AUTO_ML},
738
- ]
739
-
740
-
741
- def get_template(template_id: str) -> dict | None:
742
- for t in TEMPLATES:
743
- if t["id"] == template_id:
744
- return t
745
- return None
1
+ """Pre-built node templates. Each template is just Python source defining `def run(...)`."""
2
+
3
+ # ── Data ──────────────────────────────────────────────────────────────────────
4
+
5
+ CSV_LOADER = '''import pandas as pd
6
+ import os
7
+ from typing import Annotated
8
+
9
+ def run(file_path: Annotated[str, "file"] = "data.csv") -> dict:
10
+ """Loads a CSV file. Looks in the storage directory if a relative path is provided."""
11
+ # Resolve path
12
+ if not os.path.isabs(file_path):
13
+ storage_dir = os.environ.get("M8FLOW_UPLOAD_DIR") or os.path.abspath("uploads")
14
+ potential_path = os.path.join(storage_dir, file_path)
15
+ if os.path.exists(potential_path):
16
+ file_path = potential_path
17
+
18
+ df = pd.read_csv(file_path)
19
+ return {"data": df}
20
+ '''
21
+
22
+ CSV_EXPORTER = '''import pandas as pd
23
+ import numpy as np
24
+ import os
25
+ from typing import Annotated
26
+
27
+ def run(
28
+ data: Annotated[pd.DataFrame, "dataframe"] = None,
29
+ y_pred: Annotated[np.ndarray, "array"] = None,
30
+ y_test: Annotated[np.ndarray, "array"] = None,
31
+ file_path: Annotated[str, "file"] = "output.csv"
32
+ ) -> dict:
33
+ """Exports data to a CSV file. Supports DataFrames or Model predictions."""
34
+ # 1. Resolve data
35
+ if data is None:
36
+ if y_pred is not None and y_test is not None:
37
+ data = pd.DataFrame({"y_test": np.asarray(y_test).ravel(), "y_pred": np.asarray(y_pred).ravel()})
38
+ elif y_pred is not None:
39
+ data = pd.DataFrame({"predictions": np.asarray(y_pred).ravel()})
40
+ else:
41
+ return {"status": "error", "error": "No data or predictions provided to export"}
42
+
43
+ if not isinstance(data, pd.DataFrame):
44
+ try:
45
+ data = pd.DataFrame(data)
46
+ except Exception as e:
47
+ return {"status": "error", "error": f"Failed to convert input to DataFrame: {e}"}
48
+
49
+ # 2. Resolve output path
50
+ # If it's just a filename, try to save it in the storage directory if possible
51
+ # We look for M8FLOW_UPLOAD_DIR or default 'uploads'
52
+ storage_dir = os.environ.get("M8FLOW_UPLOAD_DIR") or os.path.abspath("uploads")
53
+ if not os.path.isabs(file_path):
54
+ os.makedirs(storage_dir, exist_ok=True)
55
+ full_path = os.path.join(storage_dir, file_path)
56
+ else:
57
+ full_path = file_path
58
+
59
+ data.to_csv(full_path, index=False)
60
+ return {"status": "saved", "path": full_path, "rows": len(data)}
61
+ '''
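Both CSV nodes resolve relative paths against a storage directory. The resolution logic in isolation (standard library only; the M8FLOW_UPLOAD_DIR value below is illustrative):

import os

os.environ["M8FLOW_UPLOAD_DIR"] = "/tmp/m8flow_uploads"  # illustrative value, not a real default
storage_dir = os.environ.get("M8FLOW_UPLOAD_DIR") or os.path.abspath("uploads")
file_path = "output.csv"
full_path = file_path if os.path.isabs(file_path) else os.path.join(storage_dir, file_path)
print(full_path)  # /tmp/m8flow_uploads/output.csv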
62
+
63
+ # ── EDA ───────────────────────────────────────────────────────────────────────
64
+
65
+ EDA = '''import pandas as pd
66
+ import numpy as np
67
+
68
+ def run(data) -> dict:
69
+ """Exploratory Data Analysis. Always passes 'data' through so it can be chained."""
70
+ df = data.copy() if hasattr(data, "copy") else pd.DataFrame(data)
71
+
72
+ shape = list(df.shape)
73
+ dtypes = {col: str(dt) for col, dt in df.dtypes.items()}
74
+
75
+ # Missing values
76
+ miss = df.isnull().sum()
77
+ missing = {
78
+ col: {"count": int(miss[col]), "pct": round(float(miss[col] / len(df) * 100), 2)}
79
+ for col in df.columns if miss[col] > 0
80
+ }
81
+
82
+ # Numeric summary
83
+ num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
84
+ numeric_summary = {}
85
+ for col in num_cols:
86
+ s = df[col]
87
+ numeric_summary[col] = {
88
+ "mean": round(float(s.mean()), 4),
89
+ "median": round(float(s.median()), 4),
90
+ "std": round(float(s.std()), 4),
91
+ "min": round(float(s.min()), 4),
92
+ "max": round(float(s.max()), 4),
93
+ "skew": round(float(s.skew()), 4),
94
+ }
95
+
96
+ # Categorical summary
97
+ cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
98
+ categorical_summary = {}
99
+ for col in cat_cols[:8]:
100
+ vc = df[col].value_counts()
101
+ categorical_summary[col] = {
102
+ "unique": int(df[col].nunique()),
103
+ "top": str(vc.index[0]) if len(vc) else "",
104
+ "top_count": int(vc.iloc[0]) if len(vc) else 0,
105
+ }
106
+
107
+ # Correlations (top 10 pairs by absolute value)
108
+ correlations = {}
109
+ if len(num_cols) >= 2:
110
+ corr = df[num_cols].corr()
111
+ pairs = []
112
+ for i in range(len(num_cols)):
113
+ for j in range(i + 1, len(num_cols)):
114
+ a, b = num_cols[i], num_cols[j]
115
+ val = float(corr.loc[a, b])
116
+ if not (val != val): # skip NaN
117
+ pairs.append((abs(val), a, b, val))
118
+ pairs.sort(reverse=True)
119
+ correlations = {f"{a}—{b}": round(v, 4) for _, a, b, v in pairs[:10]}
120
+
121
+ return {
122
+ "data": df, # pass-through so chained nodes get the DataFrame
123
+ "shape": shape,
124
+ "dtypes": dtypes,
125
+ "missing": missing,
126
+ "numeric_summary": numeric_summary,
127
+ "categorical_summary": categorical_summary,
128
+ "correlations": correlations,
129
+ }
130
+ '''
131
+
132
+ # ── Preprocessing ─────────────────────────────────────────────────────────────
133
+
134
+ SMART_OUTLIER_REMOVER = '''import pandas as pd
135
+ import numpy as np
136
+
137
+ def run(
138
+ data,
139
+ multiplier: float = 1.5,
140
+ method: str = "drop", # choices: drop | clip
141
+ ) -> dict:
142
+ """Remove or clip outliers using the IQR method across all numeric columns."""
143
+ df = data.copy()
144
+ num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
145
+ removed_per_col = {}
146
+ for col in num_cols:
147
+ q1 = df[col].quantile(0.25)
148
+ q3 = df[col].quantile(0.75)
149
+ iqr = q3 - q1
150
+ lo = q1 - multiplier * iqr
151
+ hi = q3 + multiplier * iqr
152
+ outliers = ((df[col] < lo) | (df[col] > hi)).sum()
153
+ removed_per_col[col] = int(outliers)
154
+ if method == "clip":
155
+ df[col] = df[col].clip(lower=lo, upper=hi)
156
+ else: # drop
157
+ df = df[(df[col] >= lo) & (df[col] <= hi)]
158
+ return {
159
+ "data": df,
160
+ "rows_before": len(data),
161
+ "rows_after": len(df),
162
+ "outliers_removed_per_col": removed_per_col,
163
+ }
164
+ '''
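For intuition, the IQR arithmetic this node applies to each numeric column, on toy values (pandas only):

import pandas as pd

s = pd.Series([1, 2, 3, 4, 100])                 # 100 is the obvious outlier
q1, q3 = s.quantile(0.25), s.quantile(0.75)      # 2.0 and 4.0
iqr = q3 - q1                                    # 2.0
lo, hi = q1 - 1.5 * iqr, q3 + 1.5 * iqr          # -1.0 and 7.0
print(s[(s < lo) | (s > hi)].tolist())           # [100]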
165
+
166
+ ADVANCED_IMPUTER = '''import pandas as pd
167
+ import numpy as np
168
+
169
+ def run(
170
+ data,
171
+ strategy: str = "knn", # choices: knn | mice
172
+ n_neighbors: int = 5,
173
+ ) -> dict:
174
+ """Smart missing-value imputation: KNNImputer or IterativeImputer (MICE)."""
175
+ from sklearn.impute import KNNImputer
176
+ df = data.copy()
177
+ num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
178
+ cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
179
+ missing_before = int(df[num_cols].isnull().sum().sum())
180
+ if strategy == "mice":
181
+ from sklearn.experimental import enable_iterative_imputer # noqa: F401
182
+ from sklearn.impute import IterativeImputer
183
+ imputer = IterativeImputer(max_iter=10, random_state=42)
184
+ else:
185
+ imputer = KNNImputer(n_neighbors=n_neighbors)
186
+ df[num_cols] = imputer.fit_transform(df[num_cols])
187
+ # Fill remaining categorical columns with mode
188
+ for col in cat_cols:
189
+ if df[col].isnull().any():
190
+ df[col] = df[col].fillna(df[col].mode()[0])
191
+ return {
192
+ "data": df,
193
+ "strategy": strategy,
194
+ "missing_before": missing_before,
195
+ "missing_after": int(df[num_cols].isnull().sum().sum()),
196
+ }
197
+ '''
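A minimal sketch of what the default "knn" strategy does under the hood, assuming scikit-learn is installed; the toy frame is illustrative:

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

df = pd.DataFrame({"x": [1.0, 2.0, np.nan, 4.0], "y": [10.0, 20.0, 30.0, 40.0]})
# The missing x becomes 3.0: the mean of its two nearest rows, measured on y
print(KNNImputer(n_neighbors=2).fit_transform(df))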
198
+
199
+ SKEWNESS_FIXER = '''import pandas as pd
200
+ import numpy as np
201
+
202
+ def run(
203
+ data,
204
+ threshold: float = 0.75,
205
+ ) -> dict:
206
+ """Detect skewed numeric columns and apply log1p or sqrt transformation."""
207
+ df = data.copy()
208
+ num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
209
+ transformed = {}
210
+ for col in num_cols:
211
+ skew = float(df[col].skew())
212
+ if abs(skew) > threshold:
213
+ if df[col].min() >= 0:
214
+ if skew > 2: # heavy skew → log1p
215
+ df[col] = np.log1p(df[col])
216
+ transformed[col] = {"skew_before": round(skew, 4), "transform": "log1p"}
217
+ else: # moderate skew → sqrt
218
+ df[col] = np.sqrt(df[col])
219
+ transformed[col] = {"skew_before": round(skew, 4), "transform": "sqrt"}
220
+ else: # column has negatives → shift then log1p
221
+ shift = abs(df[col].min()) + 1
222
+ df[col] = np.log1p(df[col] + shift)
223
+ transformed[col] = {"skew_before": round(skew, 4), "transform": f"log1p(x+{round(shift,2)})"}
224
+ return {
225
+ "data": df,
226
+ "columns_transformed": transformed,
227
+ "n_transformed": len(transformed),
228
+ }
229
+ '''
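To see why log1p is the heavy-skew choice, compare skewness before and after on a lognormal sample (numpy/pandas only; the seed is arbitrary):

import numpy as np
import pandas as pd

s = pd.Series(np.random.default_rng(0).lognormal(size=1000))
print(round(float(s.skew()), 2))            # strongly right-skewed, well above the 0.75 threshold
print(round(float(np.log1p(s).skew()), 2))  # far less skewed after the transform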
230
+
231
+ HIGH_CARDINALITY_ENCODER = '''import pandas as pd
232
+ import numpy as np
233
+ from typing import Annotated
234
+
235
+ def run(
236
+ data,
237
+ column: Annotated[str, "column"] = "city",
238
+ target: Annotated[str, "column"] = "target",
239
+ smoothing: float = 1.0,
240
+ ) -> dict:
241
+ """Target Encoding for high-cardinality columns (e.g. City, ZipCode).
242
+ Uses smoothed means to prevent overfitting on rare categories.
243
+ Supports comma-separated lists of columns."""
244
+ df = data.copy()
245
+ if target not in df.columns:
246
+ raise ValueError(f"Target column \'{target}\' not found in DataFrame.")
247
+
248
+ global_mean = df[target].mean()
249
+ cols_to_encode = [c.strip() for c in column.split(",") if c.strip()]
250
+ stats_info = {}
251
+
252
+ for col in cols_to_encode:
253
+ if col not in df.columns:
254
+ continue
255
+ stats = df.groupby(col)[target].agg(["mean", "count"])
256
+ # Smoothed target encoding formula
257
+ stats["smoothed"] = (
258
+ (stats["count"] * stats["mean"] + smoothing * global_mean)
259
+ / (stats["count"] + smoothing)
260
+ )
261
+ df[col] = df[col].map(stats["smoothed"]).fillna(global_mean)
262
+ stats_info[col] = int(len(stats))
263
+
264
+ return {
265
+ "data": df,
266
+ "encoded_columns": cols_to_encode,
267
+ "target_column": target,
268
+ "categories_encoded_per_col": stats_info,
269
+ "global_mean": round(float(global_mean), 6),
270
+ }
271
+ '''
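The smoothed mean interpolates between a category's own target mean and the global mean, weighted by how often the category occurs; worked numbers:

count, cat_mean, global_mean, smoothing = 3, 1.0, 0.4, 1.0
print((count * cat_mean + smoothing * global_mean) / (count + smoothing))  # 0.85: a common category keeps most of its own mean
print((1 * cat_mean + smoothing * global_mean) / (1 + smoothing))          # 0.7: a one-off category is pulled harder toward 0.4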
272
+
273
+ FEATURE_SCALER_ROBUST = '''import pandas as pd
274
+ import numpy as np
275
+
276
+ def run(data) -> dict:
277
+ """Scale numeric features with RobustScaler (median + IQR).
278
+ Unlike StandardScaler, it is not distorted by extreme outliers."""
279
+ from sklearn.preprocessing import RobustScaler
280
+ df = data.copy()
281
+ num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
282
+ scaler = RobustScaler()
283
+ df[num_cols] = scaler.fit_transform(df[num_cols])
284
+ return {
285
+ "data": df,
286
+ "scaled_columns": num_cols,
287
+ "n_scaled": len(num_cols),
288
+ }
289
+ '''
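A quick contrast of the two scalers on data with one extreme value, assuming scikit-learn; note how the outlier dominates the standard-scaled output:

import numpy as np
from sklearn.preprocessing import RobustScaler, StandardScaler

X = np.array([[1.0], [2.0], [3.0], [4.0], [5.0], [6.0], [7.0], [1000.0]])
print(StandardScaler().fit_transform(X).ravel())  # inliers squashed to roughly 0.003 apart
print(RobustScaler().fit_transform(X).ravel())    # median/IQR scaling keeps inlier spacing near 0.29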
290
+
291
+ MULTICOLLINEARITY_FILTER = '''import pandas as pd
292
+ import numpy as np
293
+
294
+ def run(
295
+ data,
296
+ threshold: float = 0.90,
297
+ ) -> dict:
298
+ """Drop features whose absolute Pearson correlation with any other feature exceeds threshold."""
299
+ df = data.copy()
300
+ num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
301
+ corr_matrix = df[num_cols].corr().abs()
302
+ upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
303
+ to_drop = [col for col in upper.columns if any(upper[col] > threshold)]
304
+ highly_correlated_pairs = []
305
+ for col in to_drop:
306
+ partners = upper.index[upper[col] > threshold].tolist()
307
+ for partner in partners:
308
+ highly_correlated_pairs.append({
309
+ "dropped": col,
310
+ "correlated_with": partner,
311
+ "correlation": round(float(upper.loc[partner, col]), 4),
312
+ })
313
+ df = df.drop(columns=to_drop)
314
+ return {
315
+ "data": df,
316
+ "dropped_columns": to_drop,
317
+ "n_dropped": len(to_drop),
318
+ "highly_correlated_pairs": highly_correlated_pairs,
319
+ "remaining_features": df.shape[1],
320
+ }
321
+ '''
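The np.triu mask keeps each correlation pair exactly once, so a column is only compared against columns before it; a tiny demonstration with one perfectly collinear pair:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 4, 6, 8], "c": [4, 1, 3, 2]})  # b = 2 * a
corr = df.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
print([col for col in upper.columns if (upper[col] > 0.90).any()])  # ['b']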
322
+
323
+ TEXT_CLEANER_BASIC = '''import pandas as pd
324
+ import re
325
+ from typing import Annotated
326
+
327
+ def run(
328
+ data,
329
+ column: Annotated[str, "column"] = "text",
330
+ remove_stopwords: bool = True,
331
+ ) -> dict:
332
+ """Basic NLP text cleaning: lowercase, strip HTML, remove punctuation, remove stop words."""
333
+ df = data.copy()
334
+ if column not in df.columns:
335
+ raise ValueError(f"Column \'{column}\' not found in DataFrame.")
336
+ # Build stop-word set (use nltk if available, otherwise fall back to a minimal list)
337
+ stop_words = set()
338
+ if remove_stopwords:
339
+ try:
340
+ import nltk
341
+ try:
342
+ from nltk.corpus import stopwords
343
+ stop_words = set(stopwords.words("english"))
344
+ except LookupError:
345
+ nltk.download("stopwords", quiet=True)
346
+ from nltk.corpus import stopwords
347
+ stop_words = set(stopwords.words("english"))
348
+ except ImportError:
349
+ # Minimal fallback stop-word list if nltk is not installed
350
+ stop_words = {
351
+ "i","me","my","we","our","you","your","he","she","it","its",
352
+ "they","their","this","that","a","an","the","and","but","or",
353
+ "in","on","at","to","for","of","is","was","are","were","be",
354
+ "been","have","has","do","does","did","with","as","by","from",
355
+ }
356
+ def _clean(text):
357
+ if not isinstance(text, str):
358
+ return ""
359
+ text = text.lower()
360
+ text = re.sub(r"<[^>]+>", " ", text) # remove HTML tags
361
+ text = re.sub(r"[^a-z0-9\\s]", " ", text) # remove punctuation
362
+ text = re.sub(r"\\s+", " ", text).strip() # collapse whitespace
363
+ if stop_words:
364
+ text = " ".join(w for w in text.split() if w not in stop_words)
365
+ return text
366
+ original_sample = str(df[column].iloc[0]) if len(df) else ""
367
+ df[column] = df[column].apply(_clean)
368
+ return {
369
+ "data": df,
370
+ "cleaned_column": column,
371
+ "sample_before": original_sample[:120],
372
+ "sample_after": str(df[column].iloc[0])[:120] if len(df) else "",
373
+ }
374
+ '''
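The three regex passes in _clean, run by hand on one string (standard library only):

import re

text = "<p>Hello, World! Call me ASAP.</p>".lower()
text = re.sub(r"<[^>]+>", " ", text)      # strip HTML tags
text = re.sub(r"[^a-z0-9\s]", " ", text)  # drop punctuation
print(re.sub(r"\s+", " ", text).strip())  # "hello world call me asap"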
375
+
376
+ SMOTE_SAMPLER = '''import pandas as pd
377
+ import numpy as np
378
+ from typing import Annotated
379
+
380
+ def run(
381
+ data,
382
+ target_column: Annotated[str, "column"] = "target",
383
+ random_state: int = 42,
384
+ k_neighbors: int = 5,
385
+ ) -> dict:
386
+ """Oversample the minority class to fix class imbalance using SMOTE.
387
+ Falls back to RandomOverSampler, then to plain duplication of minority rows, if imbalanced-learn is unavailable."""
388
+ df = data.copy()
389
+ if target_column not in df.columns:
390
+ raise ValueError(f"Target column \'{target_column}\' not found in DataFrame.")
391
+ X = df.drop(columns=[target_column])
392
+ y = df[target_column]
393
+ # Keep only numeric features for sampling
394
+ X_num = X.select_dtypes(include=[np.number])
395
+ class_counts_before = y.value_counts().to_dict()
396
+ sampler_used = None
397
+ try:
398
+ from imblearn.over_sampling import SMOTE
399
+ sampler = SMOTE(k_neighbors=k_neighbors, random_state=random_state)
400
+ sampler_used = "SMOTE"
401
+ except ImportError:
402
+ try:
403
+ from imblearn.over_sampling import RandomOverSampler
404
+ sampler = RandomOverSampler(random_state=random_state)
405
+ sampler_used = "RandomOverSampler (SMOTE unavailable)"
406
+ except ImportError:
407
+ # Pure-numpy fallback: duplicate minority rows
408
+ classes, counts = np.unique(y, return_counts=True)
409
+ majority_count = counts.max()
410
+ X_resampled, y_resampled = X_num.copy(), y.copy()
411
+ for cls, cnt in zip(classes, counts):
412
+ if cnt < majority_count:
413
+ shortage = majority_count - cnt
414
+ minority_rows = X_num[y == cls]
415
+ minority_y = y[y == cls]
416
+ idx = np.random.RandomState(random_state).choice(len(minority_rows), shortage, replace=True)
417
+ X_resampled = pd.concat([X_resampled, minority_rows.iloc[idx]], ignore_index=True)
418
+ y_resampled = pd.concat([y_resampled, minority_y.iloc[idx]], ignore_index=True)
419
+ df_out = X_resampled.copy()
420
+ df_out[target_column] = y_resampled.values
421
+ return {
422
+ "data": df_out,
423
+ "sampler_used": "numpy_fallback",
424
+ "rows_before": len(data),
425
+ "rows_after": len(df_out),
426
+ "class_distribution_before": {str(k): int(v) for k, v in class_counts_before.items()},
427
+ "class_distribution_after": {str(k): int(v) for k, v in y_resampled.value_counts().items()},
428
+ }
429
+ X_res, y_res = sampler.fit_resample(X_num, y)
430
+ df_out = pd.DataFrame(X_res, columns=X_num.columns)
431
+ df_out[target_column] = y_res
432
+ return {
433
+ "data": df_out,
434
+ "sampler_used": sampler_used,
435
+ "rows_before": len(data),
436
+ "rows_after": len(df_out),
437
+ "class_distribution_before": {str(k): int(v) for k, v in class_counts_before.items()},
438
+ "class_distribution_after": {str(k): int(v) for k, v in pd.Series(y_res).value_counts().items()},
439
+ }
440
+ '''
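The happy path in isolation, assuming imbalanced-learn is installed; toy data with a 6:2 class split:

import pandas as pd
from imblearn.over_sampling import SMOTE

X = pd.DataFrame({"f1": [0, 1, 2, 3, 4, 5, 10, 11], "f2": [1, 0, 1, 0, 1, 0, 1, 0]})
y = pd.Series([0, 0, 0, 0, 0, 0, 1, 1])
X_res, y_res = SMOTE(k_neighbors=1, random_state=42).fit_resample(X, y)
print(pd.Series(y_res).value_counts().to_dict())  # {0: 6, 1: 6}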
441
+
442
+ DATA_CLEANING = '''import pandas as pd
443
+
444
+ def run(data, strategy: str = "drop", fill_value: float = 0.0) -> dict:
445
+ df = data.copy()
446
+ if strategy == "fill":
447
+ df = df.fillna(fill_value)
448
+ else:
449
+ df = df.dropna()
450
+ return {"data": df}
451
+ '''
452
+
453
+ LABEL_ENCODER = '''from sklearn.preprocessing import LabelEncoder
454
+ import pandas as pd
455
+
456
+ def run(data, columns: str = "") -> dict:
457
+ """Encode categorical columns. Leave columns blank to encode all object columns."""
458
+ df = data.copy()
459
+ encoders = {}
460
+ cols = [c.strip() for c in columns.split(",") if c.strip()] if columns.strip() else \
461
+ df.select_dtypes(include=["object", "category"]).columns.tolist()
462
+ for col in cols:
463
+ if col in df.columns:
464
+ le = LabelEncoder()
465
+ df[col] = le.fit_transform(df[col].astype(str))
466
+ encoders[col] = list(le.classes_)
467
+ return {"data": df, "encoders": encoders}
468
+ '''
469
+
470
+ ONE_HOT_ENCODER = '''import pandas as pd
471
+
472
+ def run(data, columns: str = "") -> dict:
473
+ """One-hot encode categorical columns. Leave columns blank to encode all object/category columns."""
474
+ df = data.copy()
475
+ cols = [c.strip() for c in columns.split(",") if c.strip()] if columns.strip() else \
476
+ df.select_dtypes(include=["object", "category"]).columns.tolist()
477
+ if cols:
478
+ df = pd.get_dummies(df, columns=cols, drop_first=False)
479
+ # Convert boolean columns produced by get_dummies to int (0/1) for sklearn
480
+ bool_cols = df.select_dtypes(include=["bool"]).columns.tolist()
481
+ if bool_cols:
482
+ df[bool_cols] = df[bool_cols].astype(int)
483
+ return {"data": df}
484
+ '''
485
+
486
+ TRAIN_TEST_SPLIT = '''from sklearn.model_selection import train_test_split as _split
487
+ from typing import Annotated
488
+
489
+ def run(
490
+ data,
491
+ target_column: Annotated[str, "column"] = "target",
492
+ test_size: float = 0.2,
493
+ random_state: int = 42,
494
+ ) -> dict:
495
+ X = data.drop(columns=[target_column])
496
+ y = data[target_column]
497
+ X_train, X_test, y_train, y_test = _split(
498
+ X, y, test_size=test_size, random_state=random_state
499
+ )
500
+ return {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test}
501
+ '''
502
+
503
+ STANDARD_SCALER = '''from sklearn.preprocessing import StandardScaler
504
+
505
+ def run(X_train, X_test, y_train, y_test) -> dict:
506
+ scaler = StandardScaler()
507
+ X_train_s = scaler.fit_transform(X_train)
508
+ X_test_s = scaler.transform(X_test)
509
+ return {"X_train": X_train_s, "X_test": X_test_s, "y_train": y_train, "y_test": y_test}
510
+ '''
511
+
512
+ MIN_MAX_SCALER = '''from sklearn.preprocessing import MinMaxScaler
513
+
514
+ def run(X_train, X_test, y_train, y_test) -> dict:
515
+ scaler = MinMaxScaler()
516
+ X_train_s = scaler.fit_transform(X_train)
517
+ X_test_s = scaler.transform(X_test)
518
+ return {"X_train": X_train_s, "X_test": X_test_s, "y_train": y_train, "y_test": y_test}
519
+ '''
520
+
521
+ PCA = '''from sklearn.decomposition import PCA
522
+
523
+ def run(X_train, X_test, y_train, y_test, n_components: int = 2) -> dict:
524
+ pca = PCA(n_components=n_components)
525
+ X_train_p = pca.fit_transform(X_train)
526
+ X_test_p = pca.transform(X_test)
527
+ explained = float(sum(pca.explained_variance_ratio_))
528
+ return {"X_train": X_train_p, "X_test": X_test_p, "y_train": y_train, "y_test": y_test,
529
+ "explained_variance": round(explained, 4)}
530
+ '''
531
+
532
+ # ── Shared CV helper (inlined into each template to stay self-contained) ────────
533
+ #
534
+ # def _run_cv(model, X_train, y_train, cv_folds, scoring):
535
+ # scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring=scoring)
536
+ # return {"cv_mean": round(float(scores.mean()), 4),
537
+ # "cv_std": round(float(scores.std()), 4),
538
+ # "cv_scores": [round(float(s), 4) for s in scores]}
539
+ #
540
+ # ── Classifier models ─────────────────────────────────────────────────────────
541
+
542
+ LOGISTIC_REGRESSION = '''from sklearn.linear_model import LogisticRegression
543
+ from sklearn.model_selection import cross_val_score
544
+
545
+ def run(
546
+ X_train, X_test, y_train,
547
+ C: float = 1.0, max_iter: int = 200, random_state: int = 42,
548
+ cross_validation: bool = False, cv_folds: int = 5,
549
+ ) -> dict:
550
+ model = LogisticRegression(C=C, max_iter=max_iter, random_state=random_state)
551
+ if cross_validation:
552
+ scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="accuracy")
553
+ model.fit(X_train, y_train)
554
+ y_pred = model.predict(X_test)
555
+ return {"model": model, "y_pred": y_pred,
556
+ "cv_mean": round(float(scores.mean()), 4),
557
+ "cv_std": round(float(scores.std()), 4),
558
+ "cv_scores": [round(float(s), 4) for s in scores]}
559
+ model.fit(X_train, y_train)
560
+ y_pred = model.predict(X_test)
561
+ return {"model": model, "y_pred": y_pred}
562
+ '''
563
+
564
+ RANDOM_FOREST_CLASSIFIER = '''from sklearn.ensemble import RandomForestClassifier
565
+ from sklearn.model_selection import cross_val_score
566
+
567
+ def run(
568
+ X_train, X_test, y_train,
569
    n_estimators: int = 100, max_depth: int = 0, random_state: int = 42,
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    md = max_depth if max_depth > 0 else None
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=md, random_state=random_state)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="accuracy")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

GRADIENT_BOOSTING_CLASSIFIER = '''from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    n_estimators: int = 100, learning_rate: float = 0.1, random_state: int = 42,
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=random_state)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="accuracy")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

DECISION_TREE_CLASSIFIER = '''from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    max_depth: int = 0, random_state: int = 42,
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    md = max_depth if max_depth > 0 else None
    model = DecisionTreeClassifier(max_depth=md, random_state=random_state)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="accuracy")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

SVM_CLASSIFIER = '''from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    C: float = 1.0, kernel: str = "rbf",
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    model = SVC(C=C, kernel=kernel)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="accuracy")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

KNN_CLASSIFIER = '''from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    n_neighbors: int = 5,
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="accuracy")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''
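
# A minimal usage sketch (editor's illustration with made-up toy data, not part
# of the package source): each classifier template above exposes the same run()
# contract, so a host can exec the source string and invoke it directly.
def _demo_classifier_template():
    import numpy as np
    rng = np.random.default_rng(0)
    X = rng.random((40, 3))
    y = (X[:, 0] > 0.5).astype(int)
    ns = {}
    exec(KNN_CLASSIFIER, ns)  # materialise the template's run()
    out = ns["run"](X[:30], X[30:], y[:30], n_neighbors=3, cross_validation=True, cv_folds=3)
    return out["cv_mean"], out["y_pred"]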

# ── Regressor models ──────────────────────────────────────────────────────────

LINEAR_REGRESSION = '''from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    model = LinearRegression()
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="r2")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

RANDOM_FOREST_REGRESSOR = '''from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    n_estimators: int = 100, max_depth: int = 0, random_state: int = 42,
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    md = max_depth if max_depth > 0 else None
    model = RandomForestRegressor(n_estimators=n_estimators, max_depth=md, random_state=random_state)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="r2")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

GRADIENT_BOOSTING_REGRESSOR = '''from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    n_estimators: int = 100, learning_rate: float = 0.1, random_state: int = 42,
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate, random_state=random_state)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="r2")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

DECISION_TREE_REGRESSOR = '''from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    max_depth: int = 0, random_state: int = 42,
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    md = max_depth if max_depth > 0 else None
    model = DecisionTreeRegressor(max_depth=md, random_state=random_state)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="r2")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

SVM_REGRESSOR = '''from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    C: float = 1.0, kernel: str = "rbf",
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    model = SVR(C=C, kernel=kernel)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="r2")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

KNN_REGRESSOR = '''from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    n_neighbors: int = 5,
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    model = KNeighborsRegressor(n_neighbors=n_neighbors)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="r2")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

# ── Evaluation ────────────────────────────────────────────────────────────────

ACCURACY = '''from sklearn.metrics import accuracy_score

def run(y_test, y_pred) -> dict:
    return {"accuracy": float(accuracy_score(y_test, y_pred))}
'''

CLASSIFICATION_REPORT = '''from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    confusion_matrix, classification_report,
)

def run(y_test, y_pred) -> dict:
    return {
        "accuracy": round(float(accuracy_score(y_test, y_pred)), 4),
        "f1_score": round(float(f1_score(y_test, y_pred, average="weighted", zero_division=0)), 4),
        "precision": round(float(precision_score(y_test, y_pred, average="weighted", zero_division=0)), 4),
        "recall": round(float(recall_score(y_test, y_pred, average="weighted", zero_division=0)), 4),
        "confusion_matrix": confusion_matrix(y_test, y_pred).tolist(),
        "class_report": classification_report(y_test, y_pred, output_dict=True, zero_division=0),
    }
'''

VALIDATION_REPORT = '''from sklearn.model_selection import cross_validate
import numpy as np

def run(model, X_train, y_train, cv_folds: int = 5) -> dict:
    """Run K-Fold cross validation and generate a robust validation report."""
    # Using macro averages for multiclass compatibility
    scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

    # Try classification scoring first; fall back to regression metrics below
    try:
        scores = cross_validate(model, X_train, y_train, cv=cv_folds, scoring=scoring)
        return {
            "cv_folds": cv_folds,
            "mean_accuracy": round(float(np.mean(scores['test_accuracy'])), 4),
            "std_accuracy": round(float(np.std(scores['test_accuracy'])), 4),
            "mean_precision": round(float(np.mean(scores['test_precision_macro'])), 4),
            "mean_recall": round(float(np.mean(scores['test_recall_macro'])), 4),
            "mean_f1": round(float(np.mean(scores['test_f1_macro'])), 4),
        }
    except Exception:
        # Fallback for regressors
        scores = cross_validate(model, X_train, y_train, cv=cv_folds, scoring=['r2', 'neg_mean_squared_error'])
        return {
            "cv_folds": cv_folds,
            "mean_r2": round(float(np.mean(scores['test_r2'])), 4),
            "mean_mse": round(float(np.mean(-scores['test_neg_mean_squared_error'])), 4),
        }
'''

REGRESSION_METRICS = '''from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

def run(y_test, y_pred) -> dict:
    mse = float(mean_squared_error(y_test, y_pred))
    return {
        "mse": round(mse, 4),
        "rmse": round(float(np.sqrt(mse)), 4),
        "mae": round(float(mean_absolute_error(y_test, y_pred)), 4),
        "r2": round(float(r2_score(y_test, y_pred)), 4),
    }
'''
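
# A hand-checkable sketch of the metric relationships above (illustrative
# values only): with y_test = [3, 5] and y_pred = [2, 6] every error is ±1, so
# MSE = MAE = RMSE = 1.0 and R² = 1 - SS_res/SS_tot = 1 - 2/2 = 0.0.
def _demo_regression_metrics():
    import numpy as np
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    y_true, y_hat = np.array([3.0, 5.0]), np.array([2.0, 6.0])
    assert mean_squared_error(y_true, y_hat) == 1.0
    assert mean_absolute_error(y_true, y_hat) == 1.0
    assert r2_score(y_true, y_hat) == 0.0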

FEATURE_IMPORTANCE = '''import numpy as np

def run(model, X_train) -> dict:
    """Extract feature importances from any tree-based model."""
    if not hasattr(model, "feature_importances_"):
        raise ValueError(f"{type(model).__name__} has no feature_importances_. Use RF, GBM, or DT.")
    imps = model.feature_importances_
    names = list(X_train.columns) if hasattr(X_train, "columns") else [f"f{i}" for i in range(len(imps))]
    pairs = sorted(zip(imps, names), reverse=True)
    top = min(20, len(pairs))
    return {
        "feature_importances": {name: round(float(imp), 6) for imp, name in pairs[:top]},
        "top_feature": pairs[0][1],
        "top_importance": round(float(pairs[0][0]), 6),
    }
'''

AUTO_ML = '''from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def run(X_train, X_test, y_train, y_test) -> dict:
    candidates = {
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "Gradient Boosting": GradientBoostingClassifier(random_state=42),
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    }
    scores = {}
    best_name, best_score, best_model, best_pred = "", -1.0, None, None
    for name, mdl in candidates.items():
        mdl.fit(X_train, y_train)
        preds = mdl.predict(X_test)
        score = float(accuracy_score(y_test, preds))
        scores[name] = round(score, 4)
        if score > best_score:
            best_score, best_name, best_model, best_pred = score, name, mdl, preds
    return {
        "model": best_model,
        "y_pred": best_pred,
        "best_algorithm": best_name,
        "accuracy": round(best_score, 4),
        "all_scores": scores,
    }
'''


EDA_HISTOGRAM = '''import pandas as pd
import numpy as np
from typing import Annotated

def run(data, column: Annotated[str, "column"] = "price", bins: int = 20) -> dict:
    """Distribution histogram for a single numeric column."""
    series = data[column].dropna()
    counts, edges = np.histogram(series, bins=bins)
    return {
        "histogram": {
            "column": column,
            "counts": counts.tolist(),
            "bin_edges": [round(float(e), 2) for e in edges.tolist()],
            "mean": round(float(series.mean()), 2),
            "median": round(float(series.median()), 2),
            "std": round(float(series.std()), 2),
            "min": round(float(series.min()), 2),
            "max": round(float(series.max()), 2),
        }
    }
'''

EDA_CORRELATION = '''import pandas as pd
import numpy as np

def run(data) -> dict:
    """Full correlation matrix for all numeric columns."""
    num_df = data.select_dtypes(include=[np.number])
    corr = num_df.corr().round(3)
    matrix = []
    for row in corr.values.tolist():
        matrix.append([None if (v != v) else round(v, 3) for v in row])
    return {
        "correlation_matrix": {
            "columns": list(corr.columns),
            "matrix": matrix,
        }
    }
'''

EDA_VALUE_COUNTS = '''import pandas as pd
from typing import Annotated

def run(data, column: Annotated[str, "column"] = "mainroad", top_n: int = 10) -> dict:
    """Value counts for a categorical column."""
    vc = data[column].value_counts().head(top_n)
    return {
        "value_counts": {
            "column": column,
            "labels": [str(l) for l in vc.index.tolist()],
            "counts": vc.values.tolist(),
            "total": int(len(data)),
        }
    }
'''

EDA_BOX_PLOT = '''import pandas as pd
import numpy as np
from typing import Annotated

def run(data, column: Annotated[str, "column"] = "price") -> dict:
    """Box plot statistics (min, Q1, median, Q3, max, outliers) for a column."""
    s = data[column].dropna()
    q1, q3 = float(s.quantile(0.25)), float(s.quantile(0.75))
    iqr = q3 - q1
    outliers = s[(s < q1 - 1.5*iqr) | (s > q3 + 1.5*iqr)]
    return {
        "box_plot": {
            "column": column,
            "min": round(float(s.min()), 2),
            "q1": round(q1, 2),
            "median": round(float(s.median()), 2),
            "q3": round(q3, 2),
            "max": round(float(s.max()), 2),
            "mean": round(float(s.mean()), 2),
            "iqr": round(iqr, 2),
            "outlier_count": int(len(outliers)),
            "whisker_low": round(float(max(s.min(), q1 - 1.5*iqr)), 2),
            "whisker_high": round(float(min(s.max(), q3 + 1.5*iqr)), 2),
        }
    }
'''
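
# A hand-checkable sketch of the IQR fence logic above (illustrative data): for
# the series [1, 2, 3, 4, 100], Q1 = 2.0 and Q3 = 4.0, so IQR = 2.0 and the
# upper fence is Q3 + 1.5*IQR = 7.0; only 100 falls outside it.
def _demo_box_plot_fences():
    import pandas as pd
    s = pd.Series([1, 2, 3, 4, 100], dtype=float)
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    assert (q1, q3) == (2.0, 4.0)
    iqr = q3 - q1
    assert int(((s < q1 - 1.5 * iqr) | (s > q3 + 1.5 * iqr)).sum()) == 1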

PREDICT = '''import pandas as pd
import json

def run(model, feature_json: str = "{}") -> dict:
    """Predict from a trained model. Enter feature values as a JSON string.
    Example: {"area": 7000, "bedrooms": 3, "bathrooms": 2, "stories": 2,
              "mainroad": 1, "guestroom": 0, "basement": 0,
              "hotwaterheating": 0, "airconditioning": 1, "parking": 2,
              "prefarea": 0, "furnishingstatus": 1}
    """
    features = json.loads(feature_json) if feature_json.strip() else {}
    if not features:
        raise ValueError("Enter feature values as JSON in the feature_json field")
    df = pd.DataFrame([features])
    prediction = model.predict(df)
    result = prediction[0]
    scalar = float(result) if hasattr(result, "__float__") else str(result)
    return {
        "prediction": scalar,
        "features_used": features,
    }
'''
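
# A minimal host-side sketch (assumed usage, illustrative feature subset):
# building feature_json with json.dumps avoids hand-escaping the string.
def _demo_predict_payload():
    import json
    features = {"area": 7000, "bedrooms": 3}  # illustrative subset only
    feature_json = json.dumps(features)
    # ns = {}; exec(PREDICT, ns); ns["run"](trained_model, feature_json)
    # (`trained_model` is a hypothetical fitted estimator from a model node)
    return feature_json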

CONFUSION_MATRIX_PLOTTER = '''import numpy as np
from sklearn.metrics import confusion_matrix

def run(y_test, y_pred, normalize: bool = False) -> dict:
    """Computes the confusion matrix and outputs a JSON-serializable structure."""
    labels = sorted(list(set(y_test) | set(y_pred)))
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        cm = np.nan_to_num(cm)

    matrix_data = cm.tolist()
    labels_data = [str(l) for l in labels]

    return {
        # emitted under the heatmap key so the shared heatmap renderer draws it
        "correlation_heatmap": {
            "x": labels_data,
            "y": labels_data,
            "z": matrix_data
        },
        "summary": "Confusion matrix for error analysis"
    }
'''

ROC_PR_CURVE_DATA = '''import numpy as np
from sklearn.metrics import roc_curve, precision_recall_curve, auc

def run(model, X_test, y_test) -> dict:
    """Calculates ROC and PR curves data (TPR, FPR, Precision, Recall) across thresholds."""
    if not hasattr(model, "predict_proba"):
        raise ValueError("Model does not support predict_proba() required for ROC/PR curves.")

    classes = model.classes_
    if len(classes) != 2:
        raise ValueError("ROC/PR curve data generation currently supports binary classification only.")

    y_scores = model.predict_proba(X_test)[:, 1]
    pos_label = classes[1]

    fpr, tpr, roc_thresh = roc_curve(y_test, y_scores, pos_label=pos_label)
    roc_auc = auc(fpr, tpr)

    precision, recall, pr_thresh = precision_recall_curve(y_test, y_scores, pos_label=pos_label)
    pr_auc = auc(recall, precision)

    return {
        "roc_curves": {
            "ROC (FPR vs TPR)": {
                "fpr": fpr.tolist(),
                "tpr": tpr.tolist(),
                "auc": float(roc_auc)
            },
            "PR (Recall vs Prec)": {
                "fpr": recall.tolist(),     # X-axis
                "tpr": precision.tolist(),  # Y-axis
                "auc": float(pr_auc)
            }
        }
    }
'''

RESIDUAL_PLOTTER = '''import numpy as np

def run(y_test, y_pred) -> dict:
    """Generates (Predicted, Residual) pairs for regression error analysis."""
    y_test_arr = np.array(y_test).ravel()
    y_pred_arr = np.array(y_pred).ravel()

    residuals = y_test_arr - y_pred_arr

    if len(y_pred_arr) > 1000:
        idx = np.random.choice(len(y_pred_arr), 1000, replace=False)
        y_pred_arr = y_pred_arr[idx]
        residuals = residuals[idx]

    return {
        "feature_target_scatter": {
            "x": y_pred_arr.tolist(),
            "y": residuals.tolist(),
            "feature_name": "Predicted",
            "target_name": "Residuals"
        },
        "summary": "Residual plot data points"
    }
'''

FEATURE_IMPORTANCE_VISUALIZER = '''import numpy as np

def run(model, X_train=None, top_n: int = 10) -> dict:
    """Extracts and sorts feature importances or coefficients from a model."""
    importances = None
    names = None

    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
    elif hasattr(model, 'coef_'):
        importances = np.abs(model.coef_[0]) if model.coef_.ndim > 1 else np.abs(model.coef_)
    else:
        raise ValueError("Model has neither 'feature_importances_' nor 'coef_'.")

    if hasattr(model, 'feature_names_in_'):
        names = model.feature_names_in_
    elif X_train is not None and hasattr(X_train, 'columns'):
        names = X_train.columns
    else:
        names = [f"Feature {i}" for i in range(len(importances))]

    indices = np.argsort(importances)[::-1][:top_n]

    top_importances = importances[indices].tolist()
    top_names = [str(names[i]) for i in indices]

    return {
        "value_counts": {
            "labels": top_names,
            "counts": top_importances,
            "column": "Feature Importance",
            "total": sum(top_importances) if sum(top_importances) > 0 else 1
        },
        "summary": f"Top {top_n} features"
    }
'''

DECISION_BOUNDARY_2D = '''import numpy as np
from sklearn.decomposition import PCA

def run(model, X_train, y_train, grid_resolution: int = 50) -> dict:
    """Reduces data to 2D (if needed) and predicts over a meshgrid to visualize decision boundaries."""
    X = np.array(X_train)
    y = np.array(y_train)

    pca_used = False
    if X.shape[1] > 2:
        pca = PCA(n_components=2)
        X_2d = pca.fit_transform(X)
        pca_used = True
    elif X.shape[1] == 2:
        X_2d = X
    else:
        raise ValueError("Cannot draw 2D boundary for 1D feature space.")

    x_min, x_max = X_2d[:, 0].min() - 1, X_2d[:, 0].max() + 1
    y_min, y_max = X_2d[:, 1].min() - 1, X_2d[:, 1].max() + 1

    xx, yy = np.meshgrid(
        np.linspace(x_min, x_max, grid_resolution),
        np.linspace(y_min, y_max, grid_resolution)
    )

    grid_points_2d = np.c_[xx.ravel(), yy.ravel()]

    # When PCA was used, map the 2D grid back to the original feature space so
    # the fitted model can predict on it
    if pca_used:
        grid_points_orig = pca.inverse_transform(grid_points_2d)
        Z = model.predict(grid_points_orig)
    else:
        Z = model.predict(grid_points_2d)

    sample_size = min(300, len(X_2d))
    idx = np.random.choice(len(X_2d), sample_size, replace=False)

    if hasattr(Z[0], "item"):
        Z = np.array([z.item() if hasattr(z, "item") else z for z in Z])

    return {
        "feature_target_scatter": {
            "x": X_2d[idx, 0].tolist(),
            "y": X_2d[idx, 1].tolist(),
            "labels": [str(v) for v in y[idx]],
            "feature_name": "Dim 1",
            "target_name": "Dim 2"
        },
        # grid predictions let a renderer shade the boundary regions
        "decision_grid": {
            "x": xx.ravel().tolist(),
            "y": yy.ravel().tolist(),
            "z": Z.tolist() if hasattr(Z, "tolist") else list(Z)
        },
        "summary": f"Decision Boundary Scatter. PCA applied: {pca_used}"
    }
'''

PREDICTION_VS_ACTUAL_SCATTER = '''import numpy as np

def run(y_test, y_pred) -> dict:
    """Returns actual vs predicted pairs for plotting a regression scatter plot."""
    y_test_arr = np.array(y_test).ravel()
    y_pred_arr = np.array(y_pred).ravel()

    if len(y_test_arr) > 1000:
        idx = np.random.choice(len(y_test_arr), 1000, replace=False)
        y_test_arr = y_test_arr[idx]
        y_pred_arr = y_pred_arr[idx]

    return {
        "feature_target_scatter": {
            "x": y_test_arr.tolist(),
            "y": y_pred_arr.tolist(),
            "feature_name": "Actual",
            "target_name": "Predicted"
        },
        "summary": "Actual vs Predicted points"
    }
'''

POLYNOMIAL_FEATURES = '''import pandas as pd
from sklearn.preprocessing import PolynomialFeatures

def run(data, degree: int = 2, interaction_only: bool = False) -> dict:
    df = data.copy()
    num_cols = df.select_dtypes(include=['number']).columns.tolist()
    if not num_cols:
        return {"data": df, "summary": "No numeric columns found"}

    poly = PolynomialFeatures(degree=degree, interaction_only=interaction_only, include_bias=False)
    poly_features = poly.fit_transform(df[num_cols])

    feature_names = poly.get_feature_names_out(num_cols)
    poly_df = pd.DataFrame(poly_features, columns=feature_names, index=df.index)

    df = df.drop(columns=num_cols)
    df = pd.concat([df, poly_df], axis=1)

    return {"data": df, "summary": f"Added {len(feature_names)} polynomial features"}
'''

DATETIME_EXTRACTOR = '''import pandas as pd
from typing import Annotated

def run(data, column: Annotated[str, "column"] = "date") -> dict:
    df = data.copy()
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found.")

    dt_series = pd.to_datetime(df[column], errors='coerce')

    df[f"{column}_year"] = dt_series.dt.year
    df[f"{column}_month"] = dt_series.dt.month
    df[f"{column}_day"] = dt_series.dt.day
    df[f"{column}_dayofweek"] = dt_series.dt.dayofweek

    df = df.drop(columns=[column])

    return {"data": df, "summary": f"Extracted datetime components from {column}"}
'''

RFE_FEATURE_SELECTOR = '''import pandas as pd
import numpy as np
from typing import Annotated
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor

def run(data, target_column: Annotated[str, "column"] = "target", n_to_select: int = 10) -> dict:
    df = data.copy()
    if target_column not in df.columns:
        raise ValueError(f"Target column '{target_column}' not found.")

    y = df[target_column]
    X = df.drop(columns=[target_column])

    X_num = X.select_dtypes(include=[np.number])
    X_non_num = X.select_dtypes(exclude=[np.number])

    if X_num.shape[1] <= n_to_select:
        return {"data": df, "summary": "Feature count already <= n_to_select"}

    # Regression estimator: assumes a numeric target; use a classifier
    # estimator instead if the target holds class labels
    estimator = RandomForestRegressor(n_estimators=50, random_state=42)
    selector = RFE(estimator, n_features_to_select=n_to_select, step=1)

    selector.fit(X_num, y)

    selected_features = X_num.columns[selector.support_].tolist()

    df_out = pd.concat([X_non_num, X_num[selected_features]], axis=1)
    df_out[target_column] = y

    return {"data": df_out, "selected_features": selected_features}
'''

INVERSE_TARGET_TRANSFORMER = '''import numpy as np

def run(y_pred, y_test, method: str = "expm1") -> dict:
    y_p = np.array(y_pred)
    y_t = np.array(y_test)

    if method == "expm1":
        y_p_inv = np.expm1(y_p)
        y_t_inv = np.expm1(y_t)
    elif method == "square":
        y_p_inv = np.square(y_p)
        y_t_inv = np.square(y_t)
    else:
        raise ValueError(f"Unsupported method: {method}")

    return {
        "y_pred": y_p_inv.tolist() if hasattr(y_p_inv, "tolist") else y_p_inv,
        "y_test": y_t_inv.tolist() if hasattr(y_t_inv, "tolist") else y_t_inv,
        "method": method
    }
'''

THRESHOLD_OPTIMIZER = '''import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

def run(model, X_test, y_test, target_metric: str = "f1") -> dict:
    if not hasattr(model, "predict_proba"):
        raise ValueError("Model does not support predict_proba required for threshold optimization.")

    classes = model.classes_
    if len(classes) != 2:
        raise ValueError("Threshold optimization requires binary classification.")

    y_scores = model.predict_proba(X_test)[:, 1]
    pos_label = classes[1]

    y_true_bin = (np.array(y_test) == pos_label).astype(int)

    thresholds = np.linspace(0.0, 1.0, 101)
    best_thresh = 0.5
    best_score = -1.0

    metrics = []

    for t in thresholds:
        y_pred_t = (y_scores >= t).astype(int)

        if target_metric == "f1":
            score = f1_score(y_true_bin, y_pred_t, zero_division=0)
        elif target_metric == "precision":
            score = precision_score(y_true_bin, y_pred_t, zero_division=0)
        elif target_metric == "recall":
            score = recall_score(y_true_bin, y_pred_t, zero_division=0)
        elif target_metric == "accuracy":
            score = accuracy_score(y_true_bin, y_pred_t)
        else:
            raise ValueError(f"Unsupported target_metric: {target_metric}")

        metrics.append({"threshold": float(t), "score": float(score)})

        if score > best_score:
            best_score = score
            best_thresh = t

    return {
        "best_threshold": float(best_thresh),
        "best_score": float(best_score),
        "target_metric": target_metric,
        "curve_data": metrics
    }
'''
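
# A minimal end-to-end sketch (illustrative synthetic data, assumed host-side
# call): sweep thresholds for a logistic model and read back the best F1 point.
def _demo_threshold_optimizer():
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 2))
    y = (X[:, 0] + 0.5 * rng.normal(size=200) > 0).astype(int)
    clf = LogisticRegression().fit(X[:150], y[:150])
    ns = {}
    exec(THRESHOLD_OPTIMIZER, ns)
    out = ns["run"](clf, X[150:], y[150:], target_metric="f1")
    return out["best_threshold"], out["best_score"]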


BINARY_ENCODER = '''import pandas as pd
try:
    import category_encoders as ce
except ImportError:
    pass

def run(data, columns: str = "") -> dict:
    df = data.copy()
    cols = [c.strip() for c in columns.split(",") if c.strip()] if columns.strip() else df.select_dtypes(include=["object", "category"]).columns.tolist()

    if not cols:
        return {"data": df, "summary": "No columns to encode"}

    try:
        encoder = ce.BinaryEncoder(cols=cols)
        df = encoder.fit_transform(df)
        return {"data": df, "summary": f"Binary encoded {len(cols)} columns"}
    except NameError:
        return {"data": df, "summary": "category_encoders not installed. Skipping."}
'''

FREQUENCY_ENCODER = '''import pandas as pd

def run(data, columns: str = "") -> dict:
    df = data.copy()
    cols = [c.strip() for c in columns.split(",") if c.strip()] if columns.strip() else df.select_dtypes(include=["object", "category"]).columns.tolist()

    if not cols:
        return {"data": df, "summary": "No columns to encode"}

    for col in cols:
        if col in df.columns:
            freq = df[col].value_counts(normalize=True)
            df[col] = df[col].map(freq)

    return {"data": df, "summary": f"Frequency encoded {len(cols)} columns"}
'''
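
# A hand-checkable sketch of the frequency mapping above (illustrative frame):
# with values ["a", "a", "b"], value_counts(normalize=True) yields a -> 2/3
# and b -> 1/3, and those ratios replace the original categories.
def _demo_frequency_encoding():
    import pandas as pd
    df = pd.DataFrame({"city": ["a", "a", "b"]})
    freq = df["city"].value_counts(normalize=True)
    assert round(float(freq["a"]), 4) == 0.6667
    assert round(float(freq["b"]), 4) == 0.3333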

ORDINAL_ENCODER = '''import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

def run(data, columns: str = "") -> dict:
    df = data.copy()
    cols = [c.strip() for c in columns.split(",") if c.strip()] if columns.strip() else df.select_dtypes(include=["object", "category"]).columns.tolist()

    if not cols:
        return {"data": df, "summary": "No columns to encode"}

    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    df[cols] = encoder.fit_transform(df[cols].astype(str))

    return {"data": df, "summary": f"Ordinal encoded {len(cols)} columns"}
'''

VIF_FEATURE_SELECTION = '''import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

def run(data, threshold: float = 5.0) -> dict:
    df = data.copy()
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    if len(num_cols) < 2:
        return {"data": df, "summary": "Not enough numeric columns for VIF"}

    X = df[num_cols].dropna()
    dropped = []

    while len(X.columns) > 1:  # VIF needs at least two regressors
        vif_data = pd.DataFrame()
        vif_data["feature"] = X.columns
        vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]

        max_vif = vif_data["VIF"].max()
        if max_vif > threshold:
            max_feat = vif_data.sort_values("VIF", ascending=False).iloc[0]["feature"]
            X = X.drop(columns=[max_feat])
            dropped.append(max_feat)
        else:
            break

    df = df.drop(columns=dropped)
    return {"data": df, "summary": f"Dropped {len(dropped)} features due to high VIF: {dropped}"}
'''

PCA_WHITENING = '''import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

def run(data, n_components: int = 0, whiten: bool = True) -> dict:
    df = data.copy()
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    if not num_cols:
        return {"data": df, "summary": "No numeric columns for PCA"}

    n = n_components if n_components > 0 else None
    pca = PCA(n_components=n, whiten=whiten)

    pca_features = pca.fit_transform(df[num_cols].fillna(0))

    feature_names = [f"pca_{i}" for i in range(pca_features.shape[1])]
    pca_df = pd.DataFrame(pca_features, columns=feature_names, index=df.index)

    df = df.drop(columns=num_cols)
    df = pd.concat([df, pca_df], axis=1)

    return {"data": df, "summary": f"Applied PCA whitening, created {len(feature_names)} components"}
'''

K_MEANS_CLUSTERING_FEATURES = '''import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

def run(data, n_clusters: int = 3, random_state: int = 42) -> dict:
    df = data.copy()
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    if not num_cols:
        return {"data": df, "summary": "No numeric columns for KMeans"}

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init='auto')
    df['ClusterID'] = kmeans.fit_predict(df[num_cols].fillna(0))

    return {"data": df, "summary": f"Added ClusterID with {n_clusters} clusters"}
'''

XGBOOST_NODE = '''import pandas as pd
import numpy as np

def run(X_train, X_test, y_train, task_type: str = "classifier", n_estimators: int = 100, learning_rate: float = 0.1, max_depth: int = 3) -> dict:
    if task_type.lower() == "classifier":
        from xgboost import XGBClassifier
        model = XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, use_label_encoder=False, eval_metric='logloss')
    else:
        from xgboost import XGBRegressor
        model = XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return {"model": model, "y_pred": y_pred}
'''

LIGHTGBM_NODE = '''import pandas as pd

def run(X_train, X_test, y_train, task_type: str = "classifier", n_estimators: int = 100, learning_rate: float = 0.1, max_depth: int = -1) -> dict:
    if task_type.lower() == "classifier":
        from lightgbm import LGBMClassifier
        model = LGBMClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
    else:
        from lightgbm import LGBMRegressor
        model = LGBMRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return {"model": model, "y_pred": y_pred}
'''

ADABOOST_NODE = '''import pandas as pd

def run(X_train, X_test, y_train, task_type: str = "classifier", n_estimators: int = 50, learning_rate: float = 1.0) -> dict:
    if task_type.lower() == "classifier":
        from sklearn.ensemble import AdaBoostClassifier
        model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
    else:
        from sklearn.ensemble import AdaBoostRegressor
        model = AdaBoostRegressor(n_estimators=n_estimators, learning_rate=learning_rate)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return {"model": model, "y_pred": y_pred}
'''

VOTING_ENSEMBLE = '''import pandas as pd
import numpy as np

def run(model1, model2, model3, X_train, X_test, y_train, task_type: str = "classifier", voting: str = "hard") -> dict:
    estimators = [('m1', model1), ('m2', model2), ('m3', model3)]

    if task_type.lower() == "classifier":
        from sklearn.ensemble import VotingClassifier
        model = VotingClassifier(estimators=estimators, voting=voting)
    else:
        from sklearn.ensemble import VotingRegressor
        model = VotingRegressor(estimators=estimators)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return {"model": model, "y_pred": y_pred}
'''

LAG_FEATURE_GENERATOR = '''import pandas as pd
from typing import Annotated

def run(data, column: Annotated[str, "column"], lags: int = 3) -> dict:
    df = data.copy()
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found.")

    for i in range(1, lags + 1):
        df[f"{column}_lag_{i}"] = df[column].shift(i)

    return {"data": df, "summary": f"Created {lags} lag features for {column}"}
'''
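
# A hand-checkable sketch of shift() above (illustrative series): lagging
# [10, 20, 30] by one gives [NaN, 10, 20], so the first `lags` rows carry NaNs
# that a downstream dropna or imputer node would need to handle.
def _demo_lag_features():
    import pandas as pd
    lagged = pd.Series([10, 20, 30]).shift(1)
    assert lagged.isna().sum() == 1 and lagged.iloc[1] == 10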

ROLLING_WINDOW_STATS = '''import pandas as pd
from typing import Annotated

def run(data, column: Annotated[str, "column"], window: int = 7) -> dict:
    df = data.copy()
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found.")

    df[f"{column}_roll_mean_{window}"] = df[column].rolling(window=window).mean()
    df[f"{column}_roll_std_{window}"] = df[column].rolling(window=window).std()

    return {"data": df, "summary": f"Created rolling window ({window}) stats for {column}"}
'''

PERMUTATION_IMPORTANCE = '''import numpy as np
from sklearn.inspection import permutation_importance

def run(model, X_test, y_test, scoring: str = "accuracy", n_repeats: int = 5, random_state: int = 42) -> dict:
    result = permutation_importance(model, X_test, y_test, scoring=scoring, n_repeats=n_repeats, random_state=random_state)

    importances = result.importances_mean

    if hasattr(X_test, 'columns'):
        names = X_test.columns
    else:
        names = [f"Feature {i}" for i in range(len(importances))]

    indices = np.argsort(importances)[::-1]

    top_importances = importances[indices].tolist()
    top_names = [str(names[i]) for i in indices]

    return {
        "features": top_names,
        "importances": top_importances,
        "summary": "Permutation Importance"
    }
'''

LEARNING_CURVE_DATA = '''import numpy as np
from sklearn.model_selection import learning_curve

def run(model, X_train, y_train, cv_folds: int = 5, scoring: str = "accuracy") -> dict:
    train_sizes, train_scores, test_scores = learning_curve(
        model, X_train, y_train, cv=cv_folds, scoring=scoring,
        train_sizes=np.linspace(0.1, 1.0, 10), random_state=42
    )

    train_mean = np.mean(train_scores, axis=1).tolist()
    test_mean = np.mean(test_scores, axis=1).tolist()
    sizes = train_sizes.tolist()

    return {
        "train_sizes": sizes,
        "train_scores": train_mean,
        "val_scores": test_mean,
        "scoring": scoring,
        "summary": "Learning Curve Data"
    }
'''

LIFT_GAIN_CHARTS = '''import numpy as np

def run(model, X_test, y_test) -> dict:
    if not hasattr(model, "predict_proba"):
        raise ValueError("Model does not support predict_proba required for lift/gain charts.")

    classes = model.classes_
    if len(classes) != 2:
        raise ValueError("Lift/Gain charts require binary classification.")

    y_scores = model.predict_proba(X_test)[:, 1]
    pos_label = classes[1]

    y_true_bin = (np.array(y_test) == pos_label).astype(int)

    indices = np.argsort(y_scores)[::-1]
    y_true_sorted = y_true_bin[indices]

    total_positives = y_true_bin.sum()
    total_samples = len(y_true_bin)

    cum_positives = np.cumsum(y_true_sorted)

    gain = cum_positives / max(total_positives, 1)

    # lift = precision within the top-k cut divided by the base positive rate
    count = np.arange(1, total_samples + 1)
    base_rate = max(total_positives, 1) / total_samples
    lift = (cum_positives / count) / base_rate

    deciles = np.linspace(0, 1, 11)[1:]
    decile_idx = [min(max(int(d * total_samples) - 1, 0), total_samples - 1) for d in deciles]
    gain_deciles = [float(gain[i]) for i in decile_idx]
    lift_deciles = [float(lift[i]) for i in decile_idx]

    return {
        "deciles": deciles.tolist(),
        "gain": gain_deciles,
        "lift": lift_deciles,
        "summary": "Lift and Gain Data"
    }
'''

TOMEK_LINKS = '''import pandas as pd
import numpy as np
from typing import Annotated

def run(data, target_column: Annotated[str, "column"] = "target") -> dict:
    df = data.copy()
    if target_column not in df.columns:
        raise ValueError(f"Target column '{target_column}' not found.")

    X = df.drop(columns=[target_column])
    y = df[target_column]

    try:
        from imblearn.under_sampling import TomekLinks
        tl = TomekLinks()
        X_res, y_res = tl.fit_resample(X, y)
        df_out = X_res.copy()
        df_out[target_column] = y_res

        return {"data": df_out, "summary": f"Tomek Links removed {len(df) - len(df_out)} noisy samples."}
    except ImportError:
        return {"data": df, "summary": "imbalanced-learn not installed. Skipping Tomek Links."}
'''

RANDOM_UNDER_SAMPLER = '''import pandas as pd
from typing import Annotated

def run(data, target_column: Annotated[str, "column"] = "target", random_state: int = 42) -> dict:
    df = data.copy()
    if target_column not in df.columns:
        raise ValueError(f"Target column '{target_column}' not found.")

    try:
        from imblearn.under_sampling import RandomUnderSampler
        X = df.drop(columns=[target_column])
        y = df[target_column]
        rus = RandomUnderSampler(random_state=random_state)
        X_res, y_res = rus.fit_resample(X, y)
        df_out = X_res.copy()
        df_out[target_column] = y_res

        return {"data": df_out, "summary": f"Undersampled from {len(df)} to {len(df_out)} rows."}
    except ImportError:
        min_class_size = df[target_column].value_counts().min()
        df_out = df.groupby(target_column).sample(n=min_class_size, random_state=random_state)
        return {"data": df_out, "summary": f"Manually undersampled from {len(df)} to {len(df_out)} rows."}
'''

CORRELATION_HEATMAP = '''import pandas as pd
import numpy as np

def run(data: pd.DataFrame) -> dict:
    numeric_df = data.select_dtypes(include=[np.number])
    if numeric_df.empty:
        return {"error": "No numeric columns found for correlation."}

    corr_matrix = numeric_df.corr().fillna(0)
    cols = corr_matrix.columns.tolist()

    z_values = corr_matrix.values.tolist()

    return {
        "correlation_heatmap": {
            "x": cols,
            "y": cols,
            "z": z_values
        }
    }
'''

MISSING_VALUE_MAP = '''import pandas as pd

def run(data: pd.DataFrame) -> dict:
    if len(data) > 100:
        sampled = data.sample(100, random_state=42)
    else:
        sampled = data

    null_matrix = sampled.isnull().astype(int)
    cols = null_matrix.columns.tolist()

    z_values = null_matrix.values.tolist()

    return {
        "missing_value_map": {
            "x": cols,
            "y": list(range(len(sampled))),
            "z": z_values
        }
    }
'''

CLASS_BALANCE_VISUALIZER = '''import pandas as pd
from typing import Annotated

def run(data: pd.DataFrame, target: Annotated[str, "column"]) -> dict:
    if target not in data.columns:
        return {"error": f"Target column '{target}' not found."}

    counts = data[target].value_counts(dropna=False)
    percentages = data[target].value_counts(normalize=True, dropna=False) * 100

    classes = [str(c) for c in counts.index]

    return {
        "class_balance": {
            "classes": classes,
            "counts": counts.tolist(),
            "percentages": percentages.tolist()
        }
    }
'''

FEATURE_TARGET_SCATTER = '''import pandas as pd
import numpy as np
from typing import Annotated

def run(data: pd.DataFrame, target: Annotated[str, "column"]) -> dict:
    if target not in data.columns:
        return {"error": f"Target column '{target}' not found."}

    df = data.dropna(subset=[target])

    # Identify numeric features (excluding target)
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if target in numeric_cols:
        numeric_cols.remove(target)

    if not numeric_cols:
        return {"error": "No numeric features found."}

    target_is_numeric = pd.api.types.is_numeric_dtype(df[target])

    corrs = []
    for col in numeric_cols:
        if target_is_numeric:
            corr = df[col].corr(df[target])
            if pd.isna(corr):
                corr = 0
            corrs.append((col, corr))
        else:
            corrs.append((col, 0))

    # Sort by absolute correlation and take top 8
    corrs.sort(key=lambda x: abs(x[1]), reverse=True)
    top_features = [x[0] for x in corrs[:8]]
    top_corrs = {x[0]: x[1] for x in corrs[:8]}

    # Cap samples for rendering speed
    if len(df) > 600:
        df = df.sample(600, random_state=42)

    panels = []
    for f in top_features:
        panels.append({
            "feature": f,
            "x": df[f].tolist(),
            "y": df[target].tolist(),
            "corr": top_corrs[f]
        })

    return {
        "scatter_grid": {
            "panels": panels,
            "title": "Continuous Feature vs Target",
            "target_name": target
        }
    }
'''

MODEL_ERROR_HISTOGRAM = '''import numpy as np

def run(y_test, y_pred) -> dict:
    y_t = np.array(y_test)
    y_p = np.array(y_pred)

    residuals = y_t - y_p
    counts, bin_edges = np.histogram(residuals, bins=30)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

    return {
        "model_error_histogram": {
            "counts": counts.tolist(),
            "bin_centers": bin_centers.tolist(),
            "bin_edges": bin_edges.tolist()
        }
    }
'''

PARTIAL_DEPENDENCE_DATA = '''import pandas as pd
import numpy as np
from sklearn.inspection import partial_dependence
from typing import Annotated

def run(model, X_train: pd.DataFrame, feature: Annotated[str, "column"]) -> dict:
    if feature not in X_train.columns:
        return {"error": f"Feature '{feature}' not found in X_train."}

    if len(X_train) > 1000:
        X_sample = X_train.sample(1000, random_state=42)
    else:
        X_sample = X_train

    try:
        pd_result = partial_dependence(model, X_sample, features=[feature], kind='average')
        # 'average' has shape (n_outputs, n_grid_points); take the first output
        avg_response = pd_result['average'][0].tolist()
        grid_values = pd_result['grid_values'][0].tolist()

        return {
            "partial_dependence": {
                "feature": feature,
                "grid": grid_values,
                "average_response": avg_response
            }
        }
    except Exception as e:
        return {"error": f"Partial dependence failed: {str(e)}"}
'''

MULTICLASS_ROC_DATA = '''import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

def run(model, X_test, y_test) -> dict:
    if not hasattr(model, "predict_proba"):
        return {"error": "Model does not support predict_proba, which is required for ROC."}

    y_t = np.array(y_test)
    classes = np.unique(y_t)

    if len(classes) < 2:
        return {"error": "Need at least 2 classes for ROC."}

    try:
        y_prob = model.predict_proba(X_test)
    except Exception as e:
        return {"error": f"predict_proba failed: {str(e)}"}

    roc_data = {}

    if len(classes) == 2:
        fpr, tpr, _ = roc_curve(y_t, y_prob[:, 1], pos_label=classes[1])
        roc_data[str(classes[1])] = {
            "fpr": fpr.tolist(),
            "tpr": tpr.tolist(),
            "auc": float(auc(fpr, tpr))
        }
    else:
        # One-vs-rest: binarize the labels and compute a curve per class
        y_test_bin = label_binarize(y_t, classes=classes)
        for i, cls in enumerate(classes):
            fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
            roc_data[str(cls)] = {
                "fpr": fpr.tolist(),
                "tpr": tpr.tolist(),
                "auc": float(auc(fpr, tpr))
            }

    return {"roc_curves": roc_data}
'''

DBSCAN_CLUSTERING = '''import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN

def run(data: pd.DataFrame, eps: float = 0.5, min_samples: int = 5) -> dict:
    numeric_df = data.select_dtypes(include=[np.number]).dropna()
    if numeric_df.empty:
        return {"error": "No numeric data available for DBSCAN."}

    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    labels = dbscan.fit_predict(numeric_df)

    out_df = numeric_df.copy()
    out_df["ClusterID"] = labels

    noise_count = int((labels == -1).sum())

    return {
        "data": out_df,
        "labels": labels.tolist(),
        "summary": f"DBSCAN found {len(set(labels)) - (1 if -1 in labels else 0)} clusters and {noise_count} noise points."
    }
'''

TSNE_VISUALIZER = '''import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from typing import Annotated

def run(
    data: pd.DataFrame,
    label_column: Annotated[str, "column"] = "",
    n_clusters: int = 4,
    perplexity: float = 30.0,
) -> dict:
    """t-SNE 2D scatter with automatic cluster coloring.

    label_column : name of a column whose values will color the dots.
                   Leave blank → KMeans auto-clustering (n_clusters groups).
    n_clusters   : number of KMeans clusters used when label_column is blank.
    perplexity   : t-SNE perplexity (5–50 works for most datasets).
    """
    df = data.copy()
    numeric_df = df.select_dtypes(include=[np.number]).dropna()

    if numeric_df.empty:
        return {"error": "No numeric data available for t-SNE."}

    # Cap at 3000 rows for speed; keep a reproducible sample
    if len(numeric_df) > 3000:
        numeric_df = numeric_df.sample(3000, random_state=42)

    # ── Run t-SNE ──────────────────────────────────────────────────────────────
    perp = min(float(perplexity), max(5.0, (len(numeric_df) - 1) / 3))
    tsne = TSNE(n_components=2, perplexity=perp, random_state=42, n_iter=300)
    embedding = tsne.fit_transform(numeric_df.values)

    # ── Build labels array for color-coding ───────────────────────────────────
    col = label_column.strip() if label_column else ""

    if col and col in df.columns:
        # Use the requested column aligned to sampled rows
        raw = df[col].loc[numeric_df.index]
        dtype = raw.dtype

        if dtype == object or str(dtype) == "category":
            # Categorical: use as-is
            labels = [str(v) for v in raw]
        else:
            # Numeric target → bucket into up to n_clusters quantile bands for readability
            try:
                banded = pd.qcut(raw, q=min(n_clusters, raw.nunique()), duplicates="drop", labels=False)
                labels = [f"Q{int(v)+1}" if pd.notna(v) else "?" for v in banded]
            except Exception:
                labels = [str(v) for v in raw]
    else:
        # Auto-cluster with KMeans
        from sklearn.cluster import KMeans
        from sklearn.preprocessing import StandardScaler
        scaled = StandardScaler().fit_transform(numeric_df.values)
        k = min(n_clusters, len(numeric_df) - 1, 10)
        km = KMeans(n_clusters=k, random_state=42, n_init="auto")
        cluster_ids = km.fit_predict(scaled)
        labels = [f"Cluster {int(c)}" for c in cluster_ids]

    n = len(embedding)
    dot_size = 8 if n < 500 else (6 if n < 1500 else 4)

    return {
        "tsne_scatter": {
            "x": embedding[:, 0].tolist(),
            "y": embedding[:, 1].tolist(),
            "labels": labels,  # ← color-coding key
            "title": "t-SNE Cluster Visualization",
            "x_label": "Dim 1",
            "y_label": "Dim 2",
            "dot_size": dot_size,
        },
        "summary": f"t-SNE projection of {n} samples with {len(set(labels))} color groups.",
    }
'''
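
# A minimal usage sketch (illustrative random data, assumed host-side call):
# with label_column left blank, the template falls back to KMeans colour groups.
def _demo_tsne_visualizer():
    import numpy as np
    import pandas as pd
    rng = np.random.default_rng(0)
    df = pd.DataFrame(rng.normal(size=(120, 5)), columns=list("abcde"))
    ns = {}
    exec(TSNE_VISUALIZER, ns)
    out = ns["run"](df, label_column="", n_clusters=3, perplexity=15.0)
    return out["tsne_scatter"]["labels"][:5]  # e.g. ["Cluster 0", ...]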

ISOLATION_FOREST_ANOMALY = '''import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

def run(data: pd.DataFrame, contamination: float = 0.05) -> dict:
    numeric_df = data.select_dtypes(include=[np.number]).dropna()
    if numeric_df.empty:
        return {"error": "No numeric data available for Isolation Forest."}

    clf = IsolationForest(contamination=contamination, random_state=42)
    labels = clf.fit_predict(numeric_df)

    out_df = numeric_df.copy()
    out_df["is_anomaly"] = labels

    anomaly_count = int((labels == -1).sum())

    return {
        "data": out_df,
        "summary": f"Isolation Forest detected {anomaly_count} anomalies."
    }
'''

STACKING_REGRESSOR = '''import pandas as pd
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
try:
    from xgboost import XGBRegressor
except ImportError:
    XGBRegressor = None

def run(X_train, y_train, X_test) -> dict:
    estimators = [
        ('lr', LinearRegression()),
        ('rf', RandomForestRegressor(n_estimators=50, random_state=42))
    ]
    if XGBRegressor is not None:
        estimators.append(('xgb', XGBRegressor(n_estimators=50, random_state=42)))

    model = StackingRegressor(
        estimators=estimators,
        final_estimator=Ridge()
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return {
        "model": model,
        "y_pred": y_pred.tolist(),
        "summary": f"Stacked Regressor with {len(estimators)} base models and Ridge final estimator."
    }
'''

QUANTILE_REGRESSOR = '''import pandas as pd
from typing import Annotated

def run(X_train, y_train, X_test, quantile: Annotated[float, "percentile (0-1)"] = 0.5) -> dict:
    try:
        from sklearn.linear_model import QuantileRegressor
        model = QuantileRegressor(quantile=quantile, solver='highs')
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        return {
            "model": model,
            "y_pred": y_pred.tolist(),
            "summary": f"Quantile Regressor fitted for {quantile*100}th percentile."
        }
    except Exception as e:
        return {"error": f"QuantileRegressor failed: {str(e)}"}
'''

EPSILON_GREEDY_BANDIT = '''import pandas as pd
import numpy as np
from typing import Annotated

def run(data: pd.DataFrame, action_col: Annotated[str, "column"], reward_col: Annotated[str, "column"], epsilon: float = 0.1) -> dict:
    if action_col not in data.columns or reward_col not in data.columns:
        return {"error": "Action or reward column not found."}

    actions = data[action_col].unique()
    q_values = {a: 0.0 for a in actions}
    action_counts = {a: 0 for a in actions}

    # Incremental mean: Q_n = Q_{n-1} + (r - Q_{n-1}) / n, estimated offline
    # from the logged rows; epsilon is accepted for API parity but no random
    # exploration happens during this offline replay
    for _, row in data.iterrows():
        a = row[action_col]
        r = row[reward_col]
        action_counts[a] += 1
        q_values[a] += (r - q_values[a]) / action_counts[a]

    optimal_action = max(q_values, key=q_values.get)

    return {
        "q_values": {str(k): float(v) for k, v in q_values.items()},
        "optimal_policy": str(optimal_action),
        "summary": f"Bandit converged on optimal action: {optimal_action}"
    }
'''
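
# A hand-checkable sketch of the incremental mean above (illustrative log): for
# action "a" with rewards 1, 0, 1 the running estimate is 1 → 1/2 → 2/3.
def _demo_bandit_estimate():
    import pandas as pd
    log = pd.DataFrame({"action": ["a", "a", "a", "b"], "reward": [1, 0, 1, 0]})
    ns = {}
    exec(EPSILON_GREEDY_BANDIT, ns)
    out = ns["run"](log, action_col="action", reward_col="reward")
    assert abs(out["q_values"]["a"] - 2 / 3) < 1e-9
    return out["optimal_policy"]  # "a"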

MARKOV_CHAIN_SIMULATOR = '''import pandas as pd
import numpy as np
from typing import Annotated

def run(data: pd.DataFrame, state_col: Annotated[str, "column"]) -> dict:
    if state_col not in data.columns:
        return {"error": "State column not found."}

    states = data[state_col].astype(str).tolist()
    transitions = {}

    for i in range(len(states) - 1):
        curr_state = states[i]
        next_state = states[i+1]

        if curr_state not in transitions:
            transitions[curr_state] = {}
        if next_state not in transitions[curr_state]:
            transitions[curr_state][next_state] = 0

        transitions[curr_state][next_state] += 1

    probs = {}
    for state, next_states in transitions.items():
        total = sum(next_states.values())
        probs[state] = {k: v / total for k, v in next_states.items()}

    return {
        "transition_probabilities": probs,
        "summary": f"Markov Chain built with {len(probs)} distinct states."
    }
'''
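
# A hand-checkable sketch of the transition counting above (illustrative
# sequence): A → B → A → A yields P(A→B) = P(A→A) = 0.5 and P(B→A) = 1.0.
def _demo_markov_probs():
    import pandas as pd
    seq = pd.DataFrame({"state": ["A", "B", "A", "A"]})
    ns = {}
    exec(MARKOV_CHAIN_SIMULATOR, ns)
    out = ns["run"](seq, state_col="state")
    assert out["transition_probabilities"]["A"] == {"B": 0.5, "A": 0.5}
    return out["transition_probabilities"]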

SILHOUETTE_SCORE_NODE = '''import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score
from typing import Annotated

def run(data: pd.DataFrame, labels_col: Annotated[str, "column"]) -> dict:
    if labels_col not in data.columns:
        return {"error": "Labels column not found."}

    numeric_df = data.select_dtypes(include=[np.number]).drop(columns=[labels_col], errors='ignore').dropna()
    labels = data[labels_col].loc[numeric_df.index]

    unique_labels = len(set(labels))
    if unique_labels < 2 or unique_labels >= len(numeric_df):
        return {"error": "Silhouette score requires 2 <= n_clusters <= n_samples - 1"}

    if len(numeric_df) > 5000:
        idx = np.random.choice(numeric_df.index, 5000, replace=False)
        numeric_df = numeric_df.loc[idx]
        labels = labels.loc[idx]

    score = float(silhouette_score(numeric_df, labels))

    return {
        "silhouette_score": score,
        "summary": f"Clusters are {'well-separated' if score > 0.5 else 'overlapping'}. Score: {score:.3f}"
    }
'''
2205
+
2206
+ # ══════════════════════════════════════════════════════════════════════════════
2207
+ # NEW TEMPLATES — Advanced Preprocessing, Clustering, Specialised Models, XAI
2208
+ # ══════════════════════════════════════════════════════════════════════════════
2209
+
2210
+ # ── Advanced Preprocessing ────────────────────────────────────────────────────
2211
+
2212
+ PROPER_CAPITALIZATION_CLEANER = '''from typing import Annotated
2213
+
2214
+ def run(data, columns: str = "") -> dict:
2215
+ """Standardise text casing in string columns (title-case by default).
2216
+ Leave columns blank to process all object columns.
2217
+ """
2218
+ import pandas as pd
2219
+
2220
+ df = data.copy()
2221
+ cols = [c.strip() for c in columns.split(",") if c.strip()] if columns.strip() \
2222
+ else df.select_dtypes(include=["object", "category"]).columns.tolist()
2223
+ fixed = []
2224
+ for col in cols:
2225
+ if col in df.columns:
2226
+ # Only touch non-null values so NaN does not become the literal string "Nan"
+ mask = df[col].notna()
+ df.loc[mask, col] = df.loc[mask, col].astype(str).str.strip().str.title()
2227
+ fixed.append(col)
2228
+ return {"data": df, "fixed_columns": fixed}
2229
+ '''
2230
+
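A quick check of the masked title-casing above: nulls stay nulls instead of becoming the literal string "Nan" (toy frame, names illustrative):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"city": ["  new YORK ", "boston", np.nan]})
    mask = df["city"].notna()
    df.loc[mask, "city"] = df.loc[mask, "city"].astype(str).str.strip().str.title()
    print(df["city"].tolist())  # ['New York', 'Boston', nan]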
2231
+ Z_SCORE_OUTLIER_DETECTOR = '''
2232
+ def run(data, threshold: float = 3.0, method: str = "flag") -> dict:
2233
+ """Detect extreme outliers using Z-scores (|Z| > threshold).
2234
+
2235
+ method='flag' — adds a boolean *_outlier column for each numeric feature.
2236
+ method='drop' — removes rows where ANY feature exceeds the threshold.
2237
+ method='clip' — clips values to ±threshold standard deviations.
2238
+ """
2239
+ import pandas as pd
2240
+ import numpy as np
2241
+
2242
+ df = data.copy()
2243
+ num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
2244
+ rows_before = len(df)
2245
+
2246
+ if method == "flag":
2247
+ for col in num_cols:
2248
+ mean, std = df[col].mean(), df[col].std()
2249
+ if std > 0:
2250
+ df[f"{col}_outlier"] = (((df[col] - mean) / std).abs() > threshold)
2251
+ return {"data": df, "columns_flagged": len(num_cols)}
2252
+
2253
+ elif method == "drop":
2254
+ keep = pd.Series(True, index=df.index)
2255
+ for col in num_cols:
2256
+ mean, std = df[col].mean(), df[col].std()
2257
+ if std > 0:
2258
+ keep &= ((df[col] - mean) / std).abs() <= threshold
2259
+ df = df[keep]
2260
+ return {"data": df, "rows_removed": rows_before - len(df)}
2261
+
2262
+ else: # clip
2263
+ for col in num_cols:
2264
+ mean, std = df[col].mean(), df[col].std()
2265
+ if std > 0:
2266
+ df[col] = df[col].clip(mean - threshold * std, mean + threshold * std)
2267
+ return {"data": df, "columns_clipped": len(num_cols)}
2268
+ '''
2269
+
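One caveat worth knowing: in small samples a single extreme point inflates the std enough to hide itself from a 3-sigma rule (the masking effect), so a lower threshold or an IQR-based remover can behave better. A toy illustration:

    import pandas as pd

    s = pd.Series([1, 2, 3, 1000])
    z = (s - s.mean()) / s.std()
    print(z.round(2).tolist())  # the outlier's own z-score stays below 2 because it inflates the std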
2270
+ BOX_COX_TRANSFORMER = '''from typing import Annotated
2271
+
2272
+ def run(data, columns: str = "") -> dict:
2273
+ """Apply Box-Cox transformation to stabilise variance in positive numeric columns.
2274
+
2275
+ More powerful than log1p for regression targets — finds the optimal lambda
2276
+ per column. Skips columns with non-positive values (Box-Cox requires x > 0).
2277
+ Leave columns blank to transform all eligible numeric columns.
2278
+ """
2279
+ import pandas as pd
2280
+ import numpy as np
2281
+ from scipy.stats import boxcox
2282
+
2283
+ df = data.copy()
2284
+ num_cols = [c.strip() for c in columns.split(",") if c.strip()] if columns.strip() \
2285
+ else df.select_dtypes(include=[np.number]).columns.tolist()
2286
+
2287
+ transformed, skipped, lambdas = [], [], {}
2288
+ for col in num_cols:
2289
+ if col not in df.columns:
2290
+ continue
2291
+ if df[col].min() <= 0:
2292
+ skipped.append(col)
2293
+ continue
2294
+ # Transform only the non-null values in place so row alignment survives NaNs
+ vals = df[col].dropna()
+ vals_t, lam = boxcox(vals)
+ df.loc[vals.index, col] = vals_t
2295
+ lambdas[col] = round(float(lam), 4)
2296
+ transformed.append(col)
2297
+
2298
+ return {"data": df, "transformed": transformed, "skipped_non_positive": skipped, "lambdas": lambdas}
2299
+ '''
2300
+
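If the transformed column later needs to be mapped back to original units (e.g., for reporting predictions), scipy provides the inverse given the stored lambda; a minimal sketch using the same functions:

    import numpy as np
    from scipy.special import inv_boxcox
    from scipy.stats import boxcox

    x = np.array([1.0, 2.0, 5.0, 10.0])
    x_t, lam = boxcox(x)
    print(np.allclose(x, inv_boxcox(x_t, lam)))  # True: invertible given lambda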
2301
+ MEAN_TARGET_ENCODER = '''from typing import Annotated
2302
+
2303
+ def run(data,
2304
+ column: Annotated[str, "column"] = "category",
2305
+ target: Annotated[str, "column"] = "target",
2306
+ smoothing: int = 10) -> dict:
2307
+ """Replace a categorical column with the smoothed mean of the target per category.
2308
+
2309
+ Smoothing blends category mean with global mean to prevent overfitting on rare
2310
+ categories. The original column is dropped; a new *_mean_enc column is added.
2311
+ Ideal for high-cardinality columns in Kaggle-style tasks. Caution: the encoding
+ is fitted on the full frame; fit on the training fold alone to avoid target leakage.
2312
+ """
2313
+ import pandas as pd
2314
+ import numpy as np
2315
+
2316
+ df = data.copy()
2317
+ if column not in df.columns:
2318
+ raise ValueError(f"Column '{column}' not found.")
2319
+ if target not in df.columns:
2320
+ raise ValueError(f"Target '{target}' not found.")
2321
+
2322
+ global_mean = df[target].mean()
2323
+ stats = df.groupby(column)[target].agg(["mean", "count"])
2324
+ smoothed = (stats["count"] * stats["mean"] + smoothing * global_mean) / (stats["count"] + smoothing)
2325
+
2326
+ new_col = f"{column}_mean_enc"
2327
+ df[new_col] = df[column].map(smoothed).fillna(global_mean)
2328
+ df = df.drop(columns=[column])
2329
+ return {"data": df, "new_column": new_col, "unique_categories": int(len(stats))}
2330
+ '''
2331
+
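The smoothing formula is worth seeing with numbers: a category observed only a handful of times is pulled most of the way back to the global mean. A worked sketch with invented values:

    global_mean = 0.30
    cat_mean, cat_count, smoothing = 0.90, 2, 10

    smoothed = (cat_count * cat_mean + smoothing * global_mean) / (cat_count + smoothing)
    print(round(smoothed, 3))  # 0.4: two observations barely move the encoding off 0.30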
2332
+ COLOCATED_FEATURE_GENERATOR = '''from typing import Annotated
2333
+
2334
+ def run(data,
2335
+ lat_col: Annotated[str, "column"] = "latitude",
2336
+ lon_col: Annotated[str, "column"] = "longitude") -> dict:
2337
+ """Generate distance-based features from latitude/longitude coordinates.
2338
+
2339
+ Adds: distance_from_centroid, distance_from_origin, lat_lon_product, lat_lon_ratio.
2340
+ Useful for real estate, logistics, and location-based prediction tasks.
+ Distances are planar Euclidean approximations, not great-circle distances.
2341
+ """
2342
+ import pandas as pd
2343
+ import numpy as np
2344
+
2345
+ df = data.copy()
2346
+ for col in (lat_col, lon_col):
2347
+ if col not in df.columns:
2348
+ raise ValueError(f"Column '{col}' not found.")
2349
+
2350
+ lat, lon = df[lat_col].astype(float), df[lon_col].astype(float)
2351
+ cx, cy = lat.mean(), lon.mean()
2352
+
2353
+ df["distance_from_centroid"] = np.sqrt((lat - cx) ** 2 + (lon - cy) ** 2)
2354
+ df["distance_from_origin"] = np.sqrt(lat ** 2 + lon ** 2)
2355
+ df["lat_lon_product"] = lat * lon
2356
+ df["lat_lon_ratio"] = (lat / lon.replace(0, np.nan)).fillna(0)
2357
+
2358
+ return {"data": df, "features_added": ["distance_from_centroid", "distance_from_origin", "lat_lon_product", "lat_lon_ratio"]}
2359
+ '''
2360
+
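Since the distances above are planar approximations, a haversine feature is the usual upgrade when true geographic distance matters; a self-contained sketch (6371.0 km is the mean Earth radius):

    import numpy as np

    def haversine_km(lat1, lon1, lat2, lon2):
        # Great-circle distance between two points given in decimal degrees
        lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
        a = np.sin((lat2 - lat1) / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2
        return 2 * 6371.0 * np.arcsin(np.sqrt(a))

    print(round(haversine_km(40.7128, -74.0060, 51.5074, -0.1278)))  # ~5570 km, NYC to London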
2361
+ ZIP_CODE_GROUPER = '''from typing import Annotated
2362
+
2363
+ def run(data,
2364
+ zip_column: Annotated[str, "column"] = "zip_code",
2365
+ prefix_length: int = 3) -> dict:
2366
+ """Cluster postal codes into regional groups by truncating to the first N digits.
2367
+
2368
+ Reduces cardinality from thousands of unique zip codes down to hundreds of regions.
2369
+ Adds a new *_region column and drops the original.
2370
+ """
2371
+ import pandas as pd
2372
+
2373
+ df = data.copy()
2374
+ if zip_column not in df.columns:
2375
+ raise ValueError(f"Column '{zip_column}' not found.")
2376
+
2377
+ region_col = f"{zip_column}_region"
2378
+ df[region_col] = df[zip_column].astype(str).str.strip().str[:prefix_length].str.zfill(prefix_length)
2379
+ original_unique = int(df[zip_column].nunique())
2380
+ region_unique = int(df[region_col].nunique())
2381
+ df = df.drop(columns=[zip_column])
2382
+ return {"data": df, "original_unique": original_unique, "region_unique": region_unique, "new_column": region_col}
2383
+ '''
2384
+
2385
+ NULL_INDICATOR_CREATOR = '''
2386
+ def run(data, min_missing_rate: float = 0.0) -> dict:
2387
+ """Add boolean indicator columns for columns that contain missing values.
2388
+
2389
+ For each column with missing rate > min_missing_rate, adds a *_was_null
2390
+ binary column (1 = was missing, 0 = was present).
2391
+
2392
+ Sometimes the FACT that data is missing is itself a predictive signal
2393
+ (e.g., a blank 'income' field may indicate unemployment).
2394
+ """
2395
+ import pandas as pd
2396
+
2397
+ df = data.copy()
2398
+ missing_rates = df.isnull().mean()
2399
+ eligible = missing_rates[missing_rates > min_missing_rate].index.tolist()
2400
+
2401
+ added = []
2402
+ for col in eligible:
2403
+ indicator = f"{col}_was_null"
2404
+ df[indicator] = df[col].isnull().astype(int)
2405
+ added.append(indicator)
2406
+
2407
+ return {"data": df, "indicators_added": added, "count": len(added)}
2408
+ '''
2409
+
2410
+ # ── Unsupervised & Clustering ─────────────────────────────────────────────────
2411
+
2412
+ UMAP_DIMENSIONALITY_REDUCTION = '''
2413
+ def run(data, n_components: int = 2, n_neighbors: int = 15, min_dist: float = 0.1) -> dict:
2414
+ """Reduce dimensions to 2D (or n_components) using UMAP.
2415
+
2416
+ Falls back to t-SNE if umap-learn is not installed.
2417
+ Samples up to 10,000 rows for speed. Adds umap_1, umap_2 columns.
2418
+ """
2419
+ import numpy as np
2420
+ import pandas as pd
2421
+
2422
+ df = data.copy()
2423
+ num_df = df.select_dtypes(include=[np.number]).dropna()
2424
+ if num_df.empty:
2425
+ raise ValueError("No numeric columns found for UMAP.")
2426
+ if len(num_df) > 10_000:
2427
+ num_df = num_df.sample(10_000, random_state=42)
2428
+
2429
+ try:
2430
+ import umap
2431
+ reducer = umap.UMAP(n_components=n_components, n_neighbors=n_neighbors,
2432
+ min_dist=min_dist, random_state=42)
2433
+ embedding = reducer.fit_transform(num_df.values)
2434
+ method = "UMAP"
2435
+ except ImportError:
2436
+ from sklearn.manifold import TSNE
2437
+ reducer = TSNE(n_components=min(n_components, 3), perplexity=min(30, len(num_df) - 1),
2438
+ random_state=42, n_iter=500)
2439
+ embedding = reducer.fit_transform(num_df.values)
2440
+ method = "t-SNE (UMAP fallback — pip install umap-learn)"
2441
+
2442
+ out_df = num_df.copy()
2443
+ for i in range(embedding.shape[1]):
2444
+ out_df[f"umap_{i+1}"] = embedding[:, i]
2445
+
2446
+ return {
2447
+ "data": out_df,
2448
+ "method": method,
2449
+ "feature_target_scatter": {
2450
+ "x": embedding[:, 0].tolist(),
2451
+ "y": embedding[:, 1].tolist(),
2452
+ "feature_name": "umap_1",
2453
+ "target_name": "umap_2",
2454
+ },
2455
+ }
2456
+ '''
2457
+
2458
+ ELBOW_METHOD_DATA = '''
2459
+ def run(data, max_k: int = 10, sample_size: int = 5000) -> dict:
2460
+ """Compute K-Means inertia for K = 1 … max_k to produce an Elbow curve.
2461
+
2462
+ Run this BEFORE KMeans to pick the optimal number of clusters.
2463
+ Returns inertias and a recommended_k heuristic (largest second-derivative).
2464
+ """
2465
+ import numpy as np
2466
+ import pandas as pd
2467
+ from sklearn.cluster import KMeans
2468
+ from sklearn.preprocessing import StandardScaler
2469
+
2470
+ df = data.select_dtypes(include=[np.number]).dropna()
2471
+ if len(df) > sample_size:
2472
+ df = df.sample(sample_size, random_state=42)
2473
+
2474
+ scaler = StandardScaler()
2475
+ X = scaler.fit_transform(df)
2476
+
2477
+ inertias, ks = [], list(range(1, max_k + 1))
2478
+ for k in ks:
2479
+ km = KMeans(n_clusters=k, n_init=5, random_state=42)
2480
+ km.fit(X)
2481
+ inertias.append(float(km.inertia_))
2482
+
2483
+ # Second-derivative heuristic for recommended K
2484
+ recommended_k = 2
2485
+ if len(inertias) >= 3:
2486
+ diffs2 = [inertias[i-1] - 2*inertias[i] + inertias[i+1] for i in range(1, len(inertias)-1)]
2487
+ recommended_k = diffs2.index(max(diffs2)) + 2 # +2 because we start at k=1
2488
+
2489
+ return {"ks": ks, "inertias": inertias, "recommended_k": recommended_k, "summary": f"Elbow suggests K={recommended_k}"}
2490
+ '''
2491
+
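The recommended_k heuristic is a discrete second derivative over the inertia curve; a worked sketch on a made-up curve whose elbow sits at K=3:

    inertias = [1000.0, 600.0, 250.0, 230.0, 215.0]  # K = 1..5

    diffs2 = [inertias[i - 1] - 2 * inertias[i] + inertias[i + 1] for i in range(1, len(inertias) - 1)]
    print(diffs2.index(max(diffs2)) + 2)  # 3 (index 0 corresponds to K=2)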
2492
+ # ── Specialised Models ────────────────────────────────────────────────────────
2493
+
2494
+ BALANCED_RF_CLASSIFIER = '''from sklearn.ensemble import RandomForestClassifier
2495
+ from sklearn.model_selection import cross_val_score
2496
+
2497
+ def run(
2498
+ X_train, X_test, y_train,
2499
+ n_estimators: int = 100,
2500
+ max_depth: int = 0,
2501
+ random_state: int = 42,
2502
+ cross_validation: bool = False,
2503
+ cv_folds: int = 5,
2504
+ ) -> dict:
2505
+ """Random Forest with class_weight='balanced' — handles imbalanced classes automatically.
2506
+
2507
+ Unlike a standard RF, training samples are weighted inversely to their class
2508
+ frequency, so minority classes receive equal attention. No SMOTE required.
2509
+ """
2513
+
2514
+ md = max_depth if max_depth > 0 else None
2515
+ model = RandomForestClassifier(
2516
+ n_estimators=n_estimators,
2517
+ max_depth=md,
2518
+ class_weight="balanced",
2519
+ random_state=random_state,
2520
+ )
2521
+ if cross_validation:
2522
+ scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="f1_weighted")
2523
+ model.fit(X_train, y_train)
2524
+ y_pred = model.predict(X_test)
2525
+ return {"model": model, "y_pred": y_pred,
2526
+ "cv_mean": round(float(scores.mean()), 4),
2527
+ "cv_std": round(float(scores.std()), 4)}
2528
+ model.fit(X_train, y_train)
2529
+ y_pred = model.predict(X_test)
2530
+ return {"model": model, "y_pred": y_pred}
2531
+ '''
2532
+
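The 'balanced' option follows scikit-learn's n_samples / (n_classes * bincount(y)) rule; a quick sketch of what an imbalanced target produces:

    import numpy as np
    from sklearn.utils.class_weight import compute_class_weight

    y = np.array([0] * 90 + [1] * 10)
    print(compute_class_weight("balanced", classes=np.array([0, 1]), y=y))
    # [0.5555... 5.0]: minority-class errors cost roughly 9x more during fitting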
2533
+ RIDGE_CV_REGRESSOR = '''
2534
+ def run(X_train, X_test, y_train, cv_folds: int = 5) -> dict:
2535
+ """Ridge regression with automatic alpha selection via built-in cross-validation.
2536
+
2537
+ RidgeCV tests a range of regularisation strengths and picks the best one
2538
+ automatically — no manual hyperparameter tuning needed.
2539
+ """
2540
+ import numpy as np
2541
+ from sklearn.linear_model import RidgeCV
2542
+
2543
+ alphas = [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
2544
+ model = RidgeCV(alphas=alphas, cv=cv_folds)
2545
+ model.fit(X_train, y_train)
2546
+ y_pred = model.predict(X_test)
2547
+ return {
2548
+ "model": model,
2549
+ "y_pred": y_pred,
2550
+ "best_alpha": float(model.alpha_),
2551
+ "summary": f"RidgeCV selected alpha={model.alpha_:.4g}",
2552
+ }
2553
+ '''
2554
+
2555
+ POISSON_REGRESSOR = '''
2556
+ def run(
2557
+ X_train, X_test, y_train,
2558
+ max_iter: int = 300,
2559
+ alpha: float = 1.0,
2560
+ cross_validation: bool = False,
2561
+ cv_folds: int = 5,
2562
+ ) -> dict:
2563
+ """Poisson regression — purpose-built for non-negative integer count data.
2564
+
2565
+ Use when the target represents counts (insurance claims, website visits,
2566
+ defects per unit). Standard linear regression is incorrect for count targets.
2567
+ """
2568
+ import numpy as np
2569
+ from sklearn.linear_model import PoissonRegressor
2570
+ from sklearn.model_selection import cross_val_score
2571
+
2572
+ model = PoissonRegressor(alpha=alpha, max_iter=max_iter)
2573
+ if cross_validation:
2574
+ scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="r2")
2575
+ model.fit(X_train, y_train)
2576
+ y_pred = model.predict(X_test)
2577
+ return {"model": model, "y_pred": y_pred,
2578
+ "cv_mean": round(float(scores.mean()), 4),
2579
+ "cv_std": round(float(scores.std()), 4)}
2580
+ model.fit(X_train, y_train)
2581
+ y_pred = model.predict(X_test)
2582
+ return {"model": model, "y_pred": y_pred}
2583
+ '''
2584
+
2585
+ # ── Deep Evaluation & XAI ────────────────────────────────────────────────────
2586
+
2587
+ SHAP_EXPLAINER = '''
2588
+ def run(model, X_train, X_test) -> dict:
2589
+ """Compute SHAP (Shapley Additive Explanations) values for model interpretability.
2590
+
2591
+ Returns per-feature mean |SHAP| importances and the top feature.
2592
+ Falls back to permutation importance if shap is not installed.
2593
+ Samples up to 200 rows from X_test for speed.
2594
+ """
2595
+ import numpy as np
2596
+ import pandas as pd
2597
+
2598
+ cols = list(X_train.columns) if hasattr(X_train, "columns") else [f"f{i}" for i in range(X_train.shape[1])]
2599
+ X_sample = X_test[:200] if len(X_test) > 200 else X_test
2600
+
2601
+ try:
2602
+ import shap
2603
+ # TreeExplainer is fast for tree-based models; falls back to KernelExplainer otherwise
2604
+ try:
2605
+ explainer = shap.TreeExplainer(model)
2606
+ shap_values = explainer.shap_values(X_sample)
2607
+ except Exception:
2608
+ explainer = shap.KernelExplainer(model.predict, shap.sample(X_sample, 50))
2609
+ shap_values = explainer.shap_values(X_sample, nsamples=100)
2610
+
2611
+ # For multiclass, shap_values is a list — take mean over classes
2612
+ if isinstance(shap_values, list):
2613
+ shap_arr = np.mean([np.abs(sv) for sv in shap_values], axis=0)
2614
+ else:
2615
+ shap_arr = np.abs(shap_values)
2616
+
2617
+ mean_shap = shap_arr.mean(axis=0)
2618
+ pairs = sorted(zip(mean_shap, cols), reverse=True)
2619
+ return {
2620
+ "feature_importances": {c: round(float(v), 6) for v, c in pairs},
2621
+ "top_feature": pairs[0][1],
2622
+ "top_shap_value": round(float(pairs[0][0]), 6),
2623
+ "method": "SHAP",
2624
+ }
2625
+
2626
+ except ImportError:
2627
+ # Fallback: permutation importance scored against the model's own predictions (this node receives no labels)
2628
+ from sklearn.inspection import permutation_importance
2629
+ r = permutation_importance(model, X_sample, model.predict(X_sample), n_repeats=5, random_state=42)
2630
+ pairs = sorted(zip(r.importances_mean, cols), reverse=True)
2631
+ return {
2632
+ "feature_importances": {c: round(float(v), 6) for v, c in pairs},
2633
+ "top_feature": pairs[0][1],
2634
+ "top_shap_value": round(float(pairs[0][0]), 6),
2635
+ "method": "Permutation Importance (pip install shap for SHAP values)",
2636
+ }
2637
+ '''
2638
+
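When labels are available, the standard form of the fallback scores permutations against y_test instead of the model's own predictions; a self-contained sketch:

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.inspection import permutation_importance
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=500, n_features=5, random_state=42)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=42)
    model = RandomForestClassifier(random_state=42).fit(X_tr, y_tr)

    r = permutation_importance(model, X_te, y_te, n_repeats=5, random_state=42)
    print(r.importances_mean.round(3))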
2639
+ PARTIAL_DEPENDENCE_PLOTS = '''from typing import Annotated
2640
+
2641
+ def run(model, X_train, feature: Annotated[str, "column"] = "feature_name") -> dict:
2642
+ """Compute Partial Dependence Plot data: the marginal effect of one feature on predictions.
2643
+
2644
+ Shows HOW the model's output changes as a single feature varies, holding all others constant.
2645
+ Crucial for understanding the direction and shape of a feature's influence.
2646
+ """
2647
+ import numpy as np
2648
+ import pandas as pd
2649
+ from sklearn.inspection import partial_dependence
2650
+
2651
+ if feature not in X_train.columns:
2652
+ raise ValueError(f"Feature '{feature}' not in X_train columns: {list(X_train.columns)}")
2653
+
2654
+ sample = X_train.sample(min(1000, len(X_train)), random_state=42) if len(X_train) > 1000 else X_train
2655
+
2656
+ try:
2657
+ result = partial_dependence(model, sample, features=[feature], kind="average")
2658
+ grid = result["grid_values"][0].tolist()
2659
+ response = result["average"][0].tolist()
2660
+ except Exception as e:
2661
+ raise ValueError(f"Partial dependence failed: {e}")
2662
+
2663
+ return {
2664
+ "partial_dependence": {
2665
+ "feature": feature,
2666
+ "grid": grid,
2667
+ "average_response": response,
2668
+ },
2669
+ "feature_name": feature,
2670
+ }
2671
+ '''
2672
+
2673
+ CALIBRATION_CURVE_DATA = '''
2674
+ def run(model, X_test, y_test, n_bins: int = 10) -> dict:
2675
+ """Check if a classifier's predicted probabilities are well-calibrated.
2676
+
2677
+ A perfectly calibrated model has its 90th-percentile predictions correct 90% of the time.
2678
+ Returns fraction_of_positives vs mean_predicted_probability for plotting.
2679
+ """
2680
+ import numpy as np
2681
+ from sklearn.calibration import calibration_curve
2682
+
2683
+ if not hasattr(model, "predict_proba"):
2684
+ raise ValueError("Model must support predict_proba for calibration analysis.")
2685
+
2686
+ y_prob = model.predict_proba(X_test)
2687
+ # Binary: use positive class; multiclass: use max probability
2688
+ if y_prob.shape[1] == 2:
2689
+ prob_pos = y_prob[:, 1]
2690
+ # Map labels to {0, 1} via the positive class so non-numeric labels also work
+ y_bin = (np.asarray(y_test) == model.classes_[1]).astype(int)
2691
+ else:
2692
+ prob_pos = y_prob.max(axis=1)
2693
+ y_bin = (np.asarray(y_test) == model.classes_[y_prob.argmax(axis=1)]).astype(int)
2694
+
2695
+ frac_pos, mean_pred = calibration_curve(y_bin, prob_pos, n_bins=n_bins, strategy="uniform")
2696
+
2697
+ brier = float(np.mean((prob_pos - y_bin) ** 2))
2698
+ return {
2699
+ "fraction_of_positives": frac_pos.tolist(),
2700
+ "mean_predicted_value": mean_pred.tolist(),
2701
+ "brier_score": round(brier, 4),
2702
+ "summary": f"Brier score: {brier:.4f} (lower = better calibrated)",
2703
+ }
2704
+ '''
2705
+
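When the curve reveals miscalibration, scikit-learn's CalibratedClassifierCV is the usual remedy; a minimal sketch with isotonic regression (the sigmoid method is the lighter-weight option):

    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.datasets import make_classification
    from sklearn.naive_bayes import GaussianNB

    X, y = make_classification(n_samples=1000, random_state=42)
    # Wraps the base estimator and refits a probability mapping via cross-validation
    calibrated = CalibratedClassifierCV(GaussianNB(), method="isotonic", cv=5).fit(X, y)
    print(calibrated.predict_proba(X[:3]).round(3))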
2706
+ LEARNING_CURVE_ANALYZER = '''
2707
+ def run(model, X_train, y_train, cv_folds: int = 5, scoring: str = "accuracy") -> dict:
2708
+ """Plot training vs validation score across increasing dataset sizes.
2709
+
2710
+ Diagnoses model behaviour:
2711
+ - High train score, low val score → overfitting (try regularisation)
2712
+ - Both scores low → underfitting (try more complex model or features)
2713
+ - Scores converge at high sample count → more data won't help
2714
+ """
2715
+ import numpy as np
2716
+ from sklearn.model_selection import learning_curve
2717
+
2718
+ # Use 5 evenly spaced training sizes from 10% to 100%
2719
+ train_sizes_abs, train_scores, val_scores = learning_curve(
2720
+ model, X_train, y_train,
2721
+ cv=cv_folds,
2722
+ scoring=scoring,
2723
+ train_sizes=np.linspace(0.1, 1.0, 5),
2724
+ n_jobs=-1,
2725
+ )
2726
+
2727
+ return {
2728
+ "train_sizes": train_sizes_abs.tolist(),
2729
+ "train_scores": train_scores.mean(axis=1).round(4).tolist(),
2730
+ "val_scores": val_scores.mean(axis=1).round(4).tolist(),
2731
+ "train_std": train_scores.std(axis=1).round(4).tolist(),
2732
+ "val_std": val_scores.std(axis=1).round(4).tolist(),
2733
+ "scoring": scoring,
2734
+ "summary": f"Final val {scoring}: {val_scores.mean(axis=1)[-1]:.4f}",
2735
+ }
2736
+ '''
2737
+
2738
+ COST_BENEFIT_MATRIX = '''
2739
+ def run(
2740
+ y_test, y_pred,
2741
+ tp_value: float = 100.0,
2742
+ fp_cost: float = 10.0,
2743
+ fn_cost: float = 50.0,
2744
+ tn_value: float = 0.0,
2745
+ ) -> dict:
2746
+ """Compute business value of model predictions using a cost-benefit matrix.
2747
+
2748
+ Assign dollar values to each outcome type (TP, FP, FN, TN) to translate
2749
+ model accuracy into real business impact. Essential for fraud detection,
2750
+ churn prevention, and medical diagnostics where error costs differ.
2751
+ """
2752
+ import numpy as np
2753
+ from sklearn.metrics import confusion_matrix
2754
+
2755
+ y_t, y_p = np.asarray(y_test), np.asarray(y_pred)
2756
+ if len(np.unique(y_t)) != 2:
2757
+ raise ValueError("Cost-benefit matrix requires binary classification.")
2758
+
2759
+ cm = confusion_matrix(y_t, y_p)
2760
+ tn, fp, fn, tp = cm.ravel()
2761
+
2762
+ total_value = tp * tp_value - fp * fp_cost - fn * fn_cost + tn * tn_value
2763
+ # Baseline strategy: predict positive for every case (each actual positive becomes a TP, each negative an FP)
+ random_value = (len(y_t) * y_t.mean()) * tp_value - (len(y_t) * (1 - y_t.mean())) * fp_cost
2764
+ value_per_case = total_value / len(y_t)
2765
+
2766
+ return {
2767
+ "confusion_matrix": {"TP": int(tp), "FP": int(fp), "FN": int(fn), "TN": int(tn)},
2768
+ "total_business_value": round(total_value, 2),
2769
+ "random_baseline_value": round(random_value, 2),
2770
+ "value_per_case": round(value_per_case, 2),
2771
+ "roi_vs_random": round((total_value - random_value) / max(abs(random_value), 1) * 100, 1),
2772
+ "summary": f"Model generates ${total_value:,.0f} total vs ${random_value:,.0f} random baseline.",
2773
+ }
2774
+ '''
2775
+
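The value arithmetic in miniature, using the default prices and invented counts:

    tp, fp, fn, tn = 40, 12, 8, 140
    tp_value, fp_cost, fn_cost, tn_value = 100.0, 10.0, 50.0, 0.0

    total = tp * tp_value - fp * fp_cost - fn * fn_cost + tn * tn_value
    print(total)  # 4000 - 120 - 400 + 0 = 3480.0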
2776
+
2777
+ TEMPLATES: list[dict] = [
2778
+ # Data
2779
+ {"id": "csv_loader", "label": "CSV Loader", "category": "Data", "code": CSV_LOADER},
2780
+ {"id": "csv_exporter", "label": "CSV Exporter", "category": "Data", "code": CSV_EXPORTER},
2781
+ {"id": "eda", "label": "EDA", "category": "Data", "code": EDA},
2782
+ {"id": "eda_histogram", "label": "Histogram", "category": "Visualize", "code": EDA_HISTOGRAM},
2783
+ {"id": "eda_correlation", "label": "Correlation Matrix", "category": "Visualize", "code": EDA_CORRELATION},
2784
+ {"id": "eda_value_counts", "label": "Value Counts", "category": "Visualize", "code": EDA_VALUE_COUNTS},
2785
+ {"id": "eda_box_plot", "label": "Box Plot", "category": "Visualize", "code": EDA_BOX_PLOT},
2786
+ {"id": "predict", "label": "Predict", "category": "Visualize", "code": PREDICT},
2787
+ # Preprocessing
2788
+ {"id": "data_cleaning", "label": "Data Cleaning", "category": "Preprocessing", "code": DATA_CLEANING},
2789
+ {"id": "smart_outlier_remover", "label": "Smart Outlier Remover", "category": "Preprocessing", "code": SMART_OUTLIER_REMOVER},
2790
+ {"id": "advanced_imputer", "label": "Advanced Imputer", "category": "Preprocessing", "code": ADVANCED_IMPUTER},
2791
+ {"id": "skewness_fixer", "label": "Skewness Fixer", "category": "Preprocessing", "code": SKEWNESS_FIXER},
2792
+ {"id": "high_cardinality_encoder", "label": "High-Cardinality Encoder","category": "Preprocessing", "code": HIGH_CARDINALITY_ENCODER},
2793
+ {"id": "feature_scaler_robust", "label": "Robust Scaler", "category": "Preprocessing", "code": FEATURE_SCALER_ROBUST},
2794
+ {"id": "multicollinearity_filter", "label": "Multicollinearity Filter","category": "Preprocessing", "code": MULTICOLLINEARITY_FILTER},
2795
+ {"id": "text_cleaner_basic", "label": "Text Cleaner", "category": "Preprocessing", "code": TEXT_CLEANER_BASIC},
2796
+ {"id": "smote_sampler", "label": "SMOTE Sampler", "category": "Preprocessing", "code": SMOTE_SAMPLER},
2797
+ {"id": "label_encoder", "label": "Label Encoder", "category": "Preprocessing", "code": LABEL_ENCODER},
2798
+ {"id": "one_hot_encoder", "label": "One-Hot Encoder", "category": "Preprocessing", "code": ONE_HOT_ENCODER},
2799
+ {"id": "train_test_split", "label": "Train/Test Split", "category": "Preprocessing", "code": TRAIN_TEST_SPLIT},
2800
+ {"id": "standard_scaler", "label": "Standard Scaler", "category": "Preprocessing", "code": STANDARD_SCALER},
2801
+ {"id": "min_max_scaler", "label": "Min-Max Scaler", "category": "Preprocessing", "code": MIN_MAX_SCALER},
2802
+ {"id": "pca", "label": "PCA", "category": "Preprocessing", "code": PCA},
2803
+ {"id": "polynomial_features", "label": "Polynomial Features", "category": "Preprocessing", "code": POLYNOMIAL_FEATURES},
2804
+ {"id": "datetime_extractor", "label": "Datetime Extractor", "category": "Preprocessing", "code": DATETIME_EXTRACTOR},
2805
+ {"id": "binary_encoder", "label": "Binary Encoder", "category": "Preprocessing", "code": BINARY_ENCODER},
2806
+ {"id": "frequency_encoder", "label": "Frequency Encoder", "category": "Preprocessing", "code": FREQUENCY_ENCODER},
2807
+ {"id": "ordinal_encoder", "label": "Ordinal Encoder", "category": "Preprocessing", "code": ORDINAL_ENCODER},
2808
+ {"id": "vif_feature_selection", "label": "VIF Feature Selection", "category": "Preprocessing", "code": VIF_FEATURE_SELECTION},
2809
+ {"id": "pca_whitening", "label": "PCA Whitening", "category": "Preprocessing", "code": PCA_WHITENING},
2810
+ {"id": "k_means_clustering_features", "label": "K-Means Features", "category": "Preprocessing", "code": K_MEANS_CLUSTERING_FEATURES},
2811
+ {"id": "lag_feature_generator", "label": "Lag Features", "category": "Preprocessing", "code": LAG_FEATURE_GENERATOR},
2812
+ {"id": "rolling_window_stats", "label": "Rolling Window Stats", "category": "Preprocessing", "code": ROLLING_WINDOW_STATS},
2813
+ {"id": "tomek_links", "label": "Tomek Links", "category": "Preprocessing", "code": TOMEK_LINKS},
2814
+ {"id": "random_under_sampler", "label": "Random Under Sampler", "category": "Preprocessing", "code": RANDOM_UNDER_SAMPLER},
2815
+ {"id": "rfe_feature_selector", "label": "RFE Feature Selector", "category": "Preprocessing", "code": RFE_FEATURE_SELECTOR},
2816
+ # Classifiers
2817
+ {"id": "logistic_regression", "label": "Logistic Regression", "category": "Classifiers", "code": LOGISTIC_REGRESSION},
2818
+ {"id": "random_forest_classifier", "label": "RF Classifier", "category": "Classifiers", "code": RANDOM_FOREST_CLASSIFIER},
2819
+ {"id": "gradient_boosting_classifier","label": "GB Classifier", "category": "Classifiers", "code": GRADIENT_BOOSTING_CLASSIFIER},
2820
+ {"id": "decision_tree_classifier", "label": "DT Classifier", "category": "Classifiers", "code": DECISION_TREE_CLASSIFIER},
2821
+ {"id": "svm_classifier", "label": "SVM Classifier", "category": "Classifiers", "code": SVM_CLASSIFIER},
2822
+ {"id": "knn_classifier", "label": "KNN Classifier", "category": "Classifiers", "code": KNN_CLASSIFIER},
2823
+ {"id": "xgboost_node", "label": "XGBoost", "category": "Classifiers", "code": XGBOOST_NODE},
2824
+ {"id": "lightgbm_node", "label": "LightGBM", "category": "Classifiers", "code": LIGHTGBM_NODE},
2825
+ {"id": "adaboost_node", "label": "AdaBoost", "category": "Classifiers", "code": ADABOOST_NODE},
2826
+ {"id": "voting_ensemble", "label": "Voting Ensemble", "category": "Classifiers", "code": VOTING_ENSEMBLE},
2827
+ # Regressors
2828
+ {"id": "linear_regression", "label": "Linear Regression", "category": "Regression", "code": LINEAR_REGRESSION},
2829
+ {"id": "random_forest_regressor", "label": "RF Regressor", "category": "Regression", "code": RANDOM_FOREST_REGRESSOR},
2830
+ {"id": "gradient_boosting_regressor", "label": "GB Regressor", "category": "Regression", "code": GRADIENT_BOOSTING_REGRESSOR},
2831
+ {"id": "decision_tree_regressor", "label": "DT Regressor", "category": "Regression", "code": DECISION_TREE_REGRESSOR},
2832
+ {"id": "svm_regressor", "label": "SVM Regressor", "category": "Regression", "code": SVM_REGRESSOR},
2833
+ {"id": "knn_regressor", "label": "KNN Regressor", "category": "Regression", "code": KNN_REGRESSOR},
2834
+ # Evaluation
2835
+ {"id": "accuracy", "label": "Accuracy", "category": "Evaluation", "code": ACCURACY},
2836
+ {"id": "classification_report", "label": "Classification Report", "category": "Evaluation", "code": CLASSIFICATION_REPORT},
2837
+ {"id": "validation_report", "label": "Validation Report", "category": "Evaluation", "code": VALIDATION_REPORT},
2838
+ {"id": "regression_metrics", "label": "Regression Metrics", "category": "Evaluation", "code": REGRESSION_METRICS},
2839
+ {"id": "feature_importance", "label": "Feature Importance", "category": "Evaluation", "code": FEATURE_IMPORTANCE},
2840
+ {"id": "confusion_matrix_plotter", "label": "Confusion Matrix", "category": "Visualize", "code": CONFUSION_MATRIX_PLOTTER},
2841
+ {"id": "roc_pr_curve_data", "label": "ROC & PR Curves", "category": "Visualize", "code": ROC_PR_CURVE_DATA},
2842
+ {"id": "residual_plotter", "label": "Residual Plot", "category": "Visualize", "code": RESIDUAL_PLOTTER},
2843
+ {"id": "feature_importance_visualizer","label": "Feature Importance Plot","category": "Visualize", "code": FEATURE_IMPORTANCE_VISUALIZER},
2844
+ {"id": "decision_boundary_2d", "label": "Decision Boundary 2D", "category": "Visualize", "code": DECISION_BOUNDARY_2D},
2845
+ {"id": "prediction_vs_actual_scatter","label": "Pred vs Actual Scatter", "category": "Visualize", "code": PREDICTION_VS_ACTUAL_SCATTER},
2846
+ {"id": "inverse_target_transformer", "label": "Inverse Transformer", "category": "Evaluation", "code": INVERSE_TARGET_TRANSFORMER},
2847
+ {"id": "threshold_optimizer", "label": "Threshold Optimizer", "category": "Evaluation", "code": THRESHOLD_OPTIMIZER},
2848
+ {"id": "permutation_importance", "label": "Permutation Importance", "category": "Evaluation", "code": PERMUTATION_IMPORTANCE},
2849
+ {"id": "learning_curve_data", "label": "Learning Curve", "category": "Evaluation", "code": LEARNING_CURVE_DATA},
2850
+ {"id": "lift_gain_charts", "label": "Lift & Gain Charts", "category": "Evaluation", "code": LIFT_GAIN_CHARTS},
2851
+ {"id": "auto_ml", "label": "AutoML", "category": "Evaluation", "code": AUTO_ML},
2852
+ {"id": "correlation_heatmap", "label": "Correlation Heatmap", "category": "Visualize", "code": CORRELATION_HEATMAP},
2853
+ {"id": "missing_value_map", "label": "Missing Value Map", "category": "Visualize", "code": MISSING_VALUE_MAP},
2854
+ {"id": "class_balance_visualizer", "label": "Class Balance", "category": "Visualize", "code": CLASS_BALANCE_VISUALIZER},
2855
+ {"id": "feature_target_scatter", "label": "Feature/Target Scatter", "category": "Visualize", "code": FEATURE_TARGET_SCATTER},
2856
+ {"id": "model_error_histogram", "label": "Model Error Histogram", "category": "Visualize", "code": MODEL_ERROR_HISTOGRAM},
2857
+ {"id": "partial_dependence_data", "label": "Partial Dependence", "category": "Visualize", "code": PARTIAL_DEPENDENCE_DATA},
2858
+ {"id": "multiclass_roc_data", "label": "Multiclass ROC", "category": "Visualize", "code": MULTICLASS_ROC_DATA},
2859
+
2860
+ # Unsupervised
2861
+ {"id": "dbscan_clustering", "label": "DBSCAN Clustering", "category": "Unsupervised", "code": DBSCAN_CLUSTERING},
2862
+ {"id": "tsne_visualizer", "label": "t-SNE Visualizer", "category": "Unsupervised", "code": TSNE_VISUALIZER},
2863
+ {"id": "isolation_forest_anomaly", "label": "Isolation Forest", "category": "Unsupervised", "code": ISOLATION_FOREST_ANOMALY},
2864
+
2865
+ # Ensembles
2866
+ {"id": "stacking_regressor", "label": "Stacking Regressor", "category": "Regression", "code": STACKING_REGRESSOR},
2867
+ {"id": "quantile_regressor", "label": "Quantile Regressor", "category": "Regression", "code": QUANTILE_REGRESSOR},
2868
+
2869
+ # Agentic / Reinforcement
2870
+ {"id": "epsilon_greedy_bandit", "label": "Epsilon-Greedy Bandit", "category": "Agentic", "code": EPSILON_GREEDY_BANDIT},
2871
+ {"id": "markov_chain_simulator", "label": "Markov Chain Simulator", "category": "Agentic", "code": MARKOV_CHAIN_SIMULATOR},
2872
+
2873
+ # Evaluation
2874
+ {"id": "silhouette_score_node", "label": "Silhouette Score", "category": "Evaluation", "code": SILHOUETTE_SCORE_NODE},
2875
+
2876
+ # ── Advanced Preprocessing ───────────────────────────────────────────────
2877
+ {"id": "proper_capitalization_cleaner", "label": "Capitalisation Cleaner", "category": "Preprocessing", "code": PROPER_CAPITALIZATION_CLEANER},
2878
+ {"id": "z_score_outlier_detector", "label": "Z-Score Outlier Detector", "category": "Preprocessing", "code": Z_SCORE_OUTLIER_DETECTOR},
2879
+ {"id": "box_cox_transformer", "label": "Box-Cox Transformer", "category": "Preprocessing", "code": BOX_COX_TRANSFORMER},
2880
+ {"id": "mean_target_encoder", "label": "Mean Target Encoder", "category": "Preprocessing", "code": MEAN_TARGET_ENCODER},
2881
+ {"id": "colocated_feature_generator", "label": "Geo Feature Generator", "category": "Preprocessing", "code": COLOCATED_FEATURE_GENERATOR},
2882
+ {"id": "zip_code_grouper", "label": "Zip Code Grouper", "category": "Preprocessing", "code": ZIP_CODE_GROUPER},
2883
+ {"id": "null_indicator_creator", "label": "Null Indicator Creator", "category": "Preprocessing", "code": NULL_INDICATOR_CREATOR},
2884
+
2885
+ # ── Unsupervised & Clustering ────────────────────────────────────────────
2886
+ {"id": "umap_dimensionality_reduction", "label": "UMAP Reducer", "category": "Unsupervised", "code": UMAP_DIMENSIONALITY_REDUCTION},
2887
+ {"id": "elbow_method_data", "label": "Elbow Method", "category": "Unsupervised", "code": ELBOW_METHOD_DATA},
2888
+
2889
+ # ── Specialised Models ───────────────────────────────────────────────────
2890
+ {"id": "balanced_rf_classifier", "label": "Balanced RF Classifier", "category": "Classifiers", "code": BALANCED_RF_CLASSIFIER},
2891
+ {"id": "ridge_cv_regressor", "label": "RidgeCV Regressor", "category": "Regression", "code": RIDGE_CV_REGRESSOR},
2892
+ {"id": "poisson_regressor", "label": "Poisson Regressor", "category": "Regression", "code": POISSON_REGRESSOR},
2893
+
2894
+ # ── Deep Evaluation & XAI ────────────────────────────────────────────────
2895
+ {"id": "shap_explainer", "label": "SHAP Explainer", "category": "Evaluation", "code": SHAP_EXPLAINER},
2896
+ {"id": "partial_dependence_plots", "label": "Partial Dependence Plots","category": "Evaluation", "code": PARTIAL_DEPENDENCE_PLOTS},
2897
+ {"id": "calibration_curve_data", "label": "Calibration Curve", "category": "Evaluation", "code": CALIBRATION_CURVE_DATA},
2898
+ {"id": "learning_curve_analyzer", "label": "Learning Curve Analyzer", "category": "Evaluation", "code": LEARNING_CURVE_ANALYZER},
2899
+ {"id": "cost_benefit_matrix", "label": "Cost-Benefit Matrix", "category": "Evaluation", "code": COST_BENEFIT_MATRIX},
2900
+ ]
2901
+
2902
+
2903
+ def get_template(template_id: str) -> dict | None:
2904
+ for t in TEMPLATES:
2905
+ if t["id"] == template_id:
2906
+ return t
2907
+ return None
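A sketch of how a registry entry might be turned into a callable, assuming the engine executes template code with exec and pulls the run function out of the namespace; the handling here is illustrative:

    template = get_template("csv_loader")
    assert template is not None

    namespace: dict = {}
    exec(template["code"], namespace)  # defines run(...) inside the namespace
    run = namespace["run"]
    # run(file_path="data.csv") would return {"data": <DataFrame>} given a real file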