m8flow 1.0.2 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bundled/backend/Dockerfile +41 -0
- package/bundled/backend/add_nodes.py +416 -0
- package/bundled/backend/api/routes/appstate.py +102 -0
- package/bundled/backend/api/routes/flows.py +64 -5
- package/bundled/backend/api/routes/nodes.py +25 -1
- package/bundled/backend/core/code_validator.py +2 -0
- package/bundled/backend/core/executor.py +19 -3
- package/bundled/backend/main.py +16 -4
- package/bundled/backend/requirements.txt +27 -6
- package/bundled/backend/services/llm_service.py +984 -108
- package/bundled/backend/services/self_healer.py +1 -1
- package/bundled/backend/temp.json +0 -0
- package/bundled/backend/templates.json +0 -0
- package/bundled/backend/templates.py +2907 -745
- package/bundled/backend/warmup.py +65 -0
- package/bundled/frontend-dist/assets/index-CKUZ27n8.css +1 -0
- package/bundled/frontend-dist/assets/index-DNaB6zf0.js +46 -0
- package/bundled/frontend-dist/index.html +2 -2
- package/lib/backend.js +184 -35
- package/lib/ports.js +42 -0
- package/lib/run.js +42 -15
- package/lib/setup.js +143 -59
- package/package.json +5 -4
- package/scripts/check-docker.js +35 -0
- package/bundled/frontend-dist/assets/index-BAQ3lKsy.css +0 -1
- package/bundled/frontend-dist/assets/index-CZCCzeUC.js +0 -41
@@ -1,745 +1,2907 @@
(745 lines removed: the 1.0.2 templates.py held a much smaller set of the same template strings. The capture preserved only stray fragments of the old file: the module docstring, the `# ── Data ──` header, the `CSV_LOADER = '''import pandas as pd` opening, and scattered `def run(`, `from sklearn.`, and closing `'''` lines.)
"""Pre-built node templates. Each template is just Python source defining `def run(...)`."""

# ── Data ──────────────────────────────────────────────────────────────────────

CSV_LOADER = '''import pandas as pd
import os
from typing import Annotated

def run(file_path: Annotated[str, "file"] = "data.csv") -> dict:
    """Loads a CSV file. Looks in the storage directory if a relative path is provided."""
    # Resolve path
    if not os.path.isabs(file_path):
        storage_dir = os.environ.get("M8FLOW_UPLOAD_DIR") or os.path.abspath("uploads")
        potential_path = os.path.join(storage_dir, file_path)
        if os.path.exists(potential_path):
            file_path = potential_path

    df = pd.read_csv(file_path)
    return {"data": df}
'''
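Each template above is plain source that the backend compiles at run time. As a minimal sketch of how such a string could be executed (hypothetical helper; the package's real logic lives in backend/core/executor.py and is not shown in this diff):

import types

def exec_template(source: str, **inputs) -> dict:
    # Hypothetical illustration: compile a template and call its run().
    mod = types.ModuleType("node")
    exec(source, mod.__dict__)  # the template defines run(...) in a fresh namespace
    return mod.run(**inputs)    # every template returns a dict of named outputs

# e.g. exec_template(CSV_LOADER, file_path="data.csv")["data"] -> loaded DataFrame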

CSV_EXPORTER = '''import pandas as pd
import numpy as np
import os
from typing import Annotated

def run(
    data: Annotated[pd.DataFrame, "dataframe"] = None,
    y_pred: Annotated[np.ndarray, "array"] = None,
    y_test: Annotated[np.ndarray, "array"] = None,
    file_path: Annotated[str, "file"] = "output.csv"
) -> dict:
    """Exports data to a CSV file. Supports DataFrames or Model predictions."""
    # 1. Resolve data
    if data is None:
        if y_pred is not None and y_test is not None:
            data = pd.DataFrame({"y_test": np.asarray(y_test).ravel(), "y_pred": np.asarray(y_pred).ravel()})
        elif y_pred is not None:
            data = pd.DataFrame({"predictions": np.asarray(y_pred).ravel()})
        else:
            return {"status": "error", "error": "No data or predictions provided to export"}

    if not isinstance(data, pd.DataFrame):
        try:
            data = pd.DataFrame(data)
        except Exception as e:
            return {"status": "error", "error": f"Failed to convert input to DataFrame: {e}"}

    # 2. Resolve output path
    # If it's just a filename, try to save it in the storage directory if possible
    # We look for M8FLOW_UPLOAD_DIR or default 'uploads'
    storage_dir = os.environ.get("M8FLOW_UPLOAD_DIR") or os.path.abspath("uploads")
    if not os.path.isabs(file_path):
        os.makedirs(storage_dir, exist_ok=True)
        full_path = os.path.join(storage_dir, file_path)
    else:
        full_path = file_path

    data.to_csv(full_path, index=False)
    return {"status": "saved", "path": full_path, "rows": len(data)}
'''

# ── EDA ───────────────────────────────────────────────────────────────────────

EDA = '''import pandas as pd
import numpy as np

def run(data) -> dict:
    """Exploratory Data Analysis. Always passes 'data' through so it can be chained."""
    df = data.copy() if hasattr(data, "copy") else pd.DataFrame(data)

    shape = list(df.shape)
    dtypes = {col: str(dt) for col, dt in df.dtypes.items()}

    # Missing values
    miss = df.isnull().sum()
    missing = {
        col: {"count": int(miss[col]), "pct": round(float(miss[col] / len(df) * 100), 2)}
        for col in df.columns if miss[col] > 0
    }

    # Numeric summary
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    numeric_summary = {}
    for col in num_cols:
        s = df[col]
        numeric_summary[col] = {
            "mean": round(float(s.mean()), 4),
            "median": round(float(s.median()), 4),
            "std": round(float(s.std()), 4),
            "min": round(float(s.min()), 4),
            "max": round(float(s.max()), 4),
            "skew": round(float(s.skew()), 4),
        }

    # Categorical summary
    cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
    categorical_summary = {}
    for col in cat_cols[:8]:
        vc = df[col].value_counts()
        categorical_summary[col] = {
            "unique": int(df[col].nunique()),
            "top": str(vc.index[0]) if len(vc) else "",
            "top_count": int(vc.iloc[0]) if len(vc) else 0,
        }

    # Correlations (top 10 pairs by absolute value)
    correlations = {}
    if len(num_cols) >= 2:
        corr = df[num_cols].corr()
        pairs = []
        for i in range(len(num_cols)):
            for j in range(i + 1, len(num_cols)):
                a, b = num_cols[i], num_cols[j]
                val = float(corr.loc[a, b])
                if not (val != val):  # skip NaN
                    pairs.append((abs(val), a, b, val))
        pairs.sort(reverse=True)
        correlations = {f"{a}—{b}": round(v, 4) for _, a, b, v in pairs[:10]}

    return {
        "data": df,  # pass-through so chained nodes get the DataFrame
        "shape": shape,
        "dtypes": dtypes,
        "missing": missing,
        "numeric_summary": numeric_summary,
        "categorical_summary": categorical_summary,
        "correlations": correlations,
    }
'''

# ── Preprocessing ─────────────────────────────────────────────────────────────

SMART_OUTLIER_REMOVER = '''import pandas as pd
import numpy as np

def run(
    data,
    multiplier: float = 1.5,
    method: str = "drop",  # choices: drop | clip
) -> dict:
    """Remove or clip outliers using the IQR method across all numeric columns."""
    df = data.copy()
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    removed_per_col = {}
    for col in num_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lo = q1 - multiplier * iqr
        hi = q3 + multiplier * iqr
        outliers = ((df[col] < lo) | (df[col] > hi)).sum()
        removed_per_col[col] = int(outliers)
        if method == "clip":
            df[col] = df[col].clip(lower=lo, upper=hi)
        else:  # drop
            df = df[(df[col] >= lo) & (df[col] <= hi)]
    return {
        "data": df,
        "rows_before": len(data),
        "rows_after": len(df),
        "outliers_removed_per_col": removed_per_col,
    }
'''
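A worked example of the IQR fences used above, with made-up numbers:

import pandas as pd

s = pd.Series([8, 10, 12, 20, 28, 30, 32, 200])  # illustrative data; 200 is the outlier
q1, q3 = s.quantile(0.25), s.quantile(0.75)      # 11.5 and 30.5
iqr = q3 - q1                                    # 19.0
lo, hi = q1 - 1.5 * iqr, q3 + 1.5 * iqr          # -17.0 and 59.0
print(s[(s >= lo) & (s <= hi)].tolist())         # 200 falls outside the fences and is dropped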

ADVANCED_IMPUTER = '''import pandas as pd
import numpy as np

def run(
    data,
    strategy: str = "knn",  # choices: knn | mice
    n_neighbors: int = 5,
) -> dict:
    """Smart missing-value imputation: KNNImputer or IterativeImputer (MICE)."""
    from sklearn.impute import KNNImputer
    df = data.copy()
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
    missing_before = int(df[num_cols].isnull().sum().sum())
    if strategy == "mice":
        from sklearn.experimental import enable_iterative_imputer  # noqa: F401
        from sklearn.impute import IterativeImputer
        imputer = IterativeImputer(max_iter=10, random_state=42)
    else:
        imputer = KNNImputer(n_neighbors=n_neighbors)
    df[num_cols] = imputer.fit_transform(df[num_cols])
    # Fill remaining categorical columns with mode
    for col in cat_cols:
        if df[col].isnull().any():
            df[col] = df[col].fillna(df[col].mode()[0])
    return {
        "data": df,
        "strategy": strategy,
        "missing_before": missing_before,
        "missing_after": int(df[num_cols].isnull().sum().sum()),
    }
'''

SKEWNESS_FIXER = '''import pandas as pd
import numpy as np

def run(
    data,
    threshold: float = 0.75,
) -> dict:
    """Detect skewed numeric columns and apply log1p or sqrt transformation."""
    df = data.copy()
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    transformed = {}
    for col in num_cols:
        skew = float(df[col].skew())
        if abs(skew) > threshold:
            if df[col].min() >= 0:
                if skew > 2:  # heavy skew → log1p
                    df[col] = np.log1p(df[col])
                    transformed[col] = {"skew_before": round(skew, 4), "transform": "log1p"}
                else:  # moderate skew → sqrt
                    df[col] = np.sqrt(df[col])
                    transformed[col] = {"skew_before": round(skew, 4), "transform": "sqrt"}
            else:  # column has negatives → shift then log1p
                shift = abs(df[col].min()) + 1
                df[col] = np.log1p(df[col] + shift)
                transformed[col] = {"skew_before": round(skew, 4), "transform": f"log1p(x+{round(shift,2)})"}
    return {
        "data": df,
        "columns_transformed": transformed,
        "n_transformed": len(transformed),
    }
'''

HIGH_CARDINALITY_ENCODER = '''import pandas as pd
import numpy as np
from typing import Annotated

def run(
    data,
    column: Annotated[str, "column"] = "city",
    target: Annotated[str, "column"] = "target",
    smoothing: float = 1.0,
) -> dict:
    """Target Encoding for high-cardinality columns (e.g. City, ZipCode).
    Uses smoothed means to prevent overfitting on rare categories.
    Supports comma-separated lists of columns."""
    df = data.copy()
    if target not in df.columns:
        raise ValueError(f"Target column \'{target}\' not found in DataFrame.")

    global_mean = df[target].mean()
    cols_to_encode = [c.strip() for c in column.split(",") if c.strip()]
    stats_info = {}

    for col in cols_to_encode:
        if col not in df.columns:
            continue
        stats = df.groupby(col)[target].agg(["mean", "count"])
        # Smoothed target encoding formula
        stats["smoothed"] = (
            (stats["count"] * stats["mean"] + smoothing * global_mean)
            / (stats["count"] + smoothing)
        )
        df[col] = df[col].map(stats["smoothed"]).fillna(global_mean)
        stats_info[col] = int(len(stats))

    return {
        "data": df,
        "encoded_columns": cols_to_encode,
        "target_column": target,
        "categories_encoded_per_col": stats_info,
        "global_mean": round(float(global_mean), 6),
    }
'''
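The smoothed mean above shrinks rare categories toward the global mean while frequent categories keep roughly their own mean. With made-up numbers (global_mean = 0.30, smoothing = 1.0):

# city seen 100 times, mean target 0.50:
#   (100 * 0.50 + 1.0 * 0.30) / (100 + 1.0) = 50.3 / 101 ≈ 0.498  (barely shrunk)
# city seen once, mean target 1.00:
#   (1 * 1.00 + 1.0 * 0.30) / (1 + 1.0) = 1.30 / 2 = 0.65         (pulled hard toward 0.30)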

FEATURE_SCALER_ROBUST = '''import pandas as pd
import numpy as np

def run(data) -> dict:
    """Scale numeric features with RobustScaler (median + IQR).
    Unlike StandardScaler, it is not distorted by extreme outliers."""
    from sklearn.preprocessing import RobustScaler
    df = data.copy()
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    scaler = RobustScaler()
    df[num_cols] = scaler.fit_transform(df[num_cols])
    return {
        "data": df,
        "scaled_columns": num_cols,
        "n_scaled": len(num_cols),
    }
'''

MULTICOLLINEARITY_FILTER = '''import pandas as pd
import numpy as np

def run(
    data,
    threshold: float = 0.90,
) -> dict:
    """Drop features whose absolute Pearson correlation with any other feature exceeds threshold."""
    df = data.copy()
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    corr_matrix = df[num_cols].corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [col for col in upper.columns if any(upper[col] > threshold)]
    highly_correlated_pairs = []
    for col in to_drop:
        partners = upper.index[upper[col] > threshold].tolist()
        for partner in partners:
            highly_correlated_pairs.append({
                "dropped": col,
                "correlated_with": partner,
                "correlation": round(float(upper.loc[partner, col]), 4),
            })
    df = df.drop(columns=to_drop)
    return {
        "data": df,
        "dropped_columns": to_drop,
        "n_dropped": len(to_drop),
        "highly_correlated_pairs": highly_correlated_pairs,
        "remaining_features": df.shape[1],
    }
'''
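The np.triu mask above keeps only entries strictly above the diagonal, so each feature pair is tested once. A small sketch with toy values:

import numpy as np
import pandas as pd

corr = pd.DataFrame([[1.00, 0.95, 0.20],   # toy absolute-correlation matrix
                     [0.95, 1.00, 0.10],
                     [0.20, 0.10, 1.00]],
                    index=list("abc"), columns=list("abc"))
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
# upper holds NaN on and below the diagonal, so the a-b pair (0.95) is seen once
print([c for c in upper.columns if any(upper[c] > 0.90)])  # ['b'] would be dropped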

TEXT_CLEANER_BASIC = '''import pandas as pd
import re
from typing import Annotated

def run(
    data,
    column: Annotated[str, "column"] = "text",
    remove_stopwords: bool = True,
) -> dict:
    """Basic NLP text cleaning: lowercase, strip HTML, remove punctuation, remove stop words."""
    df = data.copy()
    if column not in df.columns:
        raise ValueError(f"Column \'{column}\' not found in DataFrame.")
    # Build stop-word set (use nltk if available, otherwise fall back to a minimal list)
    stop_words = set()
    if remove_stopwords:
        try:
            import nltk
            try:
                from nltk.corpus import stopwords
                stop_words = set(stopwords.words("english"))
            except LookupError:
                nltk.download("stopwords", quiet=True)
                from nltk.corpus import stopwords
                stop_words = set(stopwords.words("english"))
        except ImportError:
            # Minimal fallback stop-word list if nltk is not installed
            stop_words = {
                "i","me","my","we","our","you","your","he","she","it","its",
                "they","their","this","that","a","an","the","and","but","or",
                "in","on","at","to","for","of","is","was","are","were","be",
                "been","have","has","do","does","did","with","as","by","from",
            }
    def _clean(text):
        if not isinstance(text, str):
            return ""
        text = text.lower()
        text = re.sub(r"<[^>]+>", " ", text)  # remove HTML tags
        text = re.sub(r"[^a-z0-9\s]", " ", text)  # remove punctuation
        text = re.sub(r"\s+", " ", text).strip()  # collapse whitespace
        if stop_words:
            text = " ".join(w for w in text.split() if w not in stop_words)
        return text
    original_sample = str(df[column].iloc[0]) if len(df) else ""
    df[column] = df[column].apply(_clean)
    return {
        "data": df,
        "cleaned_column": column,
        "sample_before": original_sample[:120],
        "sample_after": str(df[column].iloc[0])[:120] if len(df) else "",
    }
'''

SMOTE_SAMPLER = '''import pandas as pd
import numpy as np
from typing import Annotated

def run(
    data,
    target_column: Annotated[str, "column"] = "target",
    random_state: int = 42,
    k_neighbors: int = 5,
) -> dict:
    """Oversample the minority class to fix class imbalance using SMOTE.
    Falls back to RandomOverSampler if imbalanced-learn is unavailable."""
    df = data.copy()
    if target_column not in df.columns:
        raise ValueError(f"Target column \'{target_column}\' not found in DataFrame.")
    X = df.drop(columns=[target_column])
    y = df[target_column]
    # Keep only numeric features for sampling
    X_num = X.select_dtypes(include=[np.number])
    class_counts_before = y.value_counts().to_dict()
    sampler_used = None
    try:
        from imblearn.over_sampling import SMOTE
        sampler = SMOTE(k_neighbors=k_neighbors, random_state=random_state)
        sampler_used = "SMOTE"
    except ImportError:
        try:
            from imblearn.over_sampling import RandomOverSampler
            sampler = RandomOverSampler(random_state=random_state)
            sampler_used = "RandomOverSampler (SMOTE unavailable)"
        except ImportError:
            # Pure-numpy fallback: duplicate minority rows
            classes, counts = np.unique(y, return_counts=True)
            majority_count = counts.max()
            X_resampled, y_resampled = X_num.copy(), y.copy()
            for cls, cnt in zip(classes, counts):
                if cnt < majority_count:
                    shortage = majority_count - cnt
                    minority_rows = X_num[y == cls]
                    minority_y = y[y == cls]
                    idx = np.random.RandomState(random_state).choice(len(minority_rows), shortage, replace=True)
                    X_resampled = pd.concat([X_resampled, minority_rows.iloc[idx]], ignore_index=True)
                    y_resampled = pd.concat([y_resampled, minority_y.iloc[idx]], ignore_index=True)
            df_out = X_resampled.copy()
            df_out[target_column] = y_resampled.values
            return {
                "data": df_out,
                "sampler_used": "numpy_fallback",
                "rows_before": len(data),
                "rows_after": len(df_out),
                "class_distribution_before": {str(k): int(v) for k, v in class_counts_before.items()},
                "class_distribution_after": {str(k): int(v) for k, v in y_resampled.value_counts().items()},
            }
    X_res, y_res = sampler.fit_resample(X_num, y)
    df_out = pd.DataFrame(X_res, columns=X_num.columns)
    df_out[target_column] = y_res
    return {
        "data": df_out,
        "sampler_used": sampler_used,
        "rows_before": len(data),
        "rows_after": len(df_out),
        "class_distribution_before": {str(k): int(v) for k, v in class_counts_before.items()},
        "class_distribution_after": {str(k): int(v) for k, v in pd.Series(y_res).value_counts().items()},
    }
'''

DATA_CLEANING = '''import pandas as pd

def run(data, strategy: str = "drop", fill_value: float = 0.0) -> dict:
    df = data.copy()
    if strategy == "fill":
        df = df.fillna(fill_value)
    else:
        df = df.dropna()
    return {"data": df}
'''

LABEL_ENCODER = '''from sklearn.preprocessing import LabelEncoder
import pandas as pd

def run(data, columns: str = "") -> dict:
    """Encode categorical columns. Leave columns blank to encode all object columns."""
    df = data.copy()
    encoders = {}
    cols = [c.strip() for c in columns.split(",") if c.strip()] if columns.strip() else \
        df.select_dtypes(include=["object", "category"]).columns.tolist()
    for col in cols:
        if col in df.columns:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))
            encoders[col] = list(le.classes_)
    return {"data": df, "encoders": encoders}
'''

ONE_HOT_ENCODER = '''import pandas as pd

def run(data, columns: str = "") -> dict:
    """One-hot encode categorical columns. Leave columns blank to encode all object/category columns."""
    df = data.copy()
    cols = [c.strip() for c in columns.split(",") if c.strip()] if columns.strip() else \
        df.select_dtypes(include=["object", "category"]).columns.tolist()
    if cols:
        df = pd.get_dummies(df, columns=cols, drop_first=False)
    # Convert boolean columns produced by get_dummies to int (0/1) for sklearn
    bool_cols = df.select_dtypes(include=["bool"]).columns.tolist()
    if bool_cols:
        df[bool_cols] = df[bool_cols].astype(int)
    return {"data": df}
'''

TRAIN_TEST_SPLIT = '''from sklearn.model_selection import train_test_split as _split
from typing import Annotated

def run(
    data,
    target_column: Annotated[str, "column"] = "target",
    test_size: float = 0.2,
    random_state: int = 42,
) -> dict:
    X = data.drop(columns=[target_column])
    y = data[target_column]
    X_train, X_test, y_train, y_test = _split(
        X, y, test_size=test_size, random_state=random_state
    )
    return {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test}
'''

STANDARD_SCALER = '''from sklearn.preprocessing import StandardScaler

def run(X_train, X_test, y_train, y_test) -> dict:
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)
    return {"X_train": X_train_s, "X_test": X_test_s, "y_train": y_train, "y_test": y_test}
'''
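Because every run() returns a dict keyed by port names, a split → scale chain can be sketched by feeding one node's output dict into the next (reusing the hypothetical exec_template() from earlier; df stands in for any loaded DataFrame):

ports = exec_template(TRAIN_TEST_SPLIT, data=df, target_column="target")
ports = exec_template(STANDARD_SCALER, **ports)  # keys line up with run()'s parameter names
# ports["X_train"] / ports["X_test"] are now scaled; y_train / y_test pass through untouched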

MIN_MAX_SCALER = '''from sklearn.preprocessing import MinMaxScaler

def run(X_train, X_test, y_train, y_test) -> dict:
    scaler = MinMaxScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)
    return {"X_train": X_train_s, "X_test": X_test_s, "y_train": y_train, "y_test": y_test}
'''

PCA = '''from sklearn.decomposition import PCA

def run(X_train, X_test, y_train, y_test, n_components: int = 2) -> dict:
    pca = PCA(n_components=n_components)
    X_train_p = pca.fit_transform(X_train)
    X_test_p = pca.transform(X_test)
    explained = float(sum(pca.explained_variance_ratio_))
    return {"X_train": X_train_p, "X_test": X_test_p, "y_train": y_train, "y_test": y_test,
            "explained_variance": round(explained, 4)}
'''

# ── Shared CV helper (inlined into each template to stay self-contained) ────────
#
# def _run_cv(model, X_train, y_train, cv_folds, scoring):
#     scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring=scoring)
#     return {"cv_mean": round(float(scores.mean()), 4),
#             "cv_std": round(float(scores.std()), 4),
#             "cv_scores": [round(float(s), 4) for s in scores]}
#
# ── Classifier models ─────────────────────────────────────────────────────────

LOGISTIC_REGRESSION = '''from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    C: float = 1.0, max_iter: int = 200, random_state: int = 42,
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    model = LogisticRegression(C=C, max_iter=max_iter, random_state=random_state)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="accuracy")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

RANDOM_FOREST_CLASSIFIER = '''from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    n_estimators: int = 100, max_depth: int = 0, random_state: int = 42,
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    md = max_depth if max_depth > 0 else None
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=md, random_state=random_state)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="accuracy")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

GRADIENT_BOOSTING_CLASSIFIER = '''from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    n_estimators: int = 100, learning_rate: float = 0.1, random_state: int = 42,
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=random_state)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="accuracy")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

DECISION_TREE_CLASSIFIER = '''from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    max_depth: int = 0, random_state: int = 42,
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    md = max_depth if max_depth > 0 else None
    model = DecisionTreeClassifier(max_depth=md, random_state=random_state)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="accuracy")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

SVM_CLASSIFIER = '''from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    C: float = 1.0, kernel: str = "rbf",
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    model = SVC(C=C, kernel=kernel)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="accuracy")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

KNN_CLASSIFIER = '''from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    n_neighbors: int = 5,
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="accuracy")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

# ── Regressor models ──────────────────────────────────────────────────────────

LINEAR_REGRESSION = '''from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    model = LinearRegression()
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="r2")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

RANDOM_FOREST_REGRESSOR = '''from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    n_estimators: int = 100, max_depth: int = 0, random_state: int = 42,
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    md = max_depth if max_depth > 0 else None
    model = RandomForestRegressor(n_estimators=n_estimators, max_depth=md, random_state=random_state)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="r2")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

GRADIENT_BOOSTING_REGRESSOR = '''from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    n_estimators: int = 100, learning_rate: float = 0.1, random_state: int = 42,
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate, random_state=random_state)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="r2")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

DECISION_TREE_REGRESSOR = '''from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    max_depth: int = 0, random_state: int = 42,
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    md = max_depth if max_depth > 0 else None
    model = DecisionTreeRegressor(max_depth=md, random_state=random_state)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="r2")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

SVM_REGRESSOR = '''from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    C: float = 1.0, kernel: str = "rbf",
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    model = SVR(C=C, kernel=kernel)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="r2")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

KNN_REGRESSOR = '''from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    n_neighbors: int = 5,
    cross_validation: bool = False, cv_folds: int = 5,
) -> dict:
    model = KNeighborsRegressor(n_neighbors=n_neighbors)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="r2")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4),
                "cv_scores": [round(float(s), 4) for s in scores]}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''
811
|
+
# ── Evaluation ────────────────────────────────────────────────────────────────
|
|
812
|
+
|
|
813
|
+
ACCURACY = '''from sklearn.metrics import accuracy_score
|
|
814
|
+
|
|
815
|
+
def run(y_test, y_pred) -> dict:
|
|
816
|
+
return {"accuracy": float(accuracy_score(y_test, y_pred))}
|
|
817
|
+
'''
|
|
818
|
+
|
|
819
|
+
CLASSIFICATION_REPORT = '''from sklearn.metrics import (
|
|
820
|
+
accuracy_score, f1_score, precision_score, recall_score,
|
|
821
|
+
confusion_matrix, classification_report,
|
|
822
|
+
)
|
|
823
|
+
|
|
824
|
+
def run(y_test, y_pred) -> dict:
|
|
825
|
+
return {
|
|
826
|
+
"accuracy": round(float(accuracy_score(y_test, y_pred)), 4),
|
|
827
|
+
"f1_score": round(float(f1_score(y_test, y_pred, average="weighted", zero_division=0)), 4),
|
|
828
|
+
"precision": round(float(precision_score(y_test, y_pred, average="weighted", zero_division=0)), 4),
|
|
829
|
+
"recall": round(float(recall_score(y_test, y_pred, average="weighted", zero_division=0)), 4),
|
|
830
|
+
"confusion_matrix": confusion_matrix(y_test, y_pred).tolist(),
|
|
831
|
+
"class_report": classification_report(y_test, y_pred, output_dict=True, zero_division=0),
|
|
832
|
+
}
|
|
833
|
+
'''
|
|
834
|
+
|
|
835
|
+
VALIDATION_REPORT = '''from sklearn.model_selection import cross_validate
|
|
836
|
+
import numpy as np
|
|
837
|
+
|
|
838
|
+
def run(model, X_train, y_train, cv_folds: int = 5) -> dict:
|
|
839
|
+
"""Run K-Fold cross validation and generate a robust validation report."""
|
|
840
|
+
# Using macro averages for multiclass compatibility
|
|
841
|
+
scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
|
|
842
|
+
|
|
843
|
+
# Catching cases where model might not be a classifier (though typically it will be here)
|
|
844
|
+
try:
|
|
845
|
+
scores = cross_validate(model, X_train, y_train, cv=cv_folds, scoring=scoring)
|
|
846
|
+
return {
|
|
847
|
+
"cv_folds": cv_folds,
|
|
848
|
+
"mean_accuracy": round(float(np.mean(scores['test_accuracy'])), 4),
|
|
849
|
+
"std_accuracy": round(float(np.std(scores['test_accuracy'])), 4),
|
|
850
|
+
"mean_precision": round(float(np.mean(scores['test_precision_macro'])), 4),
|
|
851
|
+
"mean_recall": round(float(np.mean(scores['test_recall_macro'])), 4),
|
|
852
|
+
"mean_f1": round(float(np.mean(scores['test_f1_macro'])), 4),
|
|
853
|
+
}
|
|
854
|
+
except Exception as e:
|
|
855
|
+
# Fallback for regressors
|
|
856
|
+
scores = cross_validate(model, X_train, y_train, cv=cv_folds, scoring=['r2', 'neg_mean_squared_error'])
|
|
857
|
+
return {
|
|
858
|
+
"cv_folds": cv_folds,
|
|
859
|
+
"mean_r2": round(float(np.mean(scores['test_r2'])), 4),
|
|
860
|
+
"mean_mse": round(float(np.mean(-scores['test_neg_mean_squared_error'])), 4),
|
|
861
|
+
}
|
|
862
|
+
'''
|
|
863
|
+
|
|
864
|
+
REGRESSION_METRICS = '''from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
|
|
865
|
+
import numpy as np
|
|
866
|
+
|
|
867
|
+
def run(y_test, y_pred) -> dict:
|
|
868
|
+
mse = float(mean_squared_error(y_test, y_pred))
|
|
869
|
+
return {
|
|
870
|
+
"mse": round(mse, 4),
|
|
871
|
+
"rmse": round(float(np.sqrt(mse)), 4),
|
|
872
|
+
"mae": round(float(mean_absolute_error(y_test, y_pred)), 4),
|
|
873
|
+
"r2": round(float(r2_score(y_test, y_pred)), 4),
|
|
874
|
+
}
|
|
875
|
+
'''
|
|
876
|
+
|
|
877
|
+
FEATURE_IMPORTANCE = '''import numpy as np
|
|
878
|
+
|
|
879
|
+
def run(model, X_train) -> dict:
|
|
880
|
+
"""Extract feature importances from any tree-based model."""
|
|
881
|
+
if not hasattr(model, "feature_importances_"):
|
|
882
|
+
raise ValueError(f"{type(model).__name__} has no feature_importances_. Use RF, GBM, or DT.")
|
|
883
|
+
imps = model.feature_importances_
|
|
884
|
+
names = list(X_train.columns) if hasattr(X_train, "columns") else [f"f{i}" for i in range(len(imps))]
|
|
885
|
+
pairs = sorted(zip(imps, names), reverse=True)
|
|
886
|
+
top = min(20, len(pairs))
|
|
887
|
+
return {
|
|
888
|
+
"feature_importances": {name: round(float(imp), 6) for imp, name in pairs[:top]},
|
|
889
|
+
"top_feature": pairs[0][1],
|
|
890
|
+
"top_importance": round(float(pairs[0][0]), 6),
|
|
891
|
+
}
|
|
892
|
+
'''
|
|
893
|
+
|
|
894
|
+
AUTO_ML = '''from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
|
|
895
|
+
from sklearn.linear_model import LogisticRegression
|
|
896
|
+
from sklearn.metrics import accuracy_score
|
|
897
|
+
|
|
898
|
+
def run(X_train, X_test, y_train, y_test) -> dict:
|
|
899
|
+
candidates = {
|
|
900
|
+
"Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
|
|
901
|
+
"Gradient Boosting": GradientBoostingClassifier(random_state=42),
|
|
902
|
+
"Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
|
|
903
|
+
}
|
|
904
|
+
scores = {}
|
|
905
|
+
best_name, best_score, best_model, best_pred = "", -1.0, None, None
|
|
906
|
+
for name, mdl in candidates.items():
|
|
907
|
+
mdl.fit(X_train, y_train)
|
|
908
|
+
preds = mdl.predict(X_test)
|
|
909
|
+
score = float(accuracy_score(y_test, preds))
|
|
910
|
+
scores[name] = round(score, 4)
|
|
911
|
+
if score > best_score:
|
|
912
|
+
best_score, best_name, best_model, best_pred = score, name, mdl, preds
|
|
913
|
+
return {
|
|
914
|
+
"model": best_model,
|
|
915
|
+
"y_pred": best_pred,
|
|
916
|
+
"best_algorithm": best_name,
|
|
917
|
+
"accuracy": round(best_score, 4),
|
|
918
|
+
"all_scores": scores,
|
|
919
|
+
}
|
|
920
|
+
'''
|
|
921
|
+
|
|
922
|
+
|
|
923
|
+
EDA_HISTOGRAM = '''import pandas as pd
|
|
924
|
+
import numpy as np
|
|
925
|
+
from typing import Annotated
|
|
926
|
+
|
|
927
|
+
def run(data, column: Annotated[str, "column"] = "price", bins: int = 20) -> dict:
|
|
928
|
+
"""Distribution histogram for a single numeric column."""
|
|
929
|
+
series = data[column].dropna()
|
|
930
|
+
counts, edges = np.histogram(series, bins=bins)
|
|
931
|
+
return {
|
|
932
|
+
"histogram": {
|
|
933
|
+
"column": column,
|
|
934
|
+
"counts": counts.tolist(),
|
|
935
|
+
"bin_edges": [round(float(e), 2) for e in edges.tolist()],
|
|
936
|
+
"mean": round(float(series.mean()), 2),
|
|
937
|
+
"median": round(float(series.median()), 2),
|
|
938
|
+
"std": round(float(series.std()), 2),
|
|
939
|
+
"min": round(float(series.min()), 2),
|
|
940
|
+
"max": round(float(series.max()), 2),
|
|
941
|
+
}
|
|
942
|
+
}
|
|
943
|
+
'''
|
|
944
|
+
|
|
945
|
+
EDA_CORRELATION = '''import pandas as pd
|
|
946
|
+
import numpy as np
|
|
947
|
+
|
|
948
|
+
def run(data) -> dict:
|
|
949
|
+
"""Full correlation matrix for all numeric columns."""
|
|
950
|
+
num_df = data.select_dtypes(include=[np.number])
|
|
951
|
+
corr = num_df.corr().round(3)
|
|
952
|
+
matrix = []
|
|
953
|
+
for row in corr.values.tolist():
|
|
954
|
+
matrix.append([None if (v != v) else round(v, 3) for v in row])
|
|
955
|
+
return {
|
|
956
|
+
"correlation_matrix": {
|
|
957
|
+
"columns": list(corr.columns),
|
|
958
|
+
"matrix": matrix,
|
|
959
|
+
}
|
|
960
|
+
}
|
|
961
|
+
'''
|
|
962
|
+
|
|
963
|
+
EDA_VALUE_COUNTS = '''import pandas as pd
|
|
964
|
+
from typing import Annotated
|
|
965
|
+
|
|
966
|
+
def run(data, column: Annotated[str, "column"] = "mainroad", top_n: int = 10) -> dict:
|
|
967
|
+
"""Value counts for a categorical column."""
|
|
968
|
+
vc = data[column].value_counts().head(top_n)
|
|
969
|
+
return {
|
|
970
|
+
"value_counts": {
|
|
971
|
+
"column": column,
|
|
972
|
+
"labels": [str(l) for l in vc.index.tolist()],
|
|
973
|
+
"counts": vc.values.tolist(),
|
|
974
|
+
"total": int(len(data)),
|
|
975
|
+
}
|
|
976
|
+
}
|
|
977
|
+
'''
|
|
978
|
+
|
|
979
|
+
EDA_BOX_PLOT = '''import pandas as pd
|
|
980
|
+
import numpy as np
|
|
981
|
+
from typing import Annotated
|
|
982
|
+
|
|
983
|
+
def run(data, column: Annotated[str, "column"] = "price") -> dict:
|
|
984
|
+
"""Box plot statistics (min, Q1, median, Q3, max, outliers) for a column."""
|
|
985
|
+
s = data[column].dropna()
|
|
986
|
+
q1, q3 = float(s.quantile(0.25)), float(s.quantile(0.75))
|
|
987
|
+
iqr = q3 - q1
|
|
988
|
+
outliers = s[(s < q1 - 1.5*iqr) | (s > q3 + 1.5*iqr)]
|
|
989
|
+
return {
|
|
990
|
+
"box_plot": {
|
|
991
|
+
"column": column,
|
|
992
|
+
"min": round(float(s.min()), 2),
|
|
993
|
+
"q1": round(q1, 2),
|
|
994
|
+
"median": round(float(s.median()), 2),
|
|
995
|
+
"q3": round(q3, 2),
|
|
996
|
+
"max": round(float(s.max()), 2),
|
|
997
|
+
"mean": round(float(s.mean()), 2),
|
|
998
|
+
"iqr": round(iqr, 2),
|
|
999
|
+
"outlier_count": int(len(outliers)),
|
|
1000
|
+
"whisker_low": round(float(max(s.min(), q1 - 1.5*iqr)), 2),
|
|
1001
|
+
"whisker_high": round(float(min(s.max(), q3 + 1.5*iqr)), 2),
|
|
1002
|
+
}
|
|
1003
|
+
}
|
|
1004
|
+
'''
|
|
1005
|
+
|
|
PREDICT = '''import pandas as pd
import json

def run(model, feature_json: str = "{}") -> dict:
    """Predict from a trained model. Enter feature values as a JSON string.
    Example: {"area": 7000, "bedrooms": 3, "bathrooms": 2, "stories": 2,
              "mainroad": 1, "guestroom": 0, "basement": 0,
              "hotwaterheating": 0, "airconditioning": 1, "parking": 2,
              "prefarea": 0, "furnishingstatus": 1}
    """
    features = json.loads(feature_json) if feature_json.strip() else {}
    if not features:
        raise ValueError("Enter feature values as JSON in the feature_json field")
    df = pd.DataFrame([features])
    prediction = model.predict(df)
    result = prediction[0]
    scalar = float(result) if hasattr(result, "__float__") else str(result)
    return {
        "prediction": scalar,
        "features_used": features,
    }
'''

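# Illustrative sketch (not shipped with the package): exercising PREDICT's run()
# by exec()-ing the template source, the way a standalone test might. The helper
# name `_demo_predict_usage` and the toy data are hypothetical; it assumes
# scikit-learn is available.
def _demo_predict_usage():
    import pandas as pd
    from sklearn.linear_model import LinearRegression
    X = pd.DataFrame({"area": [1000.0, 2000.0, 3000.0]})
    y = pd.Series([100.0, 200.0, 300.0])
    model = LinearRegression().fit(X, y)
    ns = {}
    exec(PREDICT, ns)  # defines run() inside ns
    return ns["run"](model, '{"area": 1500}')  # prediction == 150.0 on this toy fit
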
CONFUSION_MATRIX_PLOTTER = '''import numpy as np
from sklearn.metrics import confusion_matrix

def run(y_test, y_pred, normalize: bool = False) -> dict:
    """Computes the confusion matrix and outputs a JSON-serializable structure."""
    labels = sorted(list(set(y_test) | set(y_pred)))
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        cm = np.nan_to_num(cm)

    matrix_data = cm.tolist()
    labels_data = [str(l) for l in labels]

    return {
        # Emitted under the "correlation_heatmap" key so the existing frontend
        # heatmap renderer can display the matrix.
        "correlation_heatmap": {
            "x": labels_data,
            "y": labels_data,
            "z": matrix_data
        },
        "summary": "Confusion matrix for error analysis"
    }
'''

ROC_PR_CURVE_DATA = '''import numpy as np
from sklearn.metrics import roc_curve, precision_recall_curve, auc

def run(model, X_test, y_test) -> dict:
    """Calculates ROC and PR curves data (TPR, FPR, Precision, Recall) across thresholds."""
    if not hasattr(model, "predict_proba"):
        raise ValueError("Model does not support predict_proba() required for ROC/PR curves.")

    classes = model.classes_
    if len(classes) != 2:
        raise ValueError("ROC/PR curve data generation currently supports binary classification only.")

    y_scores = model.predict_proba(X_test)[:, 1]
    pos_label = classes[1]

    fpr, tpr, roc_thresh = roc_curve(y_test, y_scores, pos_label=pos_label)
    roc_auc = auc(fpr, tpr)

    precision, recall, pr_thresh = precision_recall_curve(y_test, y_scores, pos_label=pos_label)
    pr_auc = auc(recall, precision)

    return {
        "roc_curves": {
            "ROC (FPR vs TPR)": {
                "fpr": fpr.tolist(),
                "tpr": tpr.tolist(),
                "auc": float(roc_auc)
            },
            "PR (Recall vs Prec)": {
                "fpr": recall.tolist(),     # X-axis
                "tpr": precision.tolist(),  # Y-axis
                "auc": float(pr_auc)
            }
        }
    }
'''

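# Illustrative sketch (not shipped): running ROC_PR_CURVE_DATA against a small
# synthetic binary problem. `_demo_roc_pr_usage` is hypothetical; it assumes
# scikit-learn is installed.
def _demo_roc_pr_usage():
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    X, y = make_classification(n_samples=200, random_state=42)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=42)
    model = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
    ns = {}
    exec(ROC_PR_CURVE_DATA, ns)
    return ns["run"](model, X_te, y_te)  # {"roc_curves": {"ROC (FPR vs TPR)": ..., "PR (Recall vs Prec)": ...}}
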
RESIDUAL_PLOTTER = '''import numpy as np

def run(y_test, y_pred) -> dict:
    """Generates (Predicted, Residual) pairs for regression error analysis."""
    y_test_arr = np.array(y_test).ravel()
    y_pred_arr = np.array(y_pred).ravel()

    residuals = y_test_arr - y_pred_arr

    if len(y_pred_arr) > 1000:
        idx = np.random.choice(len(y_pred_arr), 1000, replace=False)
        y_pred_arr = y_pred_arr[idx]
        residuals = residuals[idx]

    return {
        "feature_target_scatter": {
            "x": y_pred_arr.tolist(),
            "y": residuals.tolist(),
            "feature_name": "Predicted",
            "target_name": "Residuals"
        },
        "summary": "Residual plot data points"
    }
'''

FEATURE_IMPORTANCE_VISUALIZER = '''import numpy as np

def run(model, X_train=None, top_n: int = 10) -> dict:
    """Extracts and sorts feature importances or coefficients from a model."""
    importances = None
    names = None

    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
    elif hasattr(model, 'coef_'):
        importances = np.abs(model.coef_[0]) if model.coef_.ndim > 1 else np.abs(model.coef_)
    else:
        raise ValueError("Model has neither 'feature_importances_' nor 'coef_'.")

    if hasattr(model, 'feature_names_in_'):
        names = model.feature_names_in_
    elif X_train is not None and hasattr(X_train, 'columns'):
        names = X_train.columns
    else:
        names = [f"Feature {i}" for i in range(len(importances))]

    indices = np.argsort(importances)[::-1][:top_n]

    top_importances = importances[indices].tolist()
    top_names = [str(names[i]) for i in indices]

    return {
        # Reuses the frontend's "value_counts" bar-chart channel to render the
        # importance ranking.
        "value_counts": {
            "labels": top_names,
            "counts": top_importances,
            "column": "Feature Importance",
            "total": sum(top_importances) if sum(top_importances) > 0 else 1
        },
        "summary": f"Top {top_n} features"
    }
'''

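# Illustrative sketch (not shipped): feeding FEATURE_IMPORTANCE_VISUALIZER a
# fitted random forest. `_demo_feature_importance_usage` is hypothetical and
# assumes scikit-learn.
def _demo_feature_importance_usage():
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    X = pd.DataFrame({"a": [float(i) for i in range(100)], "b": [i % 3 for i in range(100)]})
    y = (X["a"] > 50).astype(int)
    model = RandomForestClassifier(n_estimators=10, random_state=42).fit(X, y)
    ns = {}
    exec(FEATURE_IMPORTANCE_VISUALIZER, ns)
    # "a" should dominate the ranking since it fully determines y here.
    return ns["run"](model, X_train=X, top_n=2)
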
DECISION_BOUNDARY_2D = '''import numpy as np
from sklearn.decomposition import PCA

def run(model, X_train, y_train, grid_resolution: int = 50) -> dict:
    """Reduces data to 2D (if needed) and predicts over a meshgrid to visualize decision boundaries."""
    X = np.array(X_train)
    y = np.array(y_train)

    pca_used = False
    if X.shape[1] > 2:
        pca = PCA(n_components=2)
        X_2d = pca.fit_transform(X)
        pca_used = True
    elif X.shape[1] == 2:
        X_2d = X
    else:
        raise ValueError("Cannot draw 2D boundary for 1D feature space.")

    x_min, x_max = X_2d[:, 0].min() - 1, X_2d[:, 0].max() + 1
    y_min, y_max = X_2d[:, 1].min() - 1, X_2d[:, 1].max() + 1

    xx, yy = np.meshgrid(
        np.linspace(x_min, x_max, grid_resolution),
        np.linspace(y_min, y_max, grid_resolution)
    )

    grid_points_2d = np.c_[xx.ravel(), yy.ravel()]

    if pca_used:
        grid_points_orig = pca.inverse_transform(grid_points_2d)
        Z = model.predict(grid_points_orig)
    else:
        Z = model.predict(grid_points_2d)

    sample_size = min(300, len(X_2d))
    idx = np.random.choice(len(X_2d), sample_size, replace=False)

    if hasattr(Z[0], "item"):
        Z = np.array([z.item() if hasattr(z, "item") else z for z in Z])

    # NOTE: the meshgrid predictions Z are computed above, but this node
    # currently emits only the 2D sample scatter; Z is not part of the payload.
    return {
        "feature_target_scatter": {
            "x": X_2d[idx, 0].tolist(),
            "y": X_2d[idx, 1].tolist(),
            "feature_name": "Dim 1",
            "target_name": "Dim 2"
        },
        "summary": f"Decision Boundary Scatter. PCA applied: {pca_used}"
    }
'''

PREDICTION_VS_ACTUAL_SCATTER = '''import numpy as np

def run(y_test, y_pred) -> dict:
    """Returns actual vs predicted pairs for plotting a regression scatter plot."""
    y_test_arr = np.array(y_test).ravel()
    y_pred_arr = np.array(y_pred).ravel()

    if len(y_test_arr) > 1000:
        idx = np.random.choice(len(y_test_arr), 1000, replace=False)
        y_test_arr = y_test_arr[idx]
        y_pred_arr = y_pred_arr[idx]

    return {
        "feature_target_scatter": {
            "x": y_test_arr.tolist(),
            "y": y_pred_arr.tolist(),
            "feature_name": "Actual",
            "target_name": "Predicted"
        },
        "summary": "Actual vs Predicted points"
    }
'''

POLYNOMIAL_FEATURES = '''import pandas as pd
from sklearn.preprocessing import PolynomialFeatures

def run(data, degree: int = 2, interaction_only: bool = False) -> dict:
    df = data.copy()
    num_cols = df.select_dtypes(include=['number']).columns.tolist()
    if not num_cols:
        return {"data": df, "summary": "No numeric columns found"}

    poly = PolynomialFeatures(degree=degree, interaction_only=interaction_only, include_bias=False)
    poly_features = poly.fit_transform(df[num_cols])

    feature_names = poly.get_feature_names_out(num_cols)
    poly_df = pd.DataFrame(poly_features, columns=feature_names, index=df.index)

    df = df.drop(columns=num_cols)
    df = pd.concat([df, poly_df], axis=1)

    return {"data": df, "summary": f"Added {len(feature_names)} polynomial features"}
'''

DATETIME_EXTRACTOR = '''import pandas as pd
from typing import Annotated

def run(data, column: Annotated[str, "column"] = "date") -> dict:
    df = data.copy()
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found.")

    dt_series = pd.to_datetime(df[column], errors='coerce')

    df[f"{column}_year"] = dt_series.dt.year
    df[f"{column}_month"] = dt_series.dt.month
    df[f"{column}_day"] = dt_series.dt.day
    df[f"{column}_dayofweek"] = dt_series.dt.dayofweek

    df = df.drop(columns=[column])

    return {"data": df, "summary": f"Extracted datetime components from {column}"}
'''

RFE_FEATURE_SELECTOR = '''import pandas as pd
import numpy as np
from typing import Annotated
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor

def run(data, target_column: Annotated[str, "column"] = "target", n_to_select: int = 10) -> dict:
    df = data.copy()
    if target_column not in df.columns:
        raise ValueError(f"Target column '{target_column}' not found.")

    y = df[target_column]
    X = df.drop(columns=[target_column])

    X_num = X.select_dtypes(include=[np.number])
    X_non_num = X.select_dtypes(exclude=[np.number])

    if X_num.shape[1] <= n_to_select:
        return {"data": df, "summary": "Feature count already <= n_to_select"}

    estimator = RandomForestRegressor(n_estimators=50, random_state=42)
    selector = RFE(estimator, n_features_to_select=n_to_select, step=1)

    selector.fit(X_num, y)

    selected_features = X_num.columns[selector.support_].tolist()

    df_out = pd.concat([X_non_num, X_num[selected_features]], axis=1)
    df_out[target_column] = y

    return {"data": df_out, "selected_features": selected_features}
'''

INVERSE_TARGET_TRANSFORMER = '''import numpy as np

def run(y_pred, y_test, method: str = "expm1") -> dict:
    y_p = np.array(y_pred)
    y_t = np.array(y_test)

    if method == "expm1":
        y_p_inv = np.expm1(y_p)
        y_t_inv = np.expm1(y_t)
    elif method == "square":
        y_p_inv = np.square(y_p)
        y_t_inv = np.square(y_t)
    else:
        raise ValueError(f"Unsupported method: {method}")

    return {
        "y_pred": y_p_inv.tolist() if hasattr(y_p_inv, "tolist") else y_p_inv,
        "y_test": y_t_inv.tolist() if hasattr(y_t_inv, "tolist") else y_t_inv,
        "method": method
    }
'''

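# Illustrative sketch (not shipped): np.expm1 is the exact inverse of np.log1p,
# which is the round trip this node is meant for. `_demo_inverse_transform` is
# hypothetical.
def _demo_inverse_transform():
    import numpy as np
    y_log = np.log1p([10.0, 100.0])  # forward transform applied during training
    ns = {}
    exec(INVERSE_TARGET_TRANSFORMER, ns)
    out = ns["run"](y_pred=y_log, y_test=y_log, method="expm1")
    return out  # out["y_pred"] recovers approximately [10.0, 100.0]
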
THRESHOLD_OPTIMIZER = '''import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

def run(model, X_test, y_test, target_metric: str = "f1") -> dict:
    if not hasattr(model, "predict_proba"):
        raise ValueError("Model does not support predict_proba required for threshold optimization.")

    classes = model.classes_
    if len(classes) != 2:
        raise ValueError("Threshold optimization requires binary classification.")

    y_scores = model.predict_proba(X_test)[:, 1]
    pos_label = classes[1]

    y_true_bin = (np.array(y_test) == pos_label).astype(int)

    thresholds = np.linspace(0.0, 1.0, 101)
    best_thresh = 0.5
    best_score = -1.0

    metrics = []

    for t in thresholds:
        y_pred_t = (y_scores >= t).astype(int)

        if target_metric == "f1":
            score = f1_score(y_true_bin, y_pred_t, zero_division=0)
        elif target_metric == "precision":
            score = precision_score(y_true_bin, y_pred_t, zero_division=0)
        elif target_metric == "recall":
            score = recall_score(y_true_bin, y_pred_t, zero_division=0)
        elif target_metric == "accuracy":
            score = accuracy_score(y_true_bin, y_pred_t)
        else:
            raise ValueError(f"Unsupported target_metric: {target_metric}")

        metrics.append({"threshold": float(t), "score": float(score)})

        if score > best_score:
            best_score = score
            best_thresh = t

    return {
        "best_threshold": float(best_thresh),
        "best_score": float(best_score),
        "target_metric": target_metric,
        "curve_data": metrics
    }
'''

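# Illustrative sketch (not shipped): sweeping thresholds on an imbalanced toy
# problem via THRESHOLD_OPTIMIZER. `_demo_threshold_optimizer` is hypothetical;
# assumes scikit-learn.
def _demo_threshold_optimizer():
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    X, y = make_classification(n_samples=300, weights=[0.8], random_state=0)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
    model = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
    ns = {}
    exec(THRESHOLD_OPTIMIZER, ns)
    # On imbalanced data the F1-optimal threshold is often below 0.5.
    return ns["run"](model, X_te, y_te, target_metric="f1")
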
BINARY_ENCODER = '''import pandas as pd
try:
    import category_encoders as ce
except ImportError:
    pass

def run(data, columns: str = "") -> dict:
    df = data.copy()
    cols = [c.strip() for c in columns.split(",") if c.strip()] if columns.strip() else df.select_dtypes(include=["object", "category"]).columns.tolist()

    if not cols:
        return {"data": df, "summary": "No columns to encode"}

    try:
        encoder = ce.BinaryEncoder(cols=cols)
        df = encoder.fit_transform(df)
        return {"data": df, "summary": f"Binary encoded {len(cols)} columns"}
    except NameError:
        return {"data": df, "summary": "category_encoders not installed. Skipping."}
'''

FREQUENCY_ENCODER = '''import pandas as pd

def run(data, columns: str = "") -> dict:
    df = data.copy()
    cols = [c.strip() for c in columns.split(",") if c.strip()] if columns.strip() else df.select_dtypes(include=["object", "category"]).columns.tolist()

    if not cols:
        return {"data": df, "summary": "No columns to encode"}

    for col in cols:
        if col in df.columns:
            freq = df[col].value_counts(normalize=True)
            df[col] = df[col].map(freq)

    return {"data": df, "summary": f"Frequency encoded {len(cols)} columns"}
'''

ORDINAL_ENCODER = '''import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

def run(data, columns: str = "") -> dict:
    df = data.copy()
    cols = [c.strip() for c in columns.split(",") if c.strip()] if columns.strip() else df.select_dtypes(include=["object", "category"]).columns.tolist()

    if not cols:
        return {"data": df, "summary": "No columns to encode"}

    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    df[cols] = encoder.fit_transform(df[cols].astype(str))

    return {"data": df, "summary": f"Ordinal encoded {len(cols)} columns"}
'''

VIF_FEATURE_SELECTION = '''import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

def run(data, threshold: float = 5.0) -> dict:
    df = data.copy()
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    if len(num_cols) < 2:
        return {"data": df, "summary": "Not enough numeric columns for VIF"}

    X = df[num_cols].dropna()
    dropped = []

    while True:
        # Guard: VIF is undefined with fewer than two remaining columns.
        if X.shape[1] < 2:
            break
        vif_data = pd.DataFrame()
        vif_data["feature"] = X.columns
        vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]

        max_vif = vif_data["VIF"].max()
        if max_vif > threshold:
            max_feat = vif_data.sort_values("VIF", ascending=False).iloc[0]["feature"]
            X = X.drop(columns=[max_feat])
            dropped.append(max_feat)
        else:
            break

    df = df.drop(columns=dropped)
    return {"data": df, "summary": f"Dropped {len(dropped)} features due to high VIF: {dropped}"}
'''

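# Illustrative sketch (not shipped): VIF pruning on a deliberately collinear
# frame, where one of the near-duplicate columns should be dropped.
# `_demo_vif_selection` is hypothetical; assumes statsmodels (as the template
# itself does).
def _demo_vif_selection():
    import numpy as np
    import pandas as pd
    rng = np.random.default_rng(0)
    a = rng.normal(size=200)
    df = pd.DataFrame({
        "a": a,
        "b": a * 2.0 + rng.normal(scale=0.01, size=200),  # nearly collinear with a
        "c": rng.normal(size=200),
    })
    ns = {}
    exec(VIF_FEATURE_SELECTION, ns)
    return ns["run"](df, threshold=5.0)  # expect one of the a/b pair dropped
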
PCA_WHITENING = '''import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

def run(data, n_components: int = 0, whiten: bool = True) -> dict:
    df = data.copy()
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    if not num_cols:
        return {"data": df, "summary": "No numeric columns for PCA"}

    n = n_components if n_components > 0 else None
    pca = PCA(n_components=n, whiten=whiten)

    pca_features = pca.fit_transform(df[num_cols].fillna(0))

    feature_names = [f"pca_{i}" for i in range(pca_features.shape[1])]
    pca_df = pd.DataFrame(pca_features, columns=feature_names, index=df.index)

    df = df.drop(columns=num_cols)
    df = pd.concat([df, pca_df], axis=1)

    return {"data": df, "summary": f"Applied PCA whitening, created {len(feature_names)} components"}
'''

K_MEANS_CLUSTERING_FEATURES = '''import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

def run(data, n_clusters: int = 3, random_state: int = 42) -> dict:
    df = data.copy()
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    if not num_cols:
        return {"data": df, "summary": "No numeric columns for KMeans"}

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init='auto')
    df['ClusterID'] = kmeans.fit_predict(df[num_cols].fillna(0))

    return {"data": df, "summary": f"Added ClusterID with {n_clusters} clusters"}
'''

XGBOOST_NODE = '''import pandas as pd
import numpy as np

def run(X_train, X_test, y_train, task_type: str = "classifier", n_estimators: int = 100, learning_rate: float = 0.1, max_depth: int = 3) -> dict:
    if task_type.lower() == "classifier":
        from xgboost import XGBClassifier
        model = XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, use_label_encoder=False, eval_metric='logloss')
    else:
        from xgboost import XGBRegressor
        model = XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return {"model": model, "y_pred": y_pred}
'''

LIGHTGBM_NODE = '''import pandas as pd

def run(X_train, X_test, y_train, task_type: str = "classifier", n_estimators: int = 100, learning_rate: float = 0.1, max_depth: int = -1) -> dict:
    if task_type.lower() == "classifier":
        from lightgbm import LGBMClassifier
        model = LGBMClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
    else:
        from lightgbm import LGBMRegressor
        model = LGBMRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return {"model": model, "y_pred": y_pred}
'''

ADABOOST_NODE = '''import pandas as pd

def run(X_train, X_test, y_train, task_type: str = "classifier", n_estimators: int = 50, learning_rate: float = 1.0) -> dict:
    if task_type.lower() == "classifier":
        from sklearn.ensemble import AdaBoostClassifier
        model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
    else:
        from sklearn.ensemble import AdaBoostRegressor
        model = AdaBoostRegressor(n_estimators=n_estimators, learning_rate=learning_rate)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return {"model": model, "y_pred": y_pred}
'''

VOTING_ENSEMBLE = '''import pandas as pd
import numpy as np

def run(model1, model2, model3, X_train, X_test, y_train, task_type: str = "classifier", voting: str = "hard") -> dict:
    estimators = [('m1', model1), ('m2', model2), ('m3', model3)]

    if task_type.lower() == "classifier":
        from sklearn.ensemble import VotingClassifier
        model = VotingClassifier(estimators=estimators, voting=voting)
    else:
        from sklearn.ensemble import VotingRegressor
        model = VotingRegressor(estimators=estimators)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return {"model": model, "y_pred": y_pred}
'''

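# Illustrative sketch (not shipped): wiring three unfitted estimators into
# VOTING_ENSEMBLE, matching its (model1, model2, model3, X_train, X_test,
# y_train, ...) signature. `_demo_voting_ensemble` is hypothetical.
def _demo_voting_ensemble():
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier
    X, y = make_classification(n_samples=200, random_state=1)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=1)
    ns = {}
    exec(VOTING_ENSEMBLE, ns)
    return ns["run"](LogisticRegression(max_iter=1000), DecisionTreeClassifier(), GaussianNB(),
                     X_tr, X_te, y_tr, task_type="classifier", voting="hard")
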
LAG_FEATURE_GENERATOR = '''import pandas as pd
from typing import Annotated

def run(data, column: Annotated[str, "column"], lags: int = 3) -> dict:
    df = data.copy()
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found.")

    for i in range(1, lags + 1):
        df[f"{column}_lag_{i}"] = df[column].shift(i)

    return {"data": df, "summary": f"Created {lags} lag features for {column}"}
'''

ROLLING_WINDOW_STATS = '''import pandas as pd
from typing import Annotated

def run(data, column: Annotated[str, "column"], window: int = 7) -> dict:
    df = data.copy()
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found.")

    df[f"{column}_roll_mean_{window}"] = df[column].rolling(window=window).mean()
    df[f"{column}_roll_std_{window}"] = df[column].rolling(window=window).std()

    return {"data": df, "summary": f"Created rolling window ({window}) stats for {column}"}
'''

PERMUTATION_IMPORTANCE = '''import numpy as np
from sklearn.inspection import permutation_importance

def run(model, X_test, y_test, scoring: str = "accuracy", n_repeats: int = 5, random_state: int = 42) -> dict:
    result = permutation_importance(model, X_test, y_test, scoring=scoring, n_repeats=n_repeats, random_state=random_state)

    importances = result.importances_mean

    if hasattr(X_test, 'columns'):
        names = X_test.columns
    else:
        names = [f"Feature {i}" for i in range(len(importances))]

    indices = np.argsort(importances)[::-1]

    top_importances = importances[indices].tolist()
    top_names = [str(names[i]) for i in indices]

    return {
        "features": top_names,
        "importances": top_importances,
        "summary": "Permutation Importance"
    }
'''

LEARNING_CURVE_DATA = '''import numpy as np
from sklearn.model_selection import learning_curve

def run(model, X_train, y_train, cv_folds: int = 5, scoring: str = "accuracy") -> dict:
    train_sizes, train_scores, test_scores = learning_curve(
        model, X_train, y_train, cv=cv_folds, scoring=scoring,
        train_sizes=np.linspace(0.1, 1.0, 10), random_state=42
    )

    train_mean = np.mean(train_scores, axis=1).tolist()
    test_mean = np.mean(test_scores, axis=1).tolist()
    sizes = train_sizes.tolist()

    return {
        "train_sizes": sizes,
        "train_scores": train_mean,
        "val_scores": test_mean,
        "scoring": scoring,
        "summary": "Learning Curve Data"
    }
'''

LIFT_GAIN_CHARTS = '''import numpy as np

def run(model, X_test, y_test) -> dict:
    if not hasattr(model, "predict_proba"):
        raise ValueError("Model does not support predict_proba required for lift/gain charts.")

    classes = model.classes_
    if len(classes) != 2:
        raise ValueError("Lift/Gain charts require binary classification.")

    y_scores = model.predict_proba(X_test)[:, 1]
    pos_label = classes[1]

    y_true_bin = (np.array(y_test) == pos_label).astype(int)

    indices = np.argsort(y_scores)[::-1]
    y_true_sorted = y_true_bin[indices]

    total_positives = y_true_bin.sum()
    total_samples = len(y_true_bin)

    cum_positives = np.cumsum(y_true_sorted)

    gain = cum_positives / max(total_positives, 1)

    count = np.arange(1, total_samples + 1)
    lift = (cum_positives / count) / (total_positives / total_samples)

    deciles = np.linspace(0, 1, 11)[1:]
    gain_deciles = [float(gain[min(int(d * total_samples) - 1, total_samples - 1)]) for d in deciles]
    lift_deciles = [float(lift[min(int(d * total_samples) - 1, total_samples - 1)]) for d in deciles]

    return {
        "deciles": deciles.tolist(),
        "gain": gain_deciles,
        "lift": lift_deciles,
        "summary": "Lift and Gain Data"
    }
'''

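# Worked example of the lift arithmetic above (illustrative only): with 1000
# samples and 100 positives overall (a 10% base rate), if the top decile (the
# 100 highest-scored rows) contains 40 positives, then
#   gain = 40 / 100 = 0.40
#   lift = (40 / 100) / (100 / 1000) = 4.0,
# i.e. the model captures positives at four times the random rate in that decile.
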
TOMEK_LINKS = '''import pandas as pd
from typing import Annotated

def run(data, target_column: Annotated[str, "column"] = "target") -> dict:
    df = data.copy()
    if target_column not in df.columns:
        raise ValueError(f"Target column '{target_column}' not found.")

    X = df.drop(columns=[target_column])
    y = df[target_column]

    try:
        from imblearn.under_sampling import TomekLinks
        tl = TomekLinks()
        X_res, y_res = tl.fit_resample(X, y)
        df_out = X_res.copy()
        df_out[target_column] = y_res

        return {"data": df_out, "summary": f"Tomek Links removed {len(df) - len(df_out)} noisy samples."}
    except ImportError:
        return {"data": df, "summary": "imbalanced-learn not installed. Skipping Tomek Links."}
'''

RANDOM_UNDER_SAMPLER = '''import pandas as pd
from typing import Annotated

def run(data, target_column: Annotated[str, "column"] = "target", random_state: int = 42) -> dict:
    df = data.copy()
    if target_column not in df.columns:
        raise ValueError(f"Target column '{target_column}' not found.")

    try:
        from imblearn.under_sampling import RandomUnderSampler
        X = df.drop(columns=[target_column])
        y = df[target_column]
        rus = RandomUnderSampler(random_state=random_state)
        X_res, y_res = rus.fit_resample(X, y)
        df_out = X_res.copy()
        df_out[target_column] = y_res

        return {"data": df_out, "summary": f"Undersampled from {len(df)} to {len(df_out)} rows."}
    except ImportError:
        min_class_size = df[target_column].value_counts().min()
        df_out = df.groupby(target_column).sample(n=min_class_size, random_state=random_state)
        return {"data": df_out, "summary": f"Manual undersampled from {len(df)} to {len(df_out)} rows."}
'''

CORRELATION_HEATMAP = '''import pandas as pd
import numpy as np

def run(data: pd.DataFrame) -> dict:
    numeric_df = data.select_dtypes(include=[np.number])
    if numeric_df.empty:
        return {"error": "No numeric columns found for correlation."}

    corr_matrix = numeric_df.corr().fillna(0)
    cols = corr_matrix.columns.tolist()

    z_values = corr_matrix.values.tolist()

    return {
        "correlation_heatmap": {
            "x": cols,
            "y": cols,
            "z": z_values
        }
    }
'''

MISSING_VALUE_MAP = '''import pandas as pd

def run(data: pd.DataFrame) -> dict:
    if len(data) > 100:
        sampled = data.sample(100, random_state=42)
    else:
        sampled = data

    null_matrix = sampled.isnull().astype(int)
    cols = null_matrix.columns.tolist()

    z_values = null_matrix.values.tolist()

    return {
        "missing_value_map": {
            "x": cols,
            "y": list(range(len(sampled))),
            "z": z_values
        }
    }
'''

CLASS_BALANCE_VISUALIZER = '''import pandas as pd
from typing import Annotated

def run(data: pd.DataFrame, target: Annotated[str, "column"]) -> dict:
    if target not in data.columns:
        return {"error": f"Target column '{target}' not found."}

    counts = data[target].value_counts(dropna=False)
    percentages = data[target].value_counts(normalize=True, dropna=False) * 100

    classes = [str(c) for c in counts.index]

    return {
        "class_balance": {
            "classes": classes,
            "counts": counts.tolist(),
            "percentages": percentages.tolist()
        }
    }
'''

FEATURE_TARGET_SCATTER = '''import pandas as pd
import numpy as np
from typing import Annotated

def run(data: pd.DataFrame, target: Annotated[str, "column"]) -> dict:
    if target not in data.columns:
        return {"error": f"Target column '{target}' not found."}

    df = data.dropna(subset=[target])

    # Identify numeric features (excluding target)
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if target in numeric_cols:
        numeric_cols.remove(target)

    if not numeric_cols:
        return {"error": "No numeric features found."}

    target_is_numeric = pd.api.types.is_numeric_dtype(df[target])

    corrs = []
    for col in numeric_cols:
        if target_is_numeric:
            corr = df[col].corr(df[target])
            if pd.isna(corr):
                corr = 0
            corrs.append((col, corr))
        else:
            corrs.append((col, 0))

    # Sort by absolute correlation and take top 8
    corrs.sort(key=lambda x: abs(x[1]), reverse=True)
    top_features = [x[0] for x in corrs[:8]]
    top_corrs = {x[0]: x[1] for x in corrs[:8]}

    # Cap samples for rendering speed
    if len(df) > 600:
        df = df.sample(600, random_state=42)

    panels = []
    for f in top_features:
        panels.append({
            "feature": f,
            "x": df[f].tolist(),
            "y": df[target].tolist(),
            "corr": top_corrs[f]
        })

    return {
        "scatter_grid": {
            "panels": panels,
            "title": "Continuous Feature vs Target",
            "target_name": target
        }
    }
'''

MODEL_ERROR_HISTOGRAM = '''import numpy as np

def run(y_test, y_pred) -> dict:
    y_t = np.array(y_test)
    y_p = np.array(y_pred)

    residuals = y_t - y_p
    counts, bin_edges = np.histogram(residuals, bins=30)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

    return {
        "model_error_histogram": {
            "counts": counts.tolist(),
            "bin_centers": bin_centers.tolist(),
            "bin_edges": bin_edges.tolist()
        }
    }
'''

PARTIAL_DEPENDENCE_DATA = '''import pandas as pd
import numpy as np
from sklearn.inspection import partial_dependence
from typing import Annotated

def run(model, X_train: pd.DataFrame, feature: Annotated[str, "column"]) -> dict:
    if feature not in X_train.columns:
        return {"error": f"Feature '{feature}' not found in X_train."}

    if len(X_train) > 1000:
        X_sample = X_train.sample(1000, random_state=42)
    else:
        X_sample = X_train

    try:
        pd_result = partial_dependence(model, X_sample, features=[feature], kind='average')
        avg_response = pd_result['average'].tolist()
        grid_values = pd_result['grid_values'][0].tolist()

        return {
            "partial_dependence": {
                "feature": feature,
                "grid": grid_values,
                "average_response": avg_response
            }
        }
    except Exception as e:
        return {"error": f"Partial dependence failed: {str(e)}"}
'''

MULTICLASS_ROC_DATA = '''import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

def run(model, X_test, y_test) -> dict:
    if not hasattr(model, "predict_proba"):
        return {"error": "Model does not support predict_proba, which is required for ROC."}

    y_t = np.array(y_test)
    classes = np.unique(y_t)

    if len(classes) < 2:
        return {"error": "Need at least 2 classes for ROC."}

    try:
        y_prob = model.predict_proba(X_test)
    except Exception as e:
        return {"error": f"predict_proba failed: {str(e)}"}

    roc_data = {}

    if len(classes) == 2:
        fpr, tpr, _ = roc_curve(y_t, y_prob[:, 1], pos_label=classes[1])
        roc_data[str(classes[1])] = {
            "fpr": fpr.tolist(),
            "tpr": tpr.tolist(),
            "auc": float(auc(fpr, tpr))
        }
    else:
        y_test_bin = label_binarize(y_t, classes=classes)
        for i, cls in enumerate(classes):
            fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
            roc_data[str(cls)] = {
                "fpr": fpr.tolist(),
                "tpr": tpr.tolist(),
                "auc": float(auc(fpr, tpr))
            }

    return {"roc_curves": roc_data}
'''

DBSCAN_CLUSTERING = '''import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN

def run(data: pd.DataFrame, eps: float = 0.5, min_samples: int = 5) -> dict:
    numeric_df = data.select_dtypes(include=[np.number]).dropna()
    if numeric_df.empty:
        return {"error": "No numeric data available for DBSCAN."}

    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    labels = dbscan.fit_predict(numeric_df)

    out_df = numeric_df.copy()
    out_df["ClusterID"] = labels

    noise_count = int((labels == -1).sum())

    return {
        "data": out_df,
        "labels": labels.tolist(),
        "summary": f"DBSCAN found {len(set(labels)) - (1 if -1 in labels else 0)} clusters and {noise_count} noise points."
    }
'''

TSNE_VISUALIZER = '''import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from typing import Annotated

def run(
    data: pd.DataFrame,
    label_column: Annotated[str, "column"] = "",
    n_clusters: int = 4,
    perplexity: float = 30.0,
) -> dict:
    """t-SNE 2D scatter with automatic cluster coloring.

    label_column : name of a column whose values will color the dots.
                   Leave blank → KMeans auto-clustering (n_clusters groups).
    n_clusters : number of KMeans clusters used when label_column is blank.
    perplexity : t-SNE perplexity (5–50 works for most datasets).
    """
    df = data.copy()
    numeric_df = df.select_dtypes(include=[np.number]).dropna()

    if numeric_df.empty:
        return {"error": "No numeric data available for t-SNE."}

    # Cap at 3000 rows for speed; keep a reproducible sample
    if len(numeric_df) > 3000:
        numeric_df = numeric_df.sample(3000, random_state=42)

    # ── Run t-SNE ──────────────────────────────────────────────────────────────
    perp = min(float(perplexity), max(5.0, (len(numeric_df) - 1) / 3))
    tsne = TSNE(n_components=2, perplexity=perp, random_state=42, n_iter=300)
    embedding = tsne.fit_transform(numeric_df.values)

    # ── Build labels array for color-coding ───────────────────────────────────
    col = label_column.strip() if label_column else ""

    if col and col in df.columns:
        # Use the requested column aligned to sampled rows
        raw = df[col].loc[numeric_df.index]
        dtype = raw.dtype

        if dtype == object or str(dtype) == "category":
            # Categorical: use as-is
            labels = [str(v) for v in raw]
        else:
            # Numeric target → bucket into 4 quantile bands for readability
            try:
                banded = pd.qcut(raw, q=min(n_clusters, raw.nunique()), duplicates="drop", labels=False)
                labels = [f"Q{int(v)+1}" if pd.notna(v) else "?" for v in banded]
            except Exception:
                labels = [str(v) for v in raw]
    else:
        # Auto-cluster with KMeans
        from sklearn.cluster import KMeans
        from sklearn.preprocessing import StandardScaler
        scaled = StandardScaler().fit_transform(numeric_df.values)
        k = min(n_clusters, len(numeric_df) - 1, 10)
        km = KMeans(n_clusters=k, random_state=42, n_init="auto")
        cluster_ids = km.fit_predict(scaled)
        labels = [f"Cluster {int(c)}" for c in cluster_ids]

    n = len(embedding)
    dot_size = 8 if n < 500 else (6 if n < 1500 else 4)

    return {
        "tsne_scatter": {
            "x": embedding[:, 0].tolist(),
            "y": embedding[:, 1].tolist(),
            "labels": labels,  # ← color-coding key
            "title": "t-SNE Cluster Visualization",
            "x_label": "Dim 1",
            "y_label": "Dim 2",
            "dot_size": dot_size,
        },
        "summary": f"t-SNE projection of {n} samples with {len(set(labels))} color groups.",
    }
'''

ISOLATION_FOREST_ANOMALY = '''import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

def run(data: pd.DataFrame, contamination: float = 0.05) -> dict:
    numeric_df = data.select_dtypes(include=[np.number]).dropna()
    if numeric_df.empty:
        return {"error": "No numeric data available for Isolation Forest."}

    clf = IsolationForest(contamination=contamination, random_state=42)
    labels = clf.fit_predict(numeric_df)

    out_df = numeric_df.copy()
    out_df["is_anomaly"] = labels

    anomaly_count = int((labels == -1).sum())

    return {
        "data": out_df,
        "summary": f"Isolation Forest detected {anomaly_count} anomalies."
    }
'''

STACKING_REGRESSOR = '''import pandas as pd
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
try:
    from xgboost import XGBRegressor
except ImportError:
    XGBRegressor = None

def run(X_train, y_train, X_test):
    estimators = [
        ('lr', LinearRegression()),
        ('rf', RandomForestRegressor(n_estimators=50, random_state=42))
    ]
    if XGBRegressor is not None:
        estimators.append(('xgb', XGBRegressor(n_estimators=50, random_state=42)))

    model = StackingRegressor(
        estimators=estimators,
        final_estimator=Ridge()
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return {
        "model": model,
        "y_pred": y_pred.tolist(),
        "summary": f"Stacked Regressor with {len(estimators)} base models and Ridge final estimator."
    }
'''

QUANTILE_REGRESSOR = '''import pandas as pd
from typing import Annotated

def run(X_train, y_train, X_test, quantile: Annotated[float, "percentile (0-1)"] = 0.5) -> dict:
    try:
        from sklearn.linear_model import QuantileRegressor
        model = QuantileRegressor(quantile=quantile, solver='highs')
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        return {
            "model": model,
            "y_pred": y_pred.tolist(),
            "summary": f"Quantile Regressor fitted for {quantile*100}th percentile."
        }
    except Exception as e:
        return {"error": f"QuantileRegressor failed: {str(e)}"}
'''

EPSILON_GREEDY_BANDIT = '''import pandas as pd
from typing import Annotated

def run(data: pd.DataFrame, action_col: Annotated[str, "column"], reward_col: Annotated[str, "column"], epsilon: float = 0.1) -> dict:
    if action_col not in data.columns or reward_col not in data.columns:
        return {"error": "Action or reward column not found."}

    # NOTE: epsilon is accepted for API symmetry but unused here; this node
    # estimates action values offline from the logged data via incremental means.
    actions = data[action_col].unique()
    q_values = {a: 0.0 for a in actions}
    action_counts = {a: 0 for a in actions}

    for _, row in data.iterrows():
        a = row[action_col]
        r = row[reward_col]
        action_counts[a] += 1
        q_values[a] += (r - q_values[a]) / action_counts[a]

    optimal_action = max(q_values, key=q_values.get)

    return {
        "q_values": q_values,
        "optimal_policy": str(optimal_action),
        "summary": f"Bandit converged on optimal action: {optimal_action}"
    }
'''

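# Worked example of the incremental-mean update above (illustrative only): for
# rewards 1, 0, 1 logged against the same action, q steps through
#   q = 0 + (1 - 0)/1 = 1.0
#   q = 1 + (0 - 1)/2 = 0.5
#   q = 0.5 + (1 - 0.5)/3 = 2/3,
# which equals the plain average of the three rewards, computed in one pass.
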
MARKOV_CHAIN_SIMULATOR = '''import pandas as pd
from typing import Annotated

def run(data: pd.DataFrame, state_col: Annotated[str, "column"]) -> dict:
    if state_col not in data.columns:
        return {"error": "State column not found."}

    states = data[state_col].astype(str).tolist()
    transitions = {}

    for i in range(len(states) - 1):
        curr_state = states[i]
        next_state = states[i+1]

        if curr_state not in transitions:
            transitions[curr_state] = {}
        if next_state not in transitions[curr_state]:
            transitions[curr_state][next_state] = 0

        transitions[curr_state][next_state] += 1

    probs = {}
    for state, next_states in transitions.items():
        total = sum(next_states.values())
        probs[state] = {k: v / total for k, v in next_states.items()}

    return {
        "transition_probabilities": probs,
        "summary": f"Markov Chain built with {len(probs)} distinct states."
    }
'''

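# Illustrative sketch (not shipped): the transition counting above on a tiny
# state sequence. `_demo_markov_chain` is hypothetical.
def _demo_markov_chain():
    import pandas as pd
    df = pd.DataFrame({"state": ["sun", "sun", "rain", "sun"]})
    ns = {}
    exec(MARKOV_CHAIN_SIMULATOR, ns)
    # Observed transitions: sun->sun once, sun->rain once, rain->sun once, so
    # P(sun->sun) = 0.5, P(sun->rain) = 0.5, P(rain->sun) = 1.0.
    return ns["run"](df, state_col="state")
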
SILHOUETTE_SCORE_NODE = '''import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score
from typing import Annotated

def run(data: pd.DataFrame, labels_col: Annotated[str, "column"]) -> dict:
    if labels_col not in data.columns:
        return {"error": "Labels column not found."}

    numeric_df = data.select_dtypes(include=[np.number]).drop(columns=[labels_col], errors='ignore').dropna()
    labels = data[labels_col].loc[numeric_df.index]

    unique_labels = len(set(labels))
    if unique_labels < 2 or unique_labels >= len(numeric_df):
        return {"error": "Silhouette score requires 2 <= n_clusters <= n_samples - 1"}

    if len(numeric_df) > 5000:
        idx = np.random.choice(numeric_df.index, 5000, replace=False)
        numeric_df = numeric_df.loc[idx]
        labels = labels.loc[idx]

    score = silhouette_score(numeric_df, labels)

    return {
        "silhouette_score": float(score),
        "summary": f"Clusters are {'well-separated' if score > 0.5 else 'overlapping'}. Score: {score:.3f}"
    }
'''

# ══════════════════════════════════════════════════════════════════════════════
# NEW TEMPLATES — Advanced Preprocessing, Clustering, Specialised Models, XAI
# ══════════════════════════════════════════════════════════════════════════════

# ── Advanced Preprocessing ────────────────────────────────────────────────────

PROPER_CAPITALIZATION_CLEANER = '''from typing import Annotated

def run(data, columns: str = "") -> dict:
    """Standardise text casing in string columns (title-case by default).
    Leave columns blank to process all object columns.
    """
    import pandas as pd

    df = data.copy()
    cols = [c.strip() for c in columns.split(",") if c.strip()] if columns.strip() \
        else df.select_dtypes(include=["object", "category"]).columns.tolist()
    fixed = []
    for col in cols:
        if col in df.columns:
            df[col] = df[col].astype(str).str.strip().str.title()
            fixed.append(col)
    return {"data": df, "fixed_columns": fixed}
'''

Z_SCORE_OUTLIER_DETECTOR = '''
def run(data, threshold: float = 3.0, method: str = "flag") -> dict:
    """Detect extreme outliers using Z-scores (|Z| > threshold).

    method='flag' — adds a boolean *_outlier column for each numeric feature.
    method='drop' — removes rows where ANY feature exceeds the threshold.
    method='clip' — clips values to ±threshold standard deviations.
    """
    import pandas as pd
    import numpy as np

    df = data.copy()
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    rows_before = len(df)

    if method == "flag":
        for col in num_cols:
            mean, std = df[col].mean(), df[col].std()
            if std > 0:
                df[f"{col}_outlier"] = (((df[col] - mean) / std).abs() > threshold)
        return {"data": df, "columns_flagged": len(num_cols)}

    elif method == "drop":
        keep = pd.Series(True, index=df.index)
        for col in num_cols:
            mean, std = df[col].mean(), df[col].std()
            if std > 0:
                keep &= ((df[col] - mean) / std).abs() <= threshold
        df = df[keep]
        return {"data": df, "rows_removed": rows_before - len(df)}

    else:  # clip
        for col in num_cols:
            mean, std = df[col].mean(), df[col].std()
            if std > 0:
                df[col] = df[col].clip(mean - threshold * std, mean + threshold * std)
        return {"data": df, "columns_clipped": len(num_cols)}
'''

BOX_COX_TRANSFORMER = '''from typing import Annotated

def run(data, columns: str = "") -> dict:
    """Apply Box-Cox transformation to stabilise variance in positive numeric columns.

    More powerful than log1p for regression targets — finds the optimal lambda
    per column. Skips columns with non-positive values (Box-Cox requires x > 0).
    Leave columns blank to transform all eligible numeric columns.
    """
    import pandas as pd
    import numpy as np
    from scipy.stats import boxcox

    df = data.copy()
    num_cols = [c.strip() for c in columns.split(",") if c.strip()] if columns.strip() \
        else df.select_dtypes(include=[np.number]).columns.tolist()

    transformed, skipped, lambdas = [], [], {}
    for col in num_cols:
        if col not in df.columns:
            continue
        if df[col].min() <= 0:
            skipped.append(col)
            continue
        # Transform only the non-null values and write them back by index, so
        # lengths stay aligned when the column contains NaNs.
        vals = df[col].dropna()
        bc_vals, lam = boxcox(vals)
        df.loc[vals.index, col] = bc_vals
        lambdas[col] = round(float(lam), 4)
        transformed.append(col)

    return {"data": df, "transformed": transformed, "skipped_non_positive": skipped, "lambdas": lambdas}
'''

MEAN_TARGET_ENCODER = '''from typing import Annotated

def run(data,
        column: Annotated[str, "column"] = "category",
        target: Annotated[str, "column"] = "target",
        smoothing: int = 10) -> dict:
    """Replace a categorical column with the smoothed mean of the target per category.

    Smoothing blends category mean with global mean to prevent overfitting on rare
    categories. The original column is dropped; a new *_mean_enc column is added.
    Ideal for high-cardinality columns in Kaggle-style tasks.
    """
    import pandas as pd
    import numpy as np

    df = data.copy()
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found.")
    if target not in df.columns:
        raise ValueError(f"Target '{target}' not found.")

    global_mean = df[target].mean()
    stats = df.groupby(column)[target].agg(["mean", "count"])
    smoothed = (stats["count"] * stats["mean"] + smoothing * global_mean) / (stats["count"] + smoothing)

    new_col = f"{column}_mean_enc"
    df[new_col] = df[column].map(smoothed).fillna(global_mean)
    df = df.drop(columns=[column])
    return {"data": df, "new_column": new_col, "unique_categories": int(len(stats))}
'''

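# Illustrative sketch (not shipped): the smoothing formula above worked on a
# tiny frame. `_demo_mean_target_encoding` is hypothetical.
def _demo_mean_target_encoding():
    import pandas as pd
    df = pd.DataFrame({"city": ["A", "A", "B"], "price": [100.0, 200.0, 400.0]})
    # Global mean = 700/3 ≈ 233.33. For city "A" (count 2, mean 150) with
    # smoothing=10: (2*150 + 10*233.33) / (2 + 10) ≈ 219.4, i.e. the rare
    # category is pulled strongly toward the global mean.
    ns = {}
    exec(MEAN_TARGET_ENCODER, ns)
    return ns["run"](df, column="city", target="price", smoothing=10)
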
COLOCATED_FEATURE_GENERATOR = '''from typing import Annotated

def run(data,
        lat_col: Annotated[str, "column"] = "latitude",
        lon_col: Annotated[str, "column"] = "longitude") -> dict:
    """Generate distance-based features from latitude/longitude coordinates.

    Adds: distance_from_centroid, distance_from_origin, lat_lon_product, lat_lon_ratio.
    Useful for real estate, logistics, and location-based prediction tasks.
    """
    import pandas as pd
    import numpy as np

    df = data.copy()
    for col in (lat_col, lon_col):
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found.")

    lat, lon = df[lat_col].astype(float), df[lon_col].astype(float)
    cx, cy = lat.mean(), lon.mean()

    df["distance_from_centroid"] = np.sqrt((lat - cx) ** 2 + (lon - cy) ** 2)
    df["distance_from_origin"] = np.sqrt(lat ** 2 + lon ** 2)
    df["lat_lon_product"] = lat * lon
    df["lat_lon_ratio"] = (lat / lon.replace(0, np.nan)).fillna(0)

    return {"data": df, "features_added": ["distance_from_centroid", "distance_from_origin", "lat_lon_product", "lat_lon_ratio"]}
'''

ZIP_CODE_GROUPER = '''from typing import Annotated

def run(data,
        zip_column: Annotated[str, "column"] = "zip_code",
        prefix_length: int = 3) -> dict:
    """Cluster postal codes into regional groups by truncating to the first N digits.

    Reduces cardinality from thousands of unique zip codes down to hundreds of regions.
    Adds a new *_region column and drops the original.
    """
    import pandas as pd

    df = data.copy()
    if zip_column not in df.columns:
        raise ValueError(f"Column '{zip_column}' not found.")

    region_col = f"{zip_column}_region"
    df[region_col] = df[zip_column].astype(str).str.strip().str[:prefix_length].str.zfill(prefix_length)
    original_unique = int(df[zip_column].nunique())
    region_unique = int(df[region_col].nunique())
    df = df.drop(columns=[zip_column])
    return {"data": df, "original_unique": original_unique, "region_unique": region_unique, "new_column": region_col}
'''

2385
|
+
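# Caveat worth knowing about the code above (shown with hypothetical values):
# zfill pads only after truncation, so ZIPs stored as integers have already lost
# their leading zeros upstream. 7110 (originally "07110") becomes region "711",
# not "071", while genuinely short strings are padded:
#
#   "94107" -> "941"    (str[:3]; zfill(3) is a no-op)
#   12      -> "012"    ("12"[:3] == "12", then zfill(3) pads)
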
NULL_INDICATOR_CREATOR = '''
def run(data, min_missing_rate: float = 0.0) -> dict:
    """Add boolean indicator columns for columns that contain missing values.

    For each column with missing rate > min_missing_rate, adds a *_was_null
    binary column (1 = was missing, 0 = was present).

    Sometimes the FACT that data is missing is itself a predictive signal
    (e.g., a blank 'income' field may indicate unemployment).
    """
    import pandas as pd

    df = data.copy()
    missing_rates = df.isnull().mean()
    eligible = missing_rates[missing_rates > min_missing_rate].index.tolist()

    added = []
    for col in eligible:
        indicator = f"{col}_was_null"
        df[indicator] = df[col].isnull().astype(int)
        added.append(indicator)

    return {"data": df, "indicators_added": added, "count": len(added)}
'''

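# Minimal sketch of the indicator logic (hypothetical data, same exec()-based
# invocation as the sketches above):
#
#   import numpy as np, pandas as pd
#   ns: dict = {}
#   exec(NULL_INDICATOR_CREATOR, ns)
#   df = pd.DataFrame({"income": [50_000, np.nan, 72_000]})
#   out = ns["run"](df)
#   # out["data"]["income_was_null"].tolist() == [0, 1, 0]; out["count"] == 1
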
# ── Unsupervised & Clustering ─────────────────────────────────────────────────

UMAP_DIMENSIONALITY_REDUCTION = '''
def run(data, n_components: int = 2, n_neighbors: int = 15, min_dist: float = 0.1) -> dict:
    """Reduce dimensions to 2D (or n_components) using UMAP.

    Falls back to t-SNE if umap-learn is not installed.
    Samples up to 10,000 rows for speed. Adds umap_1, umap_2 columns.
    """
    import numpy as np
    import pandas as pd

    df = data.copy()
    num_df = df.select_dtypes(include=[np.number]).dropna()
    if num_df.empty:
        raise ValueError("No numeric columns found for UMAP.")
    if len(num_df) > 10_000:
        num_df = num_df.sample(10_000, random_state=42)

    try:
        import umap
        reducer = umap.UMAP(n_components=n_components, n_neighbors=n_neighbors,
                            min_dist=min_dist, random_state=42)
        embedding = reducer.fit_transform(num_df.values)
        method = "UMAP"
    except ImportError:
        from sklearn.manifold import TSNE
        reducer = TSNE(n_components=min(n_components, 3), perplexity=min(30, len(num_df) - 1),
                       random_state=42, n_iter=500)
        embedding = reducer.fit_transform(num_df.values)
        method = "t-SNE (UMAP fallback — pip install umap-learn)"

    out_df = num_df.copy()
    for i in range(embedding.shape[1]):
        out_df[f"umap_{i+1}"] = embedding[:, i]

    return {
        "data": out_df,
        "method": method,
        "feature_target_scatter": {
            "x": embedding[:, 0].tolist(),
            "y": embedding[:, 1].tolist(),
            "feature_name": "umap_1",
            "target_name": "umap_2",
        },
    }
'''

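# Design notes on the fallback above: scikit-learn's Barnes-Hut t-SNE supports at
# most 3 output components (hence min(n_components, 3)), and perplexity must stay
# below the number of samples (hence min(30, len(num_df) - 1)). Also be aware that
# TSNE's `n_iter` keyword was renamed `max_iter` in recent scikit-learn releases,
# so the fallback path assumes an older pin in requirements.txt.
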
ELBOW_METHOD_DATA = '''
def run(data, max_k: int = 10, sample_size: int = 5000) -> dict:
    """Compute K-Means inertia for K = 1 … max_k to produce an Elbow curve.

    Run this BEFORE KMeans to pick the optimal number of clusters.
    Returns inertias and a recommended_k heuristic (largest second-derivative).
    """
    import numpy as np
    import pandas as pd
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler

    df = data.select_dtypes(include=[np.number]).dropna()
    if len(df) > sample_size:
        df = df.sample(sample_size, random_state=42)

    scaler = StandardScaler()
    X = scaler.fit_transform(df)

    inertias, ks = [], list(range(1, max_k + 1))
    for k in ks:
        km = KMeans(n_clusters=k, n_init=5, random_state=42)
        km.fit(X)
        inertias.append(float(km.inertia_))

    # Second-derivative heuristic for recommended K
    recommended_k = 2
    if len(inertias) >= 3:
        diffs2 = [inertias[i-1] - 2*inertias[i] + inertias[i+1] for i in range(1, len(inertias)-1)]
        recommended_k = diffs2.index(max(diffs2)) + 2  # +2 because we start at k=1

    return {"ks": ks, "inertias": inertias, "recommended_k": recommended_k, "summary": f"Elbow suggests K={recommended_k}"}
'''

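# Worked example of the second-derivative heuristic above (made-up inertias):
# inertias = [100, 50, 30, 25, 24] for k = 1..5 gives
# diffs2 = [100 - 2*50 + 30, 50 - 2*30 + 25, 30 - 2*25 + 24] = [30, 15, 4],
# whose maximum sits at index 0, so recommended_k = 0 + 2 = 2: the sharpest
# bend in the curve.
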
# ── Specialised Models ────────────────────────────────────────────────────────

BALANCED_RF_CLASSIFIER = '''from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def run(
    X_train, X_test, y_train,
    n_estimators: int = 100,
    max_depth: int = 0,
    random_state: int = 42,
    cross_validation: bool = False,
    cv_folds: int = 5,
) -> dict:
    """Random Forest with class_weight='balanced' — handles imbalanced classes automatically.

    Unlike a standard RF, every tree is trained with inverse-frequency sample weights so
    minority classes receive equal attention. No SMOTE required.
    """
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score

    md = max_depth if max_depth > 0 else None
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=md,
        class_weight="balanced",
        random_state=random_state,
    )
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="f1_weighted")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4)}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

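# How 'balanced' weighting works (scikit-learn's formula, shown with made-up
# counts): w_c = n_samples / (n_classes * n_c). With 900 negatives and 100
# positives, w_neg = 1000 / (2 * 900) ≈ 0.56 and w_pos = 1000 / (2 * 100) = 5.0,
# so each minority sample counts roughly 9x more in the split criterion,
# rebalancing the forest without resampling the data.
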
RIDGE_CV_REGRESSOR = '''
def run(X_train, X_test, y_train, cv_folds: int = 5) -> dict:
    """Ridge regression with automatic alpha selection via built-in cross-validation.

    RidgeCV tests a range of regularisation strengths and picks the best one
    automatically — no manual hyperparameter tuning needed.
    """
    import numpy as np
    from sklearn.linear_model import RidgeCV

    alphas = [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
    model = RidgeCV(alphas=alphas, cv=cv_folds)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {
        "model": model,
        "y_pred": y_pred,
        "best_alpha": float(model.alpha_),
        "summary": f"RidgeCV selected alpha={model.alpha_:.4g}",
    }
'''

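# Design note on the grid above: the alphas span five decades, equivalent to
# np.logspace(-2, 3, 6), which is usually wide enough to bracket the optimum.
# Passing an integer cv makes RidgeCV grid-search with k-fold cross-validation;
# with cv=None it would instead use the much faster generalised leave-one-out
# formulation, at the cost of not matching the cv_folds parameter.
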
POISSON_REGRESSOR = '''
def run(
    X_train, X_test, y_train,
    max_iter: int = 300,
    alpha: float = 1.0,
    cross_validation: bool = False,
    cv_folds: int = 5,
) -> dict:
    """Poisson regression — purpose-built for non-negative integer count data.

    Use when the target represents counts (insurance claims, website visits,
    defects per unit). Standard linear regression is incorrect for count targets.
    """
    import numpy as np
    from sklearn.linear_model import PoissonRegressor
    from sklearn.model_selection import cross_val_score

    model = PoissonRegressor(alpha=alpha, max_iter=max_iter)
    if cross_validation:
        scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring="r2")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {"model": model, "y_pred": y_pred,
                "cv_mean": round(float(scores.mean()), 4),
                "cv_std": round(float(scores.std()), 4)}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {"model": model, "y_pred": y_pred}
'''

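# Why a GLM rather than plain least squares: PoissonRegressor fits a log link,
# E[y | X] = exp(Xw + b), so predictions are always positive and the error model
# matches count behaviour (variance roughly equal to the mean). The target must
# be non-negative; fractional values are accepted, but counts are the intended use.
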
# ── Deep Evaluation & XAI ────────────────────────────────────────────────────

SHAP_EXPLAINER = '''
def run(model, X_train, X_test) -> dict:
    """Compute SHAP (Shapley Additive Explanations) values for model interpretability.

    Returns per-feature mean |SHAP| importances and the top feature.
    Falls back to permutation importance if shap is not installed.
    Samples up to 200 rows from X_test for speed.
    """
    import numpy as np
    import pandas as pd

    cols = list(X_train.columns) if hasattr(X_train, "columns") else [f"f{i}" for i in range(X_train.shape[1])]
    X_sample = X_test[:200] if len(X_test) > 200 else X_test

    try:
        import shap
        # TreeExplainer is fast for tree-based models; falls back to KernelExplainer otherwise
        try:
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X_sample)
        except Exception:
            explainer = shap.KernelExplainer(model.predict, shap.sample(X_sample, 50))
            shap_values = explainer.shap_values(X_sample, nsamples=100)

        # For multiclass, shap_values is a list — take mean over classes
        if isinstance(shap_values, list):
            shap_arr = np.mean([np.abs(sv) for sv in shap_values], axis=0)
        else:
            shap_arr = np.abs(shap_values)

        mean_shap = shap_arr.mean(axis=0)
        pairs = sorted(zip(mean_shap, cols), reverse=True)
        return {
            "feature_importances": {c: round(float(v), 6) for v, c in pairs},
            "top_feature": pairs[0][1],
            "top_shap_value": round(float(pairs[0][0]), 6),
            "method": "SHAP",
        }

    except ImportError:
        # Fallback: permutation importance, scored against the model's own
        # predictions rather than true labels, so it measures feature reliance
        from sklearn.inspection import permutation_importance
        r = permutation_importance(model, X_sample, model.predict(X_sample), n_repeats=5, random_state=42)
        pairs = sorted(zip(r.importances_mean, cols), reverse=True)
        return {
            "feature_importances": {c: round(float(v), 6) for v, c in pairs},
            "top_feature": pairs[0][1],
            "top_shap_value": round(float(pairs[0][0]), 6),
            "method": "Permutation Importance (pip install shap for SHAP values)",
        }
'''

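# What makes SHAP values trustworthy is local additivity: for a fitted tree
# *regressor*, the explainer's expected value plus the row's SHAP values equals
# the model's prediction for that row. A sketch (return shapes vary by shap
# version and model type, so treat this as illustrative):
#
#   import shap
#   ex = shap.TreeExplainer(model)          # some fitted tree regressor
#   sv = ex.shap_values(X_test[:1])
#   # ex.expected_value + sv.sum() ≈ model.predict(X_test[:1])[0]
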
PARTIAL_DEPENDENCE_PLOTS = '''from typing import Annotated

def run(model, X_train, feature: Annotated[str, "column"] = "feature_name") -> dict:
    """Compute Partial Dependence Plot data: the marginal effect of one feature on predictions.

    Shows HOW the model's output changes as a single feature varies, holding all others constant.
    Crucial for understanding the direction and shape of a feature's influence.
    """
    import numpy as np
    import pandas as pd
    from sklearn.inspection import partial_dependence

    if feature not in X_train.columns:
        raise ValueError(f"Feature '{feature}' not in X_train columns: {list(X_train.columns)}")

    sample = X_train.sample(min(1000, len(X_train)), random_state=42) if len(X_train) > 1000 else X_train

    try:
        result = partial_dependence(model, sample, features=[feature], kind="average")
        grid = result["grid_values"][0].tolist()
        response = result["average"][0].tolist()
    except Exception as e:
        raise ValueError(f"Partial dependence failed: {e}")

    return {
        "partial_dependence": {
            "feature": feature,
            "grid": grid,
            "average_response": response,
        },
        "feature_name": feature,
    }
'''

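# What "average" means above: for each grid value v of the chosen feature, the
# partial dependence is PD(v) = (1/n) * sum_i f(v, x_i_other); the feature is
# clamped to v for every sampled row and the predictions are averaged. This is
# also why strongly correlated features can produce misleading PDPs: clamping
# creates row combinations the model never saw during training.
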
CALIBRATION_CURVE_DATA = '''
def run(model, X_test, y_test, n_bins: int = 10) -> dict:
    """Check if a classifier's predicted probabilities are well-calibrated.

    A perfectly calibrated model is correct 90% of the time on the predictions
    it makes with 90% confidence. Returns fraction_of_positives vs
    mean_predicted_value for plotting.
    """
    import numpy as np
    from sklearn.calibration import calibration_curve

    if not hasattr(model, "predict_proba"):
        raise ValueError("Model must support predict_proba for calibration analysis.")

    y_prob = model.predict_proba(X_test)
    # Binary: use positive class; multiclass: use max probability
    if y_prob.shape[1] == 2:
        prob_pos = y_prob[:, 1]
        y_bin = np.asarray(y_test)
    else:
        prob_pos = y_prob.max(axis=1)
        y_bin = (np.asarray(y_test) == model.classes_[y_prob.argmax(axis=1)]).astype(int)

    frac_pos, mean_pred = calibration_curve(y_bin, prob_pos, n_bins=n_bins, strategy="uniform")

    brier = float(np.mean((prob_pos - y_bin) ** 2))
    return {
        "fraction_of_positives": frac_pos.tolist(),
        "mean_predicted_value": mean_pred.tolist(),
        "brier_score": round(brier, 4),
        "summary": f"Brier score: {brier:.4f} (lower = better calibrated)",
    }
'''

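# Brier score intuition (it is mean squared error on probabilities): predicting
# 0.5 for everything on a binary task scores (0.5)**2 = 0.25, the "know-nothing"
# ceiling, while a perfect hard predictor scores 0.0. Anything above ~0.25 on
# binary data is worse than always guessing the base rate.
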
LEARNING_CURVE_ANALYZER = '''
def run(model, X_train, y_train, cv_folds: int = 5, scoring: str = "accuracy") -> dict:
    """Plot training vs validation score across increasing dataset sizes.

    Diagnoses model behaviour:
    - High train score, low val score → overfitting (try regularisation)
    - Both scores low → underfitting (try more complex model or features)
    - Scores converge at high sample count → more data won't help
    """
    import numpy as np
    from sklearn.model_selection import learning_curve

    # Use 5 evenly spaced training sizes from 10% to 100%
    train_sizes_abs, train_scores, val_scores = learning_curve(
        model, X_train, y_train,
        cv=cv_folds,
        scoring=scoring,
        train_sizes=np.linspace(0.1, 1.0, 5),
        n_jobs=-1,
    )

    return {
        "train_sizes": train_sizes_abs.tolist(),
        "train_scores": train_scores.mean(axis=1).round(4).tolist(),
        "val_scores": val_scores.mean(axis=1).round(4).tolist(),
        "train_std": train_scores.std(axis=1).round(4).tolist(),
        "val_std": val_scores.std(axis=1).round(4).tolist(),
        "scoring": scoring,
        "summary": f"Final val {scoring}: {val_scores.mean(axis=1)[-1]:.4f}",
    }
'''

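# Reading the output (made-up numbers): train_scores of [0.99, 0.99, 0.98] with
# val_scores of [0.71, 0.74, 0.75] is the overfitting signature, a persistent gap
# that narrows slowly; train [0.72, 0.71, 0.71] with val [0.69, 0.70, 0.70] is
# underfitting, where both curves plateau low and more data will not close anything.
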
COST_BENEFIT_MATRIX = '''
def run(
    y_test, y_pred,
    tp_value: float = 100.0,
    fp_cost: float = 10.0,
    fn_cost: float = 50.0,
    tn_value: float = 0.0,
) -> dict:
    """Compute business value of model predictions using a cost-benefit matrix.

    Assign dollar values to each outcome type (TP, FP, FN, TN) to translate
    model accuracy into real business impact. Essential for fraud detection,
    churn prevention, and medical diagnostics where error costs differ.
    """
    import numpy as np
    from sklearn.metrics import confusion_matrix

    y_t, y_p = np.asarray(y_test), np.asarray(y_pred)
    if len(np.unique(y_t)) != 2:
        raise ValueError("Cost-benefit matrix requires binary classification.")

    cm = confusion_matrix(y_t, y_p)
    tn, fp, fn, tp = cm.ravel()

    total_value = tp * tp_value - fp * fp_cost - fn * fn_cost + tn * tn_value
    # Baseline: the value of naively predicting positive for every case
    # (all actual positives become TPs, all actual negatives become FPs)
    random_value = (len(y_t) * y_t.mean()) * tp_value - (len(y_t) * (1 - y_t.mean())) * fp_cost
    value_per_case = total_value / len(y_t)

    return {
        "confusion_matrix": {"TP": int(tp), "FP": int(fp), "FN": int(fn), "TN": int(tn)},
        "total_business_value": round(total_value, 2),
        "random_baseline_value": round(random_value, 2),
        "value_per_case": round(value_per_case, 2),
        "roi_vs_random": round((total_value - random_value) / max(abs(random_value), 1) * 100, 1),
        "summary": f"Model generates ${total_value:,.0f} total vs ${random_value:,.0f} predict-all-positive baseline.",
    }
'''

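# Worked example with the default prices (hypothetical confusion matrix): a model
# yielding TP=80, FP=20, FN=10, TN=890 is worth
# 80*100 - 20*10 - 10*50 + 890*0 = $7,300 in total, i.e. $7.30 per case over the
# 1,000 cases, a number stakeholders can weigh directly against deployment cost.
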
TEMPLATES: list[dict] = [
    # Data
    {"id": "csv_loader", "label": "CSV Loader", "category": "Data", "code": CSV_LOADER},
    {"id": "csv_exporter", "label": "CSV Exporter", "category": "Data", "code": CSV_EXPORTER},
    {"id": "eda", "label": "EDA", "category": "Data", "code": EDA},
    {"id": "eda_histogram", "label": "Histogram", "category": "Visualize", "code": EDA_HISTOGRAM},
    {"id": "eda_correlation", "label": "Correlation Matrix", "category": "Visualize", "code": EDA_CORRELATION},
    {"id": "eda_value_counts", "label": "Value Counts", "category": "Visualize", "code": EDA_VALUE_COUNTS},
    {"id": "eda_box_plot", "label": "Box Plot", "category": "Visualize", "code": EDA_BOX_PLOT},
    {"id": "predict", "label": "Predict", "category": "Visualize", "code": PREDICT},
    # Preprocessing
    {"id": "data_cleaning", "label": "Data Cleaning", "category": "Preprocessing", "code": DATA_CLEANING},
    {"id": "smart_outlier_remover", "label": "Smart Outlier Remover", "category": "Preprocessing", "code": SMART_OUTLIER_REMOVER},
    {"id": "advanced_imputer", "label": "Advanced Imputer", "category": "Preprocessing", "code": ADVANCED_IMPUTER},
    {"id": "skewness_fixer", "label": "Skewness Fixer", "category": "Preprocessing", "code": SKEWNESS_FIXER},
    {"id": "high_cardinality_encoder", "label": "High-Cardinality Encoder", "category": "Preprocessing", "code": HIGH_CARDINALITY_ENCODER},
    {"id": "feature_scaler_robust", "label": "Robust Scaler", "category": "Preprocessing", "code": FEATURE_SCALER_ROBUST},
    {"id": "multicollinearity_filter", "label": "Multicollinearity Filter", "category": "Preprocessing", "code": MULTICOLLINEARITY_FILTER},
    {"id": "text_cleaner_basic", "label": "Text Cleaner", "category": "Preprocessing", "code": TEXT_CLEANER_BASIC},
    {"id": "smote_sampler", "label": "SMOTE Sampler", "category": "Preprocessing", "code": SMOTE_SAMPLER},
    {"id": "label_encoder", "label": "Label Encoder", "category": "Preprocessing", "code": LABEL_ENCODER},
    {"id": "one_hot_encoder", "label": "One-Hot Encoder", "category": "Preprocessing", "code": ONE_HOT_ENCODER},
    {"id": "train_test_split", "label": "Train/Test Split", "category": "Preprocessing", "code": TRAIN_TEST_SPLIT},
    {"id": "standard_scaler", "label": "Standard Scaler", "category": "Preprocessing", "code": STANDARD_SCALER},
    {"id": "min_max_scaler", "label": "Min-Max Scaler", "category": "Preprocessing", "code": MIN_MAX_SCALER},
    {"id": "pca", "label": "PCA", "category": "Preprocessing", "code": PCA},
    {"id": "polynomial_features", "label": "Polynomial Features", "category": "Preprocessing", "code": POLYNOMIAL_FEATURES},
    {"id": "datetime_extractor", "label": "Datetime Extractor", "category": "Preprocessing", "code": DATETIME_EXTRACTOR},
    {"id": "binary_encoder", "label": "Binary Encoder", "category": "Preprocessing", "code": BINARY_ENCODER},
    {"id": "frequency_encoder", "label": "Frequency Encoder", "category": "Preprocessing", "code": FREQUENCY_ENCODER},
    {"id": "ordinal_encoder", "label": "Ordinal Encoder", "category": "Preprocessing", "code": ORDINAL_ENCODER},
    {"id": "vif_feature_selection", "label": "VIF Feature Selection", "category": "Preprocessing", "code": VIF_FEATURE_SELECTION},
    {"id": "pca_whitening", "label": "PCA Whitening", "category": "Preprocessing", "code": PCA_WHITENING},
    {"id": "k_means_clustering_features", "label": "K-Means Features", "category": "Preprocessing", "code": K_MEANS_CLUSTERING_FEATURES},
    {"id": "lag_feature_generator", "label": "Lag Features", "category": "Preprocessing", "code": LAG_FEATURE_GENERATOR},
    {"id": "rolling_window_stats", "label": "Rolling Window Stats", "category": "Preprocessing", "code": ROLLING_WINDOW_STATS},
    {"id": "tomek_links", "label": "Tomek Links", "category": "Preprocessing", "code": TOMEK_LINKS},
    {"id": "random_under_sampler", "label": "Random Under Sampler", "category": "Preprocessing", "code": RANDOM_UNDER_SAMPLER},
    {"id": "rfe_feature_selector", "label": "RFE Feature Selector", "category": "Preprocessing", "code": RFE_FEATURE_SELECTOR},
    # Classifiers
    {"id": "logistic_regression", "label": "Logistic Regression", "category": "Classifiers", "code": LOGISTIC_REGRESSION},
    {"id": "random_forest_classifier", "label": "RF Classifier", "category": "Classifiers", "code": RANDOM_FOREST_CLASSIFIER},
    {"id": "gradient_boosting_classifier", "label": "GB Classifier", "category": "Classifiers", "code": GRADIENT_BOOSTING_CLASSIFIER},
    {"id": "decision_tree_classifier", "label": "DT Classifier", "category": "Classifiers", "code": DECISION_TREE_CLASSIFIER},
    {"id": "svm_classifier", "label": "SVM Classifier", "category": "Classifiers", "code": SVM_CLASSIFIER},
    {"id": "knn_classifier", "label": "KNN Classifier", "category": "Classifiers", "code": KNN_CLASSIFIER},
    {"id": "xgboost_node", "label": "XGBoost", "category": "Classifiers", "code": XGBOOST_NODE},
    {"id": "lightgbm_node", "label": "LightGBM", "category": "Classifiers", "code": LIGHTGBM_NODE},
    {"id": "adaboost_node", "label": "AdaBoost", "category": "Classifiers", "code": ADABOOST_NODE},
    {"id": "voting_ensemble", "label": "Voting Ensemble", "category": "Classifiers", "code": VOTING_ENSEMBLE},
    # Regressors
    {"id": "linear_regression", "label": "Linear Regression", "category": "Regression", "code": LINEAR_REGRESSION},
    {"id": "random_forest_regressor", "label": "RF Regressor", "category": "Regression", "code": RANDOM_FOREST_REGRESSOR},
    {"id": "gradient_boosting_regressor", "label": "GB Regressor", "category": "Regression", "code": GRADIENT_BOOSTING_REGRESSOR},
    {"id": "decision_tree_regressor", "label": "DT Regressor", "category": "Regression", "code": DECISION_TREE_REGRESSOR},
    {"id": "svm_regressor", "label": "SVM Regressor", "category": "Regression", "code": SVM_REGRESSOR},
    {"id": "knn_regressor", "label": "KNN Regressor", "category": "Regression", "code": KNN_REGRESSOR},
    # Evaluation
    {"id": "accuracy", "label": "Accuracy", "category": "Evaluation", "code": ACCURACY},
    {"id": "classification_report", "label": "Classification Report", "category": "Evaluation", "code": CLASSIFICATION_REPORT},
    {"id": "validation_report", "label": "Validation Report", "category": "Evaluation", "code": VALIDATION_REPORT},
    {"id": "regression_metrics", "label": "Regression Metrics", "category": "Evaluation", "code": REGRESSION_METRICS},
    {"id": "feature_importance", "label": "Feature Importance", "category": "Evaluation", "code": FEATURE_IMPORTANCE},
    {"id": "confusion_matrix_plotter", "label": "Confusion Matrix", "category": "Visualize", "code": CONFUSION_MATRIX_PLOTTER},
    {"id": "roc_pr_curve_data", "label": "ROC & PR Curves", "category": "Visualize", "code": ROC_PR_CURVE_DATA},
    {"id": "residual_plotter", "label": "Residual Plot", "category": "Visualize", "code": RESIDUAL_PLOTTER},
    {"id": "feature_importance_visualizer", "label": "Feature Importance Plot", "category": "Visualize", "code": FEATURE_IMPORTANCE_VISUALIZER},
    {"id": "decision_boundary_2d", "label": "Decision Boundary 2D", "category": "Visualize", "code": DECISION_BOUNDARY_2D},
    {"id": "prediction_vs_actual_scatter", "label": "Pred vs Actual Scatter", "category": "Visualize", "code": PREDICTION_VS_ACTUAL_SCATTER},
    {"id": "inverse_target_transformer", "label": "Inverse Transformer", "category": "Evaluation", "code": INVERSE_TARGET_TRANSFORMER},
    {"id": "threshold_optimizer", "label": "Threshold Optimizer", "category": "Evaluation", "code": THRESHOLD_OPTIMIZER},
    {"id": "permutation_importance", "label": "Permutation Importance", "category": "Evaluation", "code": PERMUTATION_IMPORTANCE},
    {"id": "learning_curve_data", "label": "Learning Curve", "category": "Evaluation", "code": LEARNING_CURVE_DATA},
    {"id": "lift_gain_charts", "label": "Lift & Gain Charts", "category": "Evaluation", "code": LIFT_GAIN_CHARTS},
    {"id": "auto_ml", "label": "AutoML", "category": "Evaluation", "code": AUTO_ML},
    {"id": "correlation_heatmap", "label": "Correlation Heatmap", "category": "Visualize", "code": CORRELATION_HEATMAP},
    {"id": "missing_value_map", "label": "Missing Value Map", "category": "Visualize", "code": MISSING_VALUE_MAP},
    {"id": "class_balance_visualizer", "label": "Class Balance", "category": "Visualize", "code": CLASS_BALANCE_VISUALIZER},
    {"id": "feature_target_scatter", "label": "Feature/Target Scatter", "category": "Visualize", "code": FEATURE_TARGET_SCATTER},
    {"id": "model_error_histogram", "label": "Model Error Histogram", "category": "Visualize", "code": MODEL_ERROR_HISTOGRAM},
    {"id": "partial_dependence_data", "label": "Partial Dependence", "category": "Visualize", "code": PARTIAL_DEPENDENCE_DATA},
    {"id": "multiclass_roc_data", "label": "Multiclass ROC", "category": "Visualize", "code": MULTICLASS_ROC_DATA},

    # Unsupervised
    {"id": "dbscan_clustering", "label": "DBSCAN Clustering", "category": "Unsupervised", "code": DBSCAN_CLUSTERING},
    {"id": "tsne_visualizer", "label": "t-SNE Visualizer", "category": "Unsupervised", "code": TSNE_VISUALIZER},
    {"id": "isolation_forest_anomaly", "label": "Isolation Forest", "category": "Unsupervised", "code": ISOLATION_FOREST_ANOMALY},

    # Ensembles
    {"id": "stacking_regressor", "label": "Stacking Regressor", "category": "Regression", "code": STACKING_REGRESSOR},
    {"id": "quantile_regressor", "label": "Quantile Regressor", "category": "Regression", "code": QUANTILE_REGRESSOR},

    # Agentic / Reinforcement
    {"id": "epsilon_greedy_bandit", "label": "Epsilon-Greedy Bandit", "category": "Agentic", "code": EPSILON_GREEDY_BANDIT},
    {"id": "markov_chain_simulator", "label": "Markov Chain Simulator", "category": "Agentic", "code": MARKOV_CHAIN_SIMULATOR},

    # Evaluation
    {"id": "silhouette_score_node", "label": "Silhouette Score", "category": "Evaluation", "code": SILHOUETTE_SCORE_NODE},

    # ── Advanced Preprocessing ───────────────────────────────────────────────
    {"id": "proper_capitalization_cleaner", "label": "Capitalisation Cleaner", "category": "Preprocessing", "code": PROPER_CAPITALIZATION_CLEANER},
    {"id": "z_score_outlier_detector", "label": "Z-Score Outlier Detector", "category": "Preprocessing", "code": Z_SCORE_OUTLIER_DETECTOR},
    {"id": "box_cox_transformer", "label": "Box-Cox Transformer", "category": "Preprocessing", "code": BOX_COX_TRANSFORMER},
    {"id": "mean_target_encoder", "label": "Mean Target Encoder", "category": "Preprocessing", "code": MEAN_TARGET_ENCODER},
    {"id": "colocated_feature_generator", "label": "Geo Feature Generator", "category": "Preprocessing", "code": COLOCATED_FEATURE_GENERATOR},
    {"id": "zip_code_grouper", "label": "Zip Code Grouper", "category": "Preprocessing", "code": ZIP_CODE_GROUPER},
    {"id": "null_indicator_creator", "label": "Null Indicator Creator", "category": "Preprocessing", "code": NULL_INDICATOR_CREATOR},

    # ── Unsupervised & Clustering ────────────────────────────────────────────
    {"id": "umap_dimensionality_reduction", "label": "UMAP Reducer", "category": "Unsupervised", "code": UMAP_DIMENSIONALITY_REDUCTION},
    {"id": "elbow_method_data", "label": "Elbow Method", "category": "Unsupervised", "code": ELBOW_METHOD_DATA},

    # ── Specialised Models ───────────────────────────────────────────────────
    {"id": "balanced_rf_classifier", "label": "Balanced RF Classifier", "category": "Classifiers", "code": BALANCED_RF_CLASSIFIER},
    {"id": "ridge_cv_regressor", "label": "RidgeCV Regressor", "category": "Regression", "code": RIDGE_CV_REGRESSOR},
    {"id": "poisson_regressor", "label": "Poisson Regressor", "category": "Regression", "code": POISSON_REGRESSOR},

    # ── Deep Evaluation & XAI ────────────────────────────────────────────────
    {"id": "shap_explainer", "label": "SHAP Explainer", "category": "Evaluation", "code": SHAP_EXPLAINER},
    {"id": "partial_dependence_plots", "label": "Partial Dependence Plots", "category": "Evaluation", "code": PARTIAL_DEPENDENCE_PLOTS},
    {"id": "calibration_curve_data", "label": "Calibration Curve", "category": "Evaluation", "code": CALIBRATION_CURVE_DATA},
    {"id": "learning_curve_analyzer", "label": "Learning Curve Analyzer", "category": "Evaluation", "code": LEARNING_CURVE_ANALYZER},
    {"id": "cost_benefit_matrix", "label": "Cost-Benefit Matrix", "category": "Evaluation", "code": COST_BENEFIT_MATRIX},
]


def get_template(template_id: str) -> dict | None:
    for t in TEMPLATES:
        if t["id"] == template_id:
            return t
    return None
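
# Lookup is a linear scan, which is fine at this list's size; for example,
# get_template("mean_target_encoder")["code"] returns the template source string
# defined above, and get_template("nope") returns None. If the registry grows
# much further, an id-keyed dict built once at import time would be the natural
# upgrade.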