python-flexeval 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. flexeval/__init__.py +11 -0
  2. flexeval/__main__.py +11 -0
  3. flexeval/classes/__init__.py +15 -0
  4. flexeval/classes/base.py +32 -0
  5. flexeval/classes/dataset.py +82 -0
  6. flexeval/classes/eval_runner.py +158 -0
  7. flexeval/classes/eval_set_run.py +32 -0
  8. flexeval/classes/message.py +183 -0
  9. flexeval/classes/metric.py +55 -0
  10. flexeval/classes/thread.py +79 -0
  11. flexeval/classes/tool_call.py +51 -0
  12. flexeval/classes/turn.py +206 -0
  13. flexeval/cli.py +104 -0
  14. flexeval/completions.py +147 -0
  15. flexeval/compute_metrics.py +788 -0
  16. flexeval/config.yaml +23 -0
  17. flexeval/configuration/__init__.py +1 -0
  18. flexeval/configuration/completion_functions.py +231 -0
  19. flexeval/configuration/evals.yaml +864 -0
  20. flexeval/configuration/function_metrics.py +650 -0
  21. flexeval/configuration/rubric_metrics.yaml +194 -0
  22. flexeval/data_loader.py +513 -0
  23. flexeval/db_utils.py +38 -0
  24. flexeval/dependency_graph.py +234 -0
  25. flexeval/eval_schema.json +256 -0
  26. flexeval/function_types.py +173 -0
  27. flexeval/helpers.py +52 -0
  28. flexeval/io/__init__.py +1 -0
  29. flexeval/io/parsers/yaml_parser.py +69 -0
  30. flexeval/log_utils.py +34 -0
  31. flexeval/metrics/__init__.py +8 -0
  32. flexeval/metrics/access.py +28 -0
  33. flexeval/metrics/save.py +39 -0
  34. flexeval/rubric.py +62 -0
  35. flexeval/run_utils.py +65 -0
  36. flexeval/runner.py +132 -0
  37. flexeval/schema/__init__.py +11 -0
  38. flexeval/schema/config_schema.py +46 -0
  39. flexeval/schema/eval_schema.py +163 -0
  40. flexeval/schema/evalrun_schema.py +97 -0
  41. flexeval/schema/rubric_schema.py +40 -0
  42. flexeval/schema/schema_utils.py +26 -0
  43. python_flexeval-0.1.5.dist-info/METADATA +118 -0
  44. python_flexeval-0.1.5.dist-info/RECORD +47 -0
  45. python_flexeval-0.1.5.dist-info/WHEEL +4 -0
  46. python_flexeval-0.1.5.dist-info/entry_points.txt +2 -0
  47. python_flexeval-0.1.5.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,864 @@
1
+ multimodels-langgraph-rubric-dependencies:
2
+ data:
3
+ - data/test-cases/ckpts_livehint/checkpoint.db
4
+
5
+ name: plot evaluations, models comparison
6
+ notes: |-
7
+ Comparing models on langgraph rubric evaluation performance,
8
+ compare same-model consistency in performance with duplicate evaluations,
9
+ evaluate when requested for a plot, if plot provided, if provided and correct,
10
+ if provided and appropriate. if plot provided but not requested.
11
+ if plot provided and bounds reasonable, if create_a_plot is true in response_to_student
12
+ matches the DesmosPlot counts, if plot matches description before or after.
13
+
14
+ config:
15
+ max_n_conversation_threads: 10
16
+ nb_evaluations_per_thread: 2
17
+ max_workers: 5
18
+
19
+ metrics:
20
+ function:
21
+ # compute most metrics only on assistant turns
22
+ - name: is_role
23
+ kwargs:
24
+ role: assistant
25
+
26
+ # count tool calls by name to locate the responses from DesmosPlot in assistant turns
27
+ # NOTE: a DesmosPlot can be rendered without any request for a plot, so no dependency on a plot request is declared here
28
+ - name: count_tool_calls_by_name
29
+ metric_level: Turn
30
+ depends_on:
31
+ - name: is_role
32
+ kwargs:
33
+ role: assistant
34
+ metric_name: assistant
35
+ metric_min_value: 1
36
+
37
+ # count the times the assistant wants to create a plot at any point in a turn - each should always lead to a rendered DesmosPlot
38
+ - name: count_of_parts_matching_regex
39
+ kwargs:
40
+ expression: 'create_a_plot": true'
41
+ metric_level: Turn
42
+ depends_on:
43
+ - name: is_role
44
+ kwargs:
45
+ role: assistant
46
+ metric_name: assistant
47
+ metric_min_value: 1
48
+
49
+ rubric:
50
+ # check if student requested a plot in their last message
51
+ - name: is_request_for_plot
52
+ # context_only: false
53
+ metric_level: Turn
54
+ depends_on:
55
+ - name: is_role
56
+ kwargs:
57
+ role: assistant
58
+ metric_name: assistant
59
+ metric_min_value: 1
60
+
61
+ # We don't assume that a plot has to be requested in order to be pedagogically correct. So here we only check whether a plot that has been rendered was pedagogically correct.
62
+ # Can be cross-referenced with is_request_for_plot to know whether a requested plot that has been rendered is pedagogically correct
63
+ - name: is_pedagogically_appropriate_plot
64
+ metric_level: Turn
65
+ depends_on:
66
+ - name: count_tool_calls_by_name
67
+ type: function
68
+ metric_name: DesmosPlot
69
+ metric_min_value: 1
70
+
71
+ # if a plot is generated, validates that it matches the assistant's description following the plot
72
+ - name: plot_matches_followup_description
73
+ metric_level: Turn
74
+ depends_on:
75
+ - name: count_tool_calls_by_name
76
+ type: function
77
+ metric_name: DesmosPlot
78
+ metric_min_value: 1
79
+
80
+ # if plot generated, check if bounds are reasonable
81
+ - name: plot_bounds_are_reasonable
82
+ metric_level: Turn
83
+ depends_on:
84
+ - name: count_tool_calls_by_name
85
+ type: function
86
+ metric_name: DesmosPlot
87
+ metric_min_value: 1
88
+
89
+ # if generated, is the plot mathematically correct compared to its intended use case
90
+ - name: is_correct_plot
91
+ metric_level: Turn
92
+ depends_on:
93
+ - name: count_tool_calls_by_name
94
+ type: function
95
+ metric_name: DesmosPlot
96
+ metric_min_value: 1
97
+
98
+
99
+ grader_llm:
100
+ function_name: open_ai_completion
101
+ kwargs:
102
+ model_name: o3-mini
103
+ api_key_name: OPENAI_API_KEY
104
+
105
+
106
+
107
+ test-multimodels-langgraph-rubric-dependencies:
108
+ data:
109
+ - ../../data/test-cases/ckpts_livehint/checkpoint.db
110
+
111
+ name: LLM models comparison
112
+ notes: Testing multi models langgraph rubric evaluation comparison
113
+
114
+ config:
115
+ max_n_conversation_threads: 10
116
+ max_workers: 6
117
+
118
+ metrics:
119
+ function:
120
+ # compute most metrics only on assistant turns
121
+ - name: is_role
122
+ kwargs:
123
+ role: assistant
124
+
125
+ # count tool calls by name to locate the responses from DesmosPlot in assistant turns
126
+ - name: count_tool_calls_by_name
127
+ metric_level: Turn
128
+ depends_on:
129
+ - name: is_role
130
+ kwargs:
131
+ role: assistant
132
+ metric_name: assistant
133
+ metric_min_value: 1
134
+
135
+ rubric:
136
+ - name: is_request_for_plot
137
+ # context_only: false
138
+ metric_level: Turn
139
+ depends_on:
140
+ - name: is_role
141
+ kwargs:
142
+ role: assistant
143
+ metric_name: assistant
144
+ metric_min_value: 1
145
+
146
+ - name: is_pedagogically_appropriate_plot
147
+ metric_level: Turn
148
+ depends_on:
149
+ - name: count_tool_calls_by_name
150
+ type: function
151
+ metric_name: DesmosPlot
152
+ metric_min_value: 1
153
+ - name: is_request_for_plot
154
+ type: rubric
155
+ metric_min_value: 1
156
+
157
+ grader_llm:
158
+ function_name: open_ai_completion
159
+ kwargs:
160
+ model_name: o3-mini
161
+ api_key_name: OPENAI_API_KEY
162
+
163
+
164
+
165
+ test-multimodels-langgraph:
166
+ data:
167
+ - ../../data/test-cases/ckpts_livehint/checkpoint.db
168
+
169
+ name: LLM models comparison
170
+ notes: Testing multi models langgraph rubric evaluation comparison
171
+
172
+ config:
173
+ max_n_conversation_threads: 2
174
+ max_workers: 6
175
+
176
+ metrics:
177
+ function:
178
+ - name: is_role
179
+ kwargs:
180
+ role: assistant
181
+
182
+ rubric:
183
+ - name: no_plot_after_student_requested
184
+ context_only: false
185
+ depends_on:
186
+ - name: is_role
187
+ type: function
188
+ kwargs:
189
+ role: assistant
190
+ metric_name: assistant
191
+ metric_min_value: 1
192
+
193
+ grader_llm:
194
+ function_name: open_ai_completion
195
+ kwargs:
196
+ model_name: o3-mini
197
+ api_key_name: OPENAI_API_KEY
198
+
199
+
200
+
201
+
202
+ issues-fixing-rubrics:
203
+ data:
204
+ - ../../data/test-cases/ckpts_livehint/checkpoint_gpt-4o-mini-2024-07-18_1540.sqlite
205
+
206
+ name: Issues fixing 03-05-25
207
+ notes: Fixing issues like plot bounds unreasonable, plot generated by request but unhelpful, plot generated but incorrect, technical error, student reasonably asked for a plot but no plot given
208
+
209
+ config:
210
+ max_n_conversation_threads: 3
211
+ max_workers: 6
212
+
213
+ metrics:
214
+ function:
215
+ # - name: string_length
216
+
217
+ - name: is_role
218
+ kwargs:
219
+ role: assistant
220
+
221
+ - name: is_role
222
+ kwargs:
223
+ role: user
224
+
225
+ - name: count_tool_calls
226
+ depends_on:
227
+ - name: is_role
228
+ kwargs:
229
+ role: assistant
230
+ metric_name: assistant
231
+ metric_min_value: 1
232
+
233
+ - name: count_of_parts_matching_regex
234
+ kwargs:
235
+ expression: "An error occurred while processing your request"
236
+ metric_level: Turn
237
+
238
+ rubric:
239
+ - name: is_plot_generated_upon_request_pedagogically_appropriate
240
+ depends_on:
241
+ - name: is_role
242
+ kwargs:
243
+ role: assistant
244
+ metric_name: assistant
245
+ metric_min_value: 1
246
+ - name: count_tool_calls
247
+ metric_min_value: 1
248
+
249
+ - name: is_plot_generated_upon_request_correct
250
+ depends_on:
251
+ - name: is_role
252
+ kwargs:
253
+ role: assistant
254
+ metric_name: assistant
255
+ metric_min_value: 1
256
+ - name: count_tool_calls
257
+ metric_min_value: 1
258
+
259
+ - name: are_bounds_reasonable_with_plot_generated_correctly
260
+ depends_on:
261
+ - name: is_role
262
+ kwargs:
263
+ role: assistant
264
+ metric_name: assistant
265
+ metric_min_value: 1
266
+ - name: count_tool_calls
267
+ metric_min_value: 1
268
+
269
+ - name: no_plot_after_student_requested
270
+ context_only: false
271
+ depends_on:
272
+ - name: is_role
273
+ type: function
274
+ kwargs:
275
+ role: assistant
276
+ metric_name: assistant
277
+ metric_min_value: 1
278
+
279
+ - name: desmos_code_rendered_on_toolcall
280
+ context_only: false
281
+ metric_level: Turn
282
+ depends_on:
283
+ - name: count_tool_calls
284
+ metric_min_value: 1
285
+
286
+ grader_llm:
287
+ function_name: open_ai_completion
288
+ kwargs:
289
+ model_name: o3-mini
290
+ api_key_name: OPENAI_API_KEY
291
+
292
+
293
+ test-multiple-rubrics-files:
294
+ data:
295
+ - ../../data/test-cases/ckpts_livehint/checkpoint_gpt-4o-mini-2024-07-18_1540.sqlite
296
+
297
+ name: Test running rubrics from multiple files.
298
+ notes: If different rubric files have similar rubric names, we only consider the rubric instance in the first file found in config.yaml
299
+ config:
300
+ max_n_conversation_threads: 3
301
+ max_workers: 6
302
+
303
+ metrics:
304
+ function:
305
+
306
+ - name: is_role
307
+ kwargs:
308
+ role: assistant
309
+
310
+ - name: is_role
311
+ kwargs:
312
+ role: user
313
+
314
+ - name: count_tool_calls
315
+ depends_on:
316
+ - name: is_role
317
+ kwargs:
318
+ role: assistant
319
+ metric_name: assistant
320
+ metric_min_value: 1
321
+
322
+ - name: count_of_parts_matching_regex
323
+ kwargs:
324
+ expression: "An error occurred while processing your request"
325
+ metric_level: Turn
326
+
327
+ rubric:
328
+ - name: plot_matches_description
329
+ context_only: false
330
+ metric_level: Turn
331
+ depends_on:
332
+ - name: count_tool_calls
333
+ metric_min_value: 1
334
+ - name: count_of_parts_matching_regex
335
+ kwargs:
336
+ expression: "An error occurred while processing your request"
337
+ metric_name: "An error occurred while processing your request"
338
+ metric_max_value: 0
339
+
340
+ - name: is_plot_generated_upon_request_correct
341
+ depends_on:
342
+ - name: is_role
343
+ kwargs:
344
+ role: assistant
345
+ metric_name: assistant
346
+ metric_min_value: 1
347
+ - name: count_tool_calls
348
+ metric_min_value: 1
349
+
350
+ - name: plot_matches_description_example_project
351
+ context_only: false
352
+ metric_level: Turn
353
+ depends_on:
354
+ - name: count_tool_calls
355
+ metric_min_value: 1
356
+ - name: count_of_parts_matching_regex
357
+ kwargs:
358
+ expression: "An error occurred while processing your request"
359
+ metric_name: "An error occurred while processing your request"
360
+ metric_max_value: 0
361
+
362
+ grader_llm:
363
+ function_name: open_ai_completion
364
+ kwargs:
365
+ model_name: o3-mini
366
+ api_key_name: OPENAI_API_KEY
367
+
368
+ test-new:
369
+ data:
370
+ - ../../data/test-cases/example.jsonl
371
+
372
+ name: test string length
373
+ notes: test notes
374
+
375
+ config:
376
+ max_workers: 6
377
+
378
+ metrics:
379
+ function:
380
+ - name: string_length
381
+
382
+ - name: openai_moderation_api #run for every turn by default
383
+ context_only: false #this is the default
384
+
385
+ - name: openai_moderation_api
386
+ context_only: true
387
+
388
+ rubric:
389
+ - name: yeasayer_completion
390
+ depends_on:
391
+ - name: openai_moderation_api
392
+ context_only: true
393
+ metric_min_value: 0.1
394
+
395
+ completion_llm:
396
+ function_name: open_ai_completion
397
+ include_system_prompt: False
398
+ kwargs:
399
+ model_name: o3-mini
400
+ api_key_name: OPENAI_API_KEY
401
+ n: 2
402
+
403
+ grader_llm:
404
+ function_name: open_ai_completion
405
+ kwargs:
406
+ model_name: o3-mini
407
+ api_key_name: OPENAI_API_KEY
408
+
409
+ example:
410
+ data:
411
+ - ../../data/test-cases/dependency_example_test.jsonl
412
+
413
+ do_completion: False
414
+
415
+ name: dependency example
416
+ notes: my notes
417
+
418
+ #anything in here will overwrite entries in src/llm-evals/config.yaml
419
+ config:
420
+ max_workers: 8
421
+
422
+ metrics:
423
+ function:
424
+ - name: string_length
425
+
426
+ - name: is_role
427
+ kwargs:
428
+ role: assistant
429
+
430
+ - name: is_role
431
+ kwargs:
432
+ role: user
433
+
434
+ - name: count_tool_calls
435
+ depends_on:
436
+ - name: is_role
437
+ kwargs:
438
+ role: assistant
439
+ metric_name: assistant
440
+ metric_min_value: 1
441
+
442
+ - name: value_counts_by_tool_name
443
+ kwargs:
444
+ json_key: latex
445
+ depends_on:
446
+ - name: is_role
447
+ kwargs:
448
+ role: assistant
449
+ metric_name: assistant
450
+ metric_min_value: 1
451
+
452
+ - name: count_messages_per_role
453
+
454
+ - name: openai_moderation_api #run for every turn by default
455
+ context_only: false #this is the default
456
+
457
+ - name: openai_moderation_api
458
+ context_only: true
459
+
460
+ - name: flesch_kincaid_grade
461
+
462
+ rubric:
463
+ - name: yeasayer_completion
464
+ depends_on:
465
+ - name: openai_moderation_api
466
+ context_only: true
467
+ metric_min_value: 0.1
468
+
469
+ - name: is_request_for_plot
470
+ context_only: true
471
+ depends_on:
472
+ - name: is_role
473
+ type: function
474
+ kwargs:
475
+ role: assistant
476
+ metric_name: assistant
477
+ metric_min_value: 1
478
+
479
+ - name: is_student_acting_as_tutor
480
+
481
+ - name: is_pedagogically_appropriate_plot
482
+ depends_on:
483
+ - name: is_role
484
+ kwargs:
485
+ role: assistant
486
+ metric_name: assistant
487
+ metric_min_value: 1
488
+ - name: count_tool_calls
489
+ metric_min_value: 1 #TODO - sum of the 'value' for all rows where function_name is 'count_tool_calls' should be >= 1
490
+
491
+ completion_llm:
492
+ function_name: open_ai_completion
493
+ include_system_prompt: False
494
+ kwargs:
495
+ model_name: gpt-3.5-turbo
496
+ api_key_name: OPENAI_API_KEY
497
+ n: 2
498
+
499
+ grader_llm:
500
+ function_name: open_ai_completion
501
+ kwargs:
502
+ model_name: gpt-4o
503
+ api_key_name: OPENAI_API_KEY
504
+
505
+ example-langgraph-no-rubrics:
506
+ data:
507
+ - ../../data/plotting-feb-2025/checkpoint-06022024-AIED-small.sqlite
508
+ #- /Users/arafferty/git/pedagogical-plots/src/langgraph/checkpoint.db
509
+ do_completion: False
510
+
511
+ name: dependency example
512
+ notes: my notes
513
+
514
+ #anything in here will overwrite entries in src/llm-evals/config.yaml
515
+ config:
516
+ max_workers: 1
517
+
518
+ metrics:
519
+ function:
520
+ # - name: string_length
521
+
522
+ - name: is_role
523
+ # metric_level: Turn
524
+ kwargs:
525
+ role: assistant
526
+
527
+ - name: is_role
528
+ metric_level: Message
529
+ kwargs:
530
+ role: user
531
+
532
+ - name: is_role
533
+ metric_level: Message
534
+ kwargs:
535
+ role: assistant
536
+
537
+ - name: string_length
538
+ metric_level: Thread
539
+
540
+ - name: string_length
541
+ metric_level: Turn
542
+
543
+ - name: string_length
544
+ metric_level: Message
545
+
546
+ - name: count_tool_calls_by_name
547
+ metric_level: ToolCall
548
+
549
+ - name: count_tool_calls
550
+ metric_level: Turn
551
+
552
+
553
+ - name: count_tool_calls_by_name
554
+ metric_level: Thread
555
+
556
+ - name: count_tool_calls_by_name
557
+ metric_level: Message
558
+ depends_on:
559
+ - name: is_role
560
+ kwargs:
561
+ role: assistant
562
+ metric_name: assistant
563
+ metric_min_value: 1
564
+
565
+ - name: count_tool_calls_by_name
566
+ metric_level: Turn
567
+ depends_on:
568
+ - name: is_role
569
+ kwargs:
570
+ role: assistant
571
+ metric_name: assistant
572
+ metric_min_value: 1
573
+
574
+ - name: count_numeric_tool_call_params_by_name
575
+ metric_level: ToolCall
576
+
577
+ - name: count_llm_models
578
+ metric_level: Thread
579
+
580
+ - name: message_matches_regex
581
+ kwargs:
582
+ expression: "Please use the appropriate available tools to generate the requested plot or diagram."
583
+ metric_level: Message
584
+
585
+ - name: count_of_parts_matching_regex
586
+ kwargs:
587
+ expression: "Please use the appropriate available tools to generate the requested plot or diagram."
588
+ metric_level: Turn
589
+
590
+ # - name: count_messages_per_role
591
+ # metric_level: Thread
592
+
593
+ # - name: count_messages_per_role
594
+ # metric_level: Turn
595
+
596
+ cl-no-rubrics:
597
+ data:
598
+ - ../../data/plotting-feb-2025/checkpoint-06022024-AIED-small.sqlite
599
+ do_completion: False
600
+
601
+ name: my-test
602
+ notes: analysis on function metrics only, on bot-driven conversations
603
+
604
+ #anything in here will overwrite entries in src/llm-evals/config.yaml
605
+ config:
606
+ max_workers: 8
607
+
608
+ metrics:
609
+ function:
610
+ - name: string_length
611
+ metric_level: Thread
612
+
613
+
614
+ - name: count_emojis
615
+
616
+ - name: is_role
617
+ kwargs:
618
+ role: assistant
619
+
620
+ - name: is_role
621
+ kwargs:
622
+ role: user
623
+
624
+ - name: count_tool_calls
625
+ depends_on:
626
+ - name: is_role
627
+ kwargs:
628
+ role: assistant
629
+ metric_name: assistant
630
+ metric_min_value: 1
631
+
632
+ - name: value_counts_by_tool_name
633
+ kwargs:
634
+ json_key: latex
635
+ depends_on:
636
+ - name: is_role
637
+ kwargs:
638
+ role: assistant
639
+ metric_name: assistant
640
+ metric_min_value: 1
641
+
642
+ - name: count_messages_per_role
643
+
644
+ - name: error_count
645
+
646
+ - name: rendering_error_count
647
+
648
+ - name: flesch_kincaid_grade
649
+
650
+ plotting_some_rubrics:
651
+ data:
652
+ - ../../data/plotting-feb-2025/checkpoint-06022024-AIED.sqlite
653
+ do_completion: False
654
+
655
+ name: Plotting evaluation evals
656
+ notes: my notes
657
+
658
+ #anything in here will overwrite entries in src/llm-evals/config.yaml
659
+ config:
660
+ max_workers: 6
661
+
662
+ metrics:
663
+ function:
664
+ - name: string_length
665
+ metric_level: Thread
666
+
667
+ - name: is_role
668
+ kwargs:
669
+ role: assistant
670
+
671
+ - name: is_role
672
+ kwargs:
673
+ role: user
674
+
675
+ - name: is_role
676
+ metric_level: Message
677
+ kwargs:
678
+ role: user
679
+
680
+ - name: is_role
681
+ metric_level: Message
682
+ kwargs:
683
+ role: assistant
684
+
685
+ - name: is_langgraph_type
686
+ metric_level: Message
687
+ kwargs:
688
+ type: ai
689
+
690
+ - name: string_length
691
+ metric_level: Turn
692
+
693
+ - name: string_length
694
+ metric_level: Message
695
+
696
+ - name: count_tool_calls_by_name
697
+ metric_level: ToolCall
698
+
699
+ - name: count_tool_calls
700
+ metric_level: Turn
701
+ depends_on:
702
+ - name: is_role
703
+ kwargs:
704
+ role: assistant
705
+ metric_name: assistant
706
+ metric_min_value: 1
707
+
708
+
709
+ - name: count_tool_calls_by_name
710
+ metric_level: Thread
711
+
712
+ - name: count_tool_calls_by_name
713
+ metric_level: Message
714
+ depends_on:
715
+ - name: is_role
716
+ kwargs:
717
+ role: assistant
718
+ metric_name: assistant
719
+ metric_min_value: 1
720
+
721
+ - name: count_tool_calls_by_name
722
+ metric_level: Turn
723
+ depends_on:
724
+ - name: is_role
725
+ kwargs:
726
+ role: assistant
727
+ metric_name: assistant
728
+ metric_min_value: 1
729
+
730
+ - name: count_numeric_tool_call_params_by_name
731
+ metric_level: ToolCall
732
+
733
+ - name: count_llm_models
734
+ metric_level: Thread
735
+
736
+ - name: message_matches_regex
737
+ kwargs:
738
+ expression: "Please use the appropriate available tools to generate the requested plot or diagram."
739
+ metric_level: Message
740
+ # metric_name: "ToolMessage_text_indicator"
741
+
742
+ - name: message_matches_regex
743
+ kwargs:
744
+ expression: "desmos"
745
+ metric_level: Message
746
+
747
+ - name: message_matches_regex
748
+ kwargs:
749
+ expression: "functions\\..*\\{"
750
+ metric_level: Message
751
+ # metric_name: "function_call_in_content"
752
+ depends_on:
753
+ - name: is_langgraph_type
754
+ kwargs:
755
+ type: ai
756
+ metric_name: ai
757
+ metric_min_value: 1
758
+
759
+ - name: message_matches_regex
760
+ kwargs:
761
+ expression: "exception"
762
+ metric_level: Message
763
+ # metric_name: "exception_in_content"
764
+ depends_on:
765
+ - name: is_role
766
+ kwargs:
767
+ role: assistant
768
+ metric_name: assistant
769
+ metric_min_value: 1
770
+
771
+ - name: count_of_parts_matching_regex
772
+ kwargs:
773
+ expression: "Please use the appropriate available tools to generate the requested plot or diagram"
774
+ metric_level: Turn
775
+ # metric_name: "contains_plot_description"
776
+
777
+ - name: count_messages_per_role
778
+ kwargs:
779
+ use_langgraph_roles: true
780
+ metric_level: Thread
781
+
782
+ - name: count_tokens
783
+ metric_level: Turn
784
+
785
+ - name: count_errors
786
+ metric_level: Thread
787
+
788
+ - name: message_matches_regex
789
+ kwargs:
790
+ expression: "An error occurred while processing your request"
791
+ metric_level: Message
792
+ # metric_name: "error_in_content"
793
+
794
+ - name: count_of_parts_matching_regex
795
+ kwargs:
796
+ expression: "An error occurred while processing your request"
797
+ metric_level: Turn
798
+
799
+ rubric:
800
+ - name: is_request_for_plot
801
+ context_only: false
802
+ metric_level: Turn
803
+ depends_on:
804
+ - name: is_role
805
+ kwargs:
806
+ role: assistant
807
+ metric_name: assistant
808
+ metric_min_value: 1
809
+
810
+ - name: is_student_acting_as_tutor
811
+ context_only: false
812
+ metric_level: Turn
813
+ depends_on:
814
+ - name: is_role
815
+ kwargs:
816
+ role: user
817
+ metric_name: user
818
+ metric_min_value: 1
819
+
820
+ - name: is_pedagogically_appropriate_plot
821
+ context_only: false
822
+ metric_level: Turn
823
+ depends_on:
824
+ - name: count_tool_calls
825
+ metric_min_value: 1
826
+
827
+ - name: plot_matches_description
828
+ context_only: false
829
+ metric_level: Turn
830
+ depends_on:
831
+ - name: count_tool_calls
832
+ metric_min_value: 1
833
+ - name: count_of_parts_matching_regex
834
+ kwargs:
835
+ expression: "An error occurred while processing your request"
836
+ metric_name: "An error occurred while processing your request"
837
+ metric_max_value: 0
838
+
839
+ - name: plot_bounds_are_sufficiently_large
840
+ context_only: false
841
+ metric_level: Turn
842
+ depends_on:
843
+ - name: count_tool_calls
844
+ metric_min_value: 1
845
+ - name: count_of_parts_matching_regex
846
+ kwargs:
847
+ expression: "An error occurred while processing your request"
848
+ metric_name: "An error occurred while processing your request"
849
+ metric_max_value: 0
850
+
851
+ # completion_llm:
852
+ # function_name: open_ai_completion
853
+ # include_system_prompt: False
854
+ # kwargs:
855
+ # model_name: gpt-3.5-turbo
856
+ # api_key_name: OPENAI_API_KEY
857
+ # n: 2
858
+
859
+ grader_llm:
860
+ function_name: open_ai_completion
861
+ kwargs:
862
+ model_name: o3-mini
863
+ api_key_name: OPENAI_API_KEY
864
+