python-flexeval 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flexeval/__init__.py +11 -0
- flexeval/__main__.py +11 -0
- flexeval/classes/__init__.py +15 -0
- flexeval/classes/base.py +32 -0
- flexeval/classes/dataset.py +82 -0
- flexeval/classes/eval_runner.py +158 -0
- flexeval/classes/eval_set_run.py +32 -0
- flexeval/classes/message.py +183 -0
- flexeval/classes/metric.py +55 -0
- flexeval/classes/thread.py +79 -0
- flexeval/classes/tool_call.py +51 -0
- flexeval/classes/turn.py +206 -0
- flexeval/cli.py +104 -0
- flexeval/completions.py +147 -0
- flexeval/compute_metrics.py +788 -0
- flexeval/config.yaml +23 -0
- flexeval/configuration/__init__.py +1 -0
- flexeval/configuration/completion_functions.py +231 -0
- flexeval/configuration/evals.yaml +864 -0
- flexeval/configuration/function_metrics.py +650 -0
- flexeval/configuration/rubric_metrics.yaml +194 -0
- flexeval/data_loader.py +513 -0
- flexeval/db_utils.py +38 -0
- flexeval/dependency_graph.py +234 -0
- flexeval/eval_schema.json +256 -0
- flexeval/function_types.py +173 -0
- flexeval/helpers.py +52 -0
- flexeval/io/__init__.py +1 -0
- flexeval/io/parsers/yaml_parser.py +69 -0
- flexeval/log_utils.py +34 -0
- flexeval/metrics/__init__.py +8 -0
- flexeval/metrics/access.py +28 -0
- flexeval/metrics/save.py +39 -0
- flexeval/rubric.py +62 -0
- flexeval/run_utils.py +65 -0
- flexeval/runner.py +132 -0
- flexeval/schema/__init__.py +11 -0
- flexeval/schema/config_schema.py +46 -0
- flexeval/schema/eval_schema.py +163 -0
- flexeval/schema/evalrun_schema.py +97 -0
- flexeval/schema/rubric_schema.py +40 -0
- flexeval/schema/schema_utils.py +26 -0
- python_flexeval-0.1.5.dist-info/METADATA +118 -0
- python_flexeval-0.1.5.dist-info/RECORD +47 -0
- python_flexeval-0.1.5.dist-info/WHEEL +4 -0
- python_flexeval-0.1.5.dist-info/entry_points.txt +2 -0
- python_flexeval-0.1.5.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,864 @@
|
|
|
1
|
+
multimodels-langgraph-rubric-dependencies:
|
|
2
|
+
data:
|
|
3
|
+
- data/test-cases/ckpts_livehint/checkpoint.db
|
|
4
|
+
|
|
5
|
+
name: plot evaluations, models comparison
|
|
6
|
+
notes: |-
|
|
7
|
+
Comparing models on langgraph rubric evaluation performance,
|
|
8
|
+
compare same-model consistency in performance with duplicate evaluations,
|
|
9
|
+
evaluate when requested for a plot, if plot provided, if provided and correct,
|
|
10
|
+
if provided and appropriate. if plot provided but not requested.
|
|
11
|
+
if plot provided and bounds reasonable, if create_a_plot is true in response_to_student
|
|
12
|
+
matches the DesmosPlot counts, if plot matches description before or after.
|
|
13
|
+
|
|
14
|
+
config:
|
|
15
|
+
max_n_conversation_threads: 10
|
|
16
|
+
nb_evaluations_per_thread: 2
|
|
17
|
+
max_workers: 5
|
|
18
|
+
|
|
19
|
+
metrics:
|
|
20
|
+
function:
|
|
21
|
+
# compute most metrics only on assistant turns
|
|
22
|
+
- name: is_role
|
|
23
|
+
kwargs:
|
|
24
|
+
role: assistant
|
|
25
|
+
|
|
26
|
+
# count tool calls by name to locate the responses from DesmosPlot in assistant turns
|
|
27
|
+
# NOTE: a DesmosPlot can be rendered even when no plot was requested, so this metric has no dependency on a plot request
|
|
28
|
+
- name: count_tool_calls_by_name
|
|
29
|
+
metric_level: Turn
|
|
30
|
+
depends_on:
|
|
31
|
+
- name: is_role
|
|
32
|
+
kwargs:
|
|
33
|
+
role: assistant
|
|
34
|
+
metric_name: assistant
|
|
35
|
+
metric_min_value: 1
|
|
36
|
+
|
|
37
|
+
# count the parts of an assistant turn in which the assistant asks to create a plot (create_a_plot is true) - each should always lead to a rendered DesmosPlot
|
|
38
|
+
- name: count_of_parts_matching_regex
|
|
39
|
+
kwargs:
|
|
40
|
+
expression: 'create_a_plot": true'
|
|
41
|
+
metric_level: Turn
|
|
42
|
+
depends_on:
|
|
43
|
+
- name: is_role
|
|
44
|
+
kwargs:
|
|
45
|
+
role: assistant
|
|
46
|
+
metric_name: assistant
|
|
47
|
+
metric_min_value: 1
|
|
48
|
+
|
|
49
|
+
rubric:
|
|
50
|
+
# check if student requested a plot in their last message
|
|
51
|
+
- name: is_request_for_plot
|
|
52
|
+
# context_only: false
|
|
53
|
+
metric_level: Turn
|
|
54
|
+
depends_on:
|
|
55
|
+
- name: is_role
|
|
56
|
+
kwargs:
|
|
57
|
+
role: assistant
|
|
58
|
+
metric_name: assistant
|
|
59
|
+
metric_min_value: 1
|
|
60
|
+
|
|
61
|
+
# We don't assume that a plot has to be requested in order to be pedagogically appropriate. So here we only check: if a plot has been rendered, was it pedagogically appropriate?
|
|
62
|
+
# Can be cross-referenced with is_request_for_plot to know whether a requested plot that has been rendered is pedagogically appropriate
|
|
63
|
+
- name: is_pedagogically_appropriate_plot
|
|
64
|
+
metric_level: Turn
|
|
65
|
+
depends_on:
|
|
66
|
+
- name: count_tool_calls_by_name
|
|
67
|
+
type: function
|
|
68
|
+
metric_name: DesmosPlot
|
|
69
|
+
metric_min_value: 1
|
|
70
|
+
|
|
71
|
+
# if a plot is generated, validates that it matches the assistant's description following the plot
|
|
72
|
+
- name: plot_matches_followup_description
|
|
73
|
+
metric_level: Turn
|
|
74
|
+
depends_on:
|
|
75
|
+
- name: count_tool_calls_by_name
|
|
76
|
+
type: function
|
|
77
|
+
metric_name: DesmosPlot
|
|
78
|
+
metric_min_value: 1
|
|
79
|
+
|
|
80
|
+
# if plot generated, check if bounds are reasonable
|
|
81
|
+
- name: plot_bounds_are_reasonable
|
|
82
|
+
metric_level: Turn
|
|
83
|
+
depends_on:
|
|
84
|
+
- name: count_tool_calls_by_name
|
|
85
|
+
type: function
|
|
86
|
+
metric_name: DesmosPlot
|
|
87
|
+
metric_min_value: 1
|
|
88
|
+
|
|
89
|
+
# if generated, is the plot mathematically correct compared to its intended use case
|
|
90
|
+
- name: is_correct_plot
|
|
91
|
+
metric_level: Turn
|
|
92
|
+
depends_on:
|
|
93
|
+
- name: count_tool_calls_by_name
|
|
94
|
+
type: function
|
|
95
|
+
metric_name: DesmosPlot
|
|
96
|
+
metric_min_value: 1
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
grader_llm:
|
|
100
|
+
function_name: open_ai_completion
|
|
101
|
+
kwargs:
|
|
102
|
+
model_name: o3-mini
|
|
103
|
+
api_key_name: OPENAI_API_KEY
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
test-multimodels-langgraph-rubric-dependencies:
|
|
108
|
+
data:
|
|
109
|
+
- ../../data/test-cases/ckpts_livehint/checkpoint.db
|
|
110
|
+
|
|
111
|
+
name: LLM models comparison
|
|
112
|
+
notes: Testing multi models langgraph rubric evaluation comparison
|
|
113
|
+
|
|
114
|
+
config:
|
|
115
|
+
max_n_conversation_threads: 10
|
|
116
|
+
max_workers: 6
|
|
117
|
+
|
|
118
|
+
metrics:
|
|
119
|
+
function:
|
|
120
|
+
# compute most metrics only on assistant turns
|
|
121
|
+
- name: is_role
|
|
122
|
+
kwargs:
|
|
123
|
+
role: assistant
|
|
124
|
+
|
|
125
|
+
# count tool calls by name to locate the responses from DesmosPlot in assistant turns
|
|
126
|
+
- name: count_tool_calls_by_name
|
|
127
|
+
metric_level: Turn
|
|
128
|
+
depends_on:
|
|
129
|
+
- name: is_role
|
|
130
|
+
kwargs:
|
|
131
|
+
role: assistant
|
|
132
|
+
metric_name: assistant
|
|
133
|
+
metric_min_value: 1
|
|
134
|
+
|
|
135
|
+
rubric:
|
|
136
|
+
- name: is_request_for_plot
|
|
137
|
+
# context_only: false
|
|
138
|
+
metric_level: Turn
|
|
139
|
+
depends_on:
|
|
140
|
+
- name: is_role
|
|
141
|
+
kwargs:
|
|
142
|
+
role: assistant
|
|
143
|
+
metric_name: assistant
|
|
144
|
+
metric_min_value: 1
|
|
145
|
+
|
|
146
|
+
- name: is_pedagogically_appropriate_plot
|
|
147
|
+
metric_level: Turn
|
|
148
|
+
depends_on:
|
|
149
|
+
- name: count_tool_calls_by_name
|
|
150
|
+
type: function
|
|
151
|
+
metric_name: DesmosPlot
|
|
152
|
+
metric_min_value: 1
|
|
153
|
+
- name: is_request_for_plot
|
|
154
|
+
type: rubric
|
|
155
|
+
metric_min_value: 1
|
|
156
|
+
|
|
157
|
+
grader_llm:
|
|
158
|
+
function_name: open_ai_completion
|
|
159
|
+
kwargs:
|
|
160
|
+
model_name: o3-mini
|
|
161
|
+
api_key_name: OPENAI_API_KEY
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
test-multimodels-langgraph:
|
|
166
|
+
data:
|
|
167
|
+
- ../../data/test-cases/ckpts_livehint/checkpoint.db
|
|
168
|
+
|
|
169
|
+
name: LLM models comparison
|
|
170
|
+
notes: Testing multi models langgraph rubric evaluation comparison
|
|
171
|
+
|
|
172
|
+
config:
|
|
173
|
+
max_n_conversation_threads: 2
|
|
174
|
+
max_workers: 6
|
|
175
|
+
|
|
176
|
+
metrics:
|
|
177
|
+
function:
|
|
178
|
+
- name: is_role
|
|
179
|
+
kwargs:
|
|
180
|
+
role: assistant
|
|
181
|
+
|
|
182
|
+
rubric:
|
|
183
|
+
- name: no_plot_after_student_requested
|
|
184
|
+
context_only: false
|
|
185
|
+
depends_on:
|
|
186
|
+
- name: is_role
|
|
187
|
+
type: function
|
|
188
|
+
kwargs:
|
|
189
|
+
role: assistant
|
|
190
|
+
metric_name: assistant
|
|
191
|
+
metric_min_value: 1
|
|
192
|
+
|
|
193
|
+
grader_llm:
|
|
194
|
+
function_name: open_ai_completion
|
|
195
|
+
kwargs:
|
|
196
|
+
model_name: o3-mini
|
|
197
|
+
api_key_name: OPENAI_API_KEY
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
issues-fixing-rubrics:
|
|
203
|
+
data:
|
|
204
|
+
- ../../data/test-cases/ckpts_livehint/checkpoint_gpt-4o-mini-2024-07-18_1540.sqlite
|
|
205
|
+
|
|
206
|
+
name: Issues fixing 03-05-25
|
|
207
|
+
notes: Fixing issues like plot bounds unreasonable, plot generated by request but unhelpful, plot generated but incorrect, technical error, student reasonably asked for a plot but no plot given
|
|
208
|
+
|
|
209
|
+
config:
|
|
210
|
+
max_n_conversation_threads: 3
|
|
211
|
+
max_workers: 6
|
|
212
|
+
|
|
213
|
+
metrics:
|
|
214
|
+
function:
|
|
215
|
+
# - name: string_length
|
|
216
|
+
|
|
217
|
+
- name: is_role
|
|
218
|
+
kwargs:
|
|
219
|
+
role: assistant
|
|
220
|
+
|
|
221
|
+
- name: is_role
|
|
222
|
+
kwargs:
|
|
223
|
+
role: user
|
|
224
|
+
|
|
225
|
+
- name: count_tool_calls
|
|
226
|
+
depends_on:
|
|
227
|
+
- name: is_role
|
|
228
|
+
kwargs:
|
|
229
|
+
role: assistant
|
|
230
|
+
metric_name: assistant
|
|
231
|
+
metric_min_value: 1
|
|
232
|
+
|
|
233
|
+
- name: count_of_parts_matching_regex
|
|
234
|
+
kwargs:
|
|
235
|
+
expression: "An error occurred while processing your request"
|
|
236
|
+
metric_level: Turn
|
|
237
|
+
|
|
238
|
+
rubric:
|
|
239
|
+
- name: is_plot_generated_upon_request_pedagogically_appropriate
|
|
240
|
+
depends_on:
|
|
241
|
+
- name: is_role
|
|
242
|
+
kwargs:
|
|
243
|
+
role: assistant
|
|
244
|
+
metric_name: assistant
|
|
245
|
+
metric_min_value: 1
|
|
246
|
+
- name: count_tool_calls
|
|
247
|
+
metric_min_value: 1
|
|
248
|
+
|
|
249
|
+
- name: is_plot_generated_upon_request_correct
|
|
250
|
+
depends_on:
|
|
251
|
+
- name: is_role
|
|
252
|
+
kwargs:
|
|
253
|
+
role: assistant
|
|
254
|
+
metric_name: assistant
|
|
255
|
+
metric_min_value: 1
|
|
256
|
+
- name: count_tool_calls
|
|
257
|
+
metric_min_value: 1
|
|
258
|
+
|
|
259
|
+
- name: are_bounds_reasonable_with_plot_generated_correctly
|
|
260
|
+
depends_on:
|
|
261
|
+
- name: is_role
|
|
262
|
+
kwargs:
|
|
263
|
+
role: assistant
|
|
264
|
+
metric_name: assistant
|
|
265
|
+
metric_min_value: 1
|
|
266
|
+
- name: count_tool_calls
|
|
267
|
+
metric_min_value: 1
|
|
268
|
+
|
|
269
|
+
- name: no_plot_after_student_requested
|
|
270
|
+
context_only: false
|
|
271
|
+
depends_on:
|
|
272
|
+
- name: is_role
|
|
273
|
+
type: function
|
|
274
|
+
kwargs:
|
|
275
|
+
role: assistant
|
|
276
|
+
metric_name: assistant
|
|
277
|
+
metric_min_value: 1
|
|
278
|
+
|
|
279
|
+
- name: desmos_code_rendered_on_toolcall
|
|
280
|
+
context_only: false
|
|
281
|
+
metric_level: Turn
|
|
282
|
+
depends_on:
|
|
283
|
+
- name: count_tool_calls
|
|
284
|
+
metric_min_value: 1
|
|
285
|
+
|
|
286
|
+
grader_llm:
|
|
287
|
+
function_name: open_ai_completion
|
|
288
|
+
kwargs:
|
|
289
|
+
model_name: o3-mini
|
|
290
|
+
api_key_name: OPENAI_API_KEY
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
test-multiple-rubrics-files:
|
|
294
|
+
data:
|
|
295
|
+
- ../../data/test-cases/ckpts_livehint/checkpoint_gpt-4o-mini-2024-07-18_1540.sqlite
|
|
296
|
+
|
|
297
|
+
name: Test running rubrics from multiple files.
|
|
298
|
+
notes: If different rubric files have similar rubric names, we only consider the rubric instance in the first file found in config.yaml
|
|
299
|
+
config:
|
|
300
|
+
max_n_conversation_threads: 3
|
|
301
|
+
max_workers: 6
|
|
302
|
+
|
|
303
|
+
metrics:
|
|
304
|
+
function:
|
|
305
|
+
|
|
306
|
+
- name: is_role
|
|
307
|
+
kwargs:
|
|
308
|
+
role: assistant
|
|
309
|
+
|
|
310
|
+
- name: is_role
|
|
311
|
+
kwargs:
|
|
312
|
+
role: user
|
|
313
|
+
|
|
314
|
+
- name: count_tool_calls
|
|
315
|
+
depends_on:
|
|
316
|
+
- name: is_role
|
|
317
|
+
kwargs:
|
|
318
|
+
role: assistant
|
|
319
|
+
metric_name: assistant
|
|
320
|
+
metric_min_value: 1
|
|
321
|
+
|
|
322
|
+
- name: count_of_parts_matching_regex
|
|
323
|
+
kwargs:
|
|
324
|
+
expression: "An error occurred while processing your request"
|
|
325
|
+
metric_level: Turn
|
|
326
|
+
|
|
327
|
+
rubric:
|
|
328
|
+
- name: plot_matches_description
|
|
329
|
+
context_only: false
|
|
330
|
+
metric_level: Turn
|
|
331
|
+
depends_on:
|
|
332
|
+
- name: count_tool_calls
|
|
333
|
+
metric_min_value: 1
|
|
334
|
+
- name: count_of_parts_matching_regex
|
|
335
|
+
kwargs:
|
|
336
|
+
expression: "An error occurred while processing your request"
|
|
337
|
+
metric_name: "An error occurred while processing your request"
|
|
338
|
+
metric_max_value: 0
|
|
339
|
+
|
|
340
|
+
- name: is_plot_generated_upon_request_correct
|
|
341
|
+
depends_on:
|
|
342
|
+
- name: is_role
|
|
343
|
+
kwargs:
|
|
344
|
+
role: assistant
|
|
345
|
+
metric_name: assistant
|
|
346
|
+
metric_min_value: 1
|
|
347
|
+
- name: count_tool_calls
|
|
348
|
+
metric_min_value: 1
|
|
349
|
+
|
|
350
|
+
- name: plot_matches_description_example_project
|
|
351
|
+
context_only: false
|
|
352
|
+
metric_level: Turn
|
|
353
|
+
depends_on:
|
|
354
|
+
- name: count_tool_calls
|
|
355
|
+
metric_min_value: 1
|
|
356
|
+
- name: count_of_parts_matching_regex
|
|
357
|
+
kwargs:
|
|
358
|
+
expression: "An error occurred while processing your request"
|
|
359
|
+
metric_name: "An error occurred while processing your request"
|
|
360
|
+
metric_max_value: 0
|
|
361
|
+
|
|
362
|
+
grader_llm:
|
|
363
|
+
function_name: open_ai_completion
|
|
364
|
+
kwargs:
|
|
365
|
+
model_name: o3-mini
|
|
366
|
+
api_key_name: OPENAI_API_KEY
|
|
367
|
+
|
|
368
|
+
test-new:
|
|
369
|
+
data:
|
|
370
|
+
- ../../data/test-cases/example.jsonl
|
|
371
|
+
|
|
372
|
+
name: test string length
|
|
373
|
+
notes: test notes
|
|
374
|
+
|
|
375
|
+
config:
|
|
376
|
+
max_workers: 6
|
|
377
|
+
|
|
378
|
+
metrics:
|
|
379
|
+
function:
|
|
380
|
+
- name: string_length
|
|
381
|
+
|
|
382
|
+
- name: openai_moderation_api #run for every turn by default
|
|
383
|
+
context_only: false #this is the default
|
|
384
|
+
|
|
385
|
+
- name: openai_moderation_api
|
|
386
|
+
context_only: true
|
|
387
|
+
|
|
388
|
+
rubric:
|
|
389
|
+
- name: yeasayer_completion
|
|
390
|
+
depends_on:
|
|
391
|
+
- name: openai_moderation_api
|
|
392
|
+
context_only: true
|
|
393
|
+
metric_min_value: 0.1
|
|
394
|
+
|
|
395
|
+
completion_llm:
|
|
396
|
+
function_name: open_ai_completion
|
|
397
|
+
include_system_prompt: False
|
|
398
|
+
kwargs:
|
|
399
|
+
model_name: o3-mini
|
|
400
|
+
api_key_name: OPENAI_API_KEY
|
|
401
|
+
n: 2
|
|
402
|
+
|
|
403
|
+
grader_llm:
|
|
404
|
+
function_name: open_ai_completion
|
|
405
|
+
kwargs:
|
|
406
|
+
model_name: o3-mini
|
|
407
|
+
api_key_name: OPENAI_API_KEY
|
|
408
|
+
|
|
409
|
+
example:
|
|
410
|
+
data:
|
|
411
|
+
- ../../data/test-cases/dependency_example_test.jsonl
|
|
412
|
+
|
|
413
|
+
do_completion: False
|
|
414
|
+
|
|
415
|
+
name: dependency example
|
|
416
|
+
notes: my notes
|
|
417
|
+
|
|
418
|
+
# anything in here will override the matching entries in src/llm-evals/config.yaml (presumably flexeval/config.yaml in this package - confirm)
|
|
419
|
+
config:
|
|
420
|
+
max_workers: 8
|
|
421
|
+
|
|
422
|
+
metrics:
|
|
423
|
+
function:
|
|
424
|
+
- name: string_length
|
|
425
|
+
|
|
426
|
+
- name: is_role
|
|
427
|
+
kwargs:
|
|
428
|
+
role: assistant
|
|
429
|
+
|
|
430
|
+
- name: is_role
|
|
431
|
+
kwargs:
|
|
432
|
+
role: user
|
|
433
|
+
|
|
434
|
+
- name: count_tool_calls
|
|
435
|
+
depends_on:
|
|
436
|
+
- name: is_role
|
|
437
|
+
kwargs:
|
|
438
|
+
role: assistant
|
|
439
|
+
metric_name: assistant
|
|
440
|
+
metric_min_value: 1
|
|
441
|
+
|
|
442
|
+
- name: value_counts_by_tool_name
|
|
443
|
+
kwargs:
|
|
444
|
+
json_key: latex
|
|
445
|
+
depends_on:
|
|
446
|
+
- name: is_role
|
|
447
|
+
kwargs:
|
|
448
|
+
role: assistant
|
|
449
|
+
metric_name: assistant
|
|
450
|
+
metric_min_value: 1
|
|
451
|
+
|
|
452
|
+
- name: count_messages_per_role
|
|
453
|
+
|
|
454
|
+
- name: openai_moderation_api #run for every turn by default
|
|
455
|
+
context_only: false #this is the default
|
|
456
|
+
|
|
457
|
+
- name: openai_moderation_api
|
|
458
|
+
context_only: true
|
|
459
|
+
|
|
460
|
+
- name: flesch_kincaid_grade
|
|
461
|
+
|
|
462
|
+
rubric:
|
|
463
|
+
- name: yeasayer_completion
|
|
464
|
+
depends_on:
|
|
465
|
+
- name: openai_moderation_api
|
|
466
|
+
context_only: true
|
|
467
|
+
metric_min_value: 0.1
|
|
468
|
+
|
|
469
|
+
- name: is_request_for_plot
|
|
470
|
+
context_only: true
|
|
471
|
+
depends_on:
|
|
472
|
+
- name: is_role
|
|
473
|
+
type: function
|
|
474
|
+
kwargs:
|
|
475
|
+
role: assistant
|
|
476
|
+
metric_name: assistant
|
|
477
|
+
metric_min_value: 1
|
|
478
|
+
|
|
479
|
+
- name: is_student_acting_as_tutor
|
|
480
|
+
|
|
481
|
+
- name: is_pedagogically_appropriate_plot
|
|
482
|
+
depends_on:
|
|
483
|
+
- name: is_role
|
|
484
|
+
kwargs:
|
|
485
|
+
role: assistant
|
|
486
|
+
metric_name: assistant
|
|
487
|
+
metric_min_value: 1
|
|
488
|
+
- name: count_tool_calls
|
|
489
|
+
metric_min_value: 1 #TODO - sum of the 'value' for all rows where function_name is 'count_tool_calls' should be >= 1
|
|
490
|
+
|
|
491
|
+
completion_llm:
|
|
492
|
+
function_name: open_ai_completion
|
|
493
|
+
include_system_prompt: False
|
|
494
|
+
kwargs:
|
|
495
|
+
model_name: gpt-3.5-turbo
|
|
496
|
+
api_key_name: OPENAI_API_KEY
|
|
497
|
+
n: 2
|
|
498
|
+
|
|
499
|
+
grader_llm:
|
|
500
|
+
function_name: open_ai_completion
|
|
501
|
+
kwargs:
|
|
502
|
+
model_name: gpt-4o
|
|
503
|
+
api_key_name: OPENAI_API_KEY
|
|
504
|
+
|
|
505
|
+
example-langgraph-no-rubrics:
|
|
506
|
+
data:
|
|
507
|
+
- ../../data/plotting-feb-2025/checkpoint-06022024-AIED-small.sqlite
|
|
508
|
+
#- /Users/arafferty/git/pedagogical-plots/src/langgraph/checkpoint.db
|
|
509
|
+
do_completion: False
|
|
510
|
+
|
|
511
|
+
name: dependency example
|
|
512
|
+
notes: my notes
|
|
513
|
+
|
|
514
|
+
# anything in here will override the matching entries in src/llm-evals/config.yaml (presumably flexeval/config.yaml in this package - confirm)
|
|
515
|
+
config:
|
|
516
|
+
max_workers: 1
|
|
517
|
+
|
|
518
|
+
metrics:
|
|
519
|
+
function:
|
|
520
|
+
# - name: string_length
|
|
521
|
+
|
|
522
|
+
- name: is_role
|
|
523
|
+
# metric_level: Turn
|
|
524
|
+
kwargs:
|
|
525
|
+
role: assistant
|
|
526
|
+
|
|
527
|
+
- name: is_role
|
|
528
|
+
metric_level: Message
|
|
529
|
+
kwargs:
|
|
530
|
+
role: user
|
|
531
|
+
|
|
532
|
+
- name: is_role
|
|
533
|
+
metric_level: Message
|
|
534
|
+
kwargs:
|
|
535
|
+
role: assistant
|
|
536
|
+
|
|
537
|
+
- name: string_length
|
|
538
|
+
metric_level: Thread
|
|
539
|
+
|
|
540
|
+
- name: string_length
|
|
541
|
+
metric_level: Turn
|
|
542
|
+
|
|
543
|
+
- name: string_length
|
|
544
|
+
metric_level: Message
|
|
545
|
+
|
|
546
|
+
- name: count_tool_calls_by_name
|
|
547
|
+
metric_level: ToolCall
|
|
548
|
+
|
|
549
|
+
- name: count_tool_calls
|
|
550
|
+
metric_level: Turn
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
- name: count_tool_calls_by_name
|
|
554
|
+
metric_level: Thread
|
|
555
|
+
|
|
556
|
+
- name: count_tool_calls_by_name
|
|
557
|
+
metric_level: Message
|
|
558
|
+
depends_on:
|
|
559
|
+
- name: is_role
|
|
560
|
+
kwargs:
|
|
561
|
+
role: assistant
|
|
562
|
+
metric_name: assistant
|
|
563
|
+
metric_min_value: 1
|
|
564
|
+
|
|
565
|
+
- name: count_tool_calls_by_name
|
|
566
|
+
metric_level: Turn
|
|
567
|
+
depends_on:
|
|
568
|
+
- name: is_role
|
|
569
|
+
kwargs:
|
|
570
|
+
role: assistant
|
|
571
|
+
metric_name: assistant
|
|
572
|
+
metric_min_value: 1
|
|
573
|
+
|
|
574
|
+
- name: count_numeric_tool_call_params_by_name
|
|
575
|
+
metric_level: ToolCall
|
|
576
|
+
|
|
577
|
+
- name: count_llm_models
|
|
578
|
+
metric_level: Thread
|
|
579
|
+
|
|
580
|
+
- name: message_matches_regex
|
|
581
|
+
kwargs:
|
|
582
|
+
expression: "Please use the appropriate available tools to generate the requested plot or diagram."
|
|
583
|
+
metric_level: Message
|
|
584
|
+
|
|
585
|
+
- name: count_of_parts_matching_regex
|
|
586
|
+
kwargs:
|
|
587
|
+
expression: "Please use the appropriate available tools to generate the requested plot or diagram."
|
|
588
|
+
metric_level: Turn
|
|
589
|
+
|
|
590
|
+
# - name: count_messages_per_role
|
|
591
|
+
# metric_level: Thread
|
|
592
|
+
|
|
593
|
+
# - name: count_messages_per_role
|
|
594
|
+
# metric_level: Turn
|
|
595
|
+
|
|
596
|
+
cl-no-rubrics:
|
|
597
|
+
data:
|
|
598
|
+
- ../../data/plotting-feb-2025/checkpoint-06022024-AIED-small.sqlite
|
|
599
|
+
do_completion: False
|
|
600
|
+
|
|
601
|
+
name: my-test
|
|
602
|
+
notes: analysis on function metrics only, on bot-driven conversations
|
|
603
|
+
|
|
604
|
+
# anything in here will override the matching entries in src/llm-evals/config.yaml (presumably flexeval/config.yaml in this package - confirm)
|
|
605
|
+
config:
|
|
606
|
+
max_workers: 8
|
|
607
|
+
|
|
608
|
+
metrics:
|
|
609
|
+
function:
|
|
610
|
+
- name: string_length
|
|
611
|
+
metric_level: Thread
|
|
612
|
+
|
|
613
|
+
|
|
614
|
+
- name: count_emojis
|
|
615
|
+
|
|
616
|
+
- name: is_role
|
|
617
|
+
kwargs:
|
|
618
|
+
role: assistant
|
|
619
|
+
|
|
620
|
+
- name: is_role
|
|
621
|
+
kwargs:
|
|
622
|
+
role: user
|
|
623
|
+
|
|
624
|
+
- name: count_tool_calls
|
|
625
|
+
depends_on:
|
|
626
|
+
- name: is_role
|
|
627
|
+
kwargs:
|
|
628
|
+
role: assistant
|
|
629
|
+
metric_name: assistant
|
|
630
|
+
metric_min_value: 1
|
|
631
|
+
|
|
632
|
+
- name: value_counts_by_tool_name
|
|
633
|
+
kwargs:
|
|
634
|
+
json_key: latex
|
|
635
|
+
depends_on:
|
|
636
|
+
- name: is_role
|
|
637
|
+
kwargs:
|
|
638
|
+
role: assistant
|
|
639
|
+
metric_name: assistant
|
|
640
|
+
metric_min_value: 1
|
|
641
|
+
|
|
642
|
+
- name: count_messages_per_role
|
|
643
|
+
|
|
644
|
+
- name: error_count
|
|
645
|
+
|
|
646
|
+
- name: rendering_error_count
|
|
647
|
+
|
|
648
|
+
- name: flesch_kincaid_grade
|
|
649
|
+
|
|
650
|
+
plotting_some_rubrics:
|
|
651
|
+
data:
|
|
652
|
+
- ../../data/plotting-feb-2025/checkpoint-06022024-AIED.sqlite
|
|
653
|
+
do_completion: False
|
|
654
|
+
|
|
655
|
+
name: Plotting evaluation evals
|
|
656
|
+
notes: my notes
|
|
657
|
+
|
|
658
|
+
# anything in here will override the matching entries in src/llm-evals/config.yaml (presumably flexeval/config.yaml in this package - confirm)
|
|
659
|
+
config:
|
|
660
|
+
max_workers: 6
|
|
661
|
+
|
|
662
|
+
metrics:
|
|
663
|
+
function:
|
|
664
|
+
- name: string_length
|
|
665
|
+
metric_level: Thread
|
|
666
|
+
|
|
667
|
+
- name: is_role
|
|
668
|
+
kwargs:
|
|
669
|
+
role: assistant
|
|
670
|
+
|
|
671
|
+
- name: is_role
|
|
672
|
+
kwargs:
|
|
673
|
+
role: user
|
|
674
|
+
|
|
675
|
+
- name: is_role
|
|
676
|
+
metric_level: Message
|
|
677
|
+
kwargs:
|
|
678
|
+
role: user
|
|
679
|
+
|
|
680
|
+
- name: is_role
|
|
681
|
+
metric_level: Message
|
|
682
|
+
kwargs:
|
|
683
|
+
role: assistant
|
|
684
|
+
|
|
685
|
+
- name: is_langgraph_type
|
|
686
|
+
metric_level: Message
|
|
687
|
+
kwargs:
|
|
688
|
+
type: ai
|
|
689
|
+
|
|
690
|
+
- name: string_length
|
|
691
|
+
metric_level: Turn
|
|
692
|
+
|
|
693
|
+
- name: string_length
|
|
694
|
+
metric_level: Message
|
|
695
|
+
|
|
696
|
+
- name: count_tool_calls_by_name
|
|
697
|
+
metric_level: ToolCall
|
|
698
|
+
|
|
699
|
+
- name: count_tool_calls
|
|
700
|
+
metric_level: Turn
|
|
701
|
+
depends_on:
|
|
702
|
+
- name: is_role
|
|
703
|
+
kwargs:
|
|
704
|
+
role: assistant
|
|
705
|
+
metric_name: assistant
|
|
706
|
+
metric_min_value: 1
|
|
707
|
+
|
|
708
|
+
|
|
709
|
+
- name: count_tool_calls_by_name
|
|
710
|
+
metric_level: Thread
|
|
711
|
+
|
|
712
|
+
- name: count_tool_calls_by_name
|
|
713
|
+
metric_level: Message
|
|
714
|
+
depends_on:
|
|
715
|
+
- name: is_role
|
|
716
|
+
kwargs:
|
|
717
|
+
role: assistant
|
|
718
|
+
metric_name: assistant
|
|
719
|
+
metric_min_value: 1
|
|
720
|
+
|
|
721
|
+
- name: count_tool_calls_by_name
|
|
722
|
+
metric_level: Turn
|
|
723
|
+
depends_on:
|
|
724
|
+
- name: is_role
|
|
725
|
+
kwargs:
|
|
726
|
+
role: assistant
|
|
727
|
+
metric_name: assistant
|
|
728
|
+
metric_min_value: 1
|
|
729
|
+
|
|
730
|
+
- name: count_numeric_tool_call_params_by_name
|
|
731
|
+
metric_level: ToolCall
|
|
732
|
+
|
|
733
|
+
- name: count_llm_models
|
|
734
|
+
metric_level: Thread
|
|
735
|
+
|
|
736
|
+
- name: message_matches_regex
|
|
737
|
+
kwargs:
|
|
738
|
+
expression: "Please use the appropriate available tools to generate the requested plot or diagram."
|
|
739
|
+
metric_level: Message
|
|
740
|
+
# metric_name: "ToolMessage_text_indicator"
|
|
741
|
+
|
|
742
|
+
- name: message_matches_regex
|
|
743
|
+
kwargs:
|
|
744
|
+
expression: "desmos"
|
|
745
|
+
metric_level: Message
|
|
746
|
+
|
|
747
|
+
- name: message_matches_regex
|
|
748
|
+
kwargs:
|
|
749
|
+
expression: "functions\\..*\\{"
|
|
750
|
+
metric_level: Message
|
|
751
|
+
# metric_name: "function_call_in_content"
|
|
752
|
+
depends_on:
|
|
753
|
+
- name: is_langgraph_type
|
|
754
|
+
kwargs:
|
|
755
|
+
type: ai
|
|
756
|
+
metric_name: ai
|
|
757
|
+
metric_min_value: 1
|
|
758
|
+
|
|
759
|
+
- name: message_matches_regex
|
|
760
|
+
kwargs:
|
|
761
|
+
expression: "exception"
|
|
762
|
+
metric_level: Message
|
|
763
|
+
# metric_name: "exception_in_content"
|
|
764
|
+
depends_on:
|
|
765
|
+
- name: is_role
|
|
766
|
+
kwargs:
|
|
767
|
+
role: assistant
|
|
768
|
+
metric_name: assistant
|
|
769
|
+
metric_min_value: 1
|
|
770
|
+
|
|
771
|
+
- name: count_of_parts_matching_regex
|
|
772
|
+
kwargs:
|
|
773
|
+
expression: "Please use the appropriate available tools to generate the requested plot or diagram"
|
|
774
|
+
metric_level: Turn
|
|
775
|
+
# metric_name: "contains_plot_description"
|
|
776
|
+
|
|
777
|
+
- name: count_messages_per_role
|
|
778
|
+
kwargs:
|
|
779
|
+
use_langgraph_roles: true
|
|
780
|
+
metric_level: Thread
|
|
781
|
+
|
|
782
|
+
- name: count_tokens
|
|
783
|
+
metric_level: Turn
|
|
784
|
+
|
|
785
|
+
- name: count_errors
|
|
786
|
+
metric_level: Thread
|
|
787
|
+
|
|
788
|
+
- name: message_matches_regex
|
|
789
|
+
kwargs:
|
|
790
|
+
expression: "An error occurred while processing your request"
|
|
791
|
+
metric_level: Message
|
|
792
|
+
# metric_name: "error_in_content"
|
|
793
|
+
|
|
794
|
+
- name: count_of_parts_matching_regex
|
|
795
|
+
kwargs:
|
|
796
|
+
expression: "An error occurred while processing your request"
|
|
797
|
+
metric_level: Turn
|
|
798
|
+
|
|
799
|
+
rubric:
|
|
800
|
+
- name: is_request_for_plot
|
|
801
|
+
context_only: false
|
|
802
|
+
metric_level: Turn
|
|
803
|
+
depends_on:
|
|
804
|
+
- name: is_role
|
|
805
|
+
kwargs:
|
|
806
|
+
role: assistant
|
|
807
|
+
metric_name: assistant
|
|
808
|
+
metric_min_value: 1
|
|
809
|
+
|
|
810
|
+
- name: is_student_acting_as_tutor
|
|
811
|
+
context_only: false
|
|
812
|
+
metric_level: Turn
|
|
813
|
+
depends_on:
|
|
814
|
+
- name: is_role
|
|
815
|
+
kwargs:
|
|
816
|
+
role: user
|
|
817
|
+
metric_name: user
|
|
818
|
+
metric_min_value: 1
|
|
819
|
+
|
|
820
|
+
- name: is_pedagogically_appropriate_plot
|
|
821
|
+
context_only: false
|
|
822
|
+
metric_level: Turn
|
|
823
|
+
depends_on:
|
|
824
|
+
- name: count_tool_calls
|
|
825
|
+
metric_min_value: 1
|
|
826
|
+
|
|
827
|
+
- name: plot_matches_description
|
|
828
|
+
context_only: false
|
|
829
|
+
metric_level: Turn
|
|
830
|
+
depends_on:
|
|
831
|
+
- name: count_tool_calls
|
|
832
|
+
metric_min_value: 1
|
|
833
|
+
- name: count_of_parts_matching_regex
|
|
834
|
+
kwargs:
|
|
835
|
+
expression: "An error occurred while processing your request"
|
|
836
|
+
metric_name: "An error occurred while processing your request"
|
|
837
|
+
metric_max_value: 0
|
|
838
|
+
|
|
839
|
+
- name: plot_bounds_are_sufficiently_large
|
|
840
|
+
context_only: false
|
|
841
|
+
metric_level: Turn
|
|
842
|
+
depends_on:
|
|
843
|
+
- name: count_tool_calls
|
|
844
|
+
metric_min_value: 1
|
|
845
|
+
- name: count_of_parts_matching_regex
|
|
846
|
+
kwargs:
|
|
847
|
+
expression: "An error occurred while processing your request"
|
|
848
|
+
metric_name: "An error occurred while processing your request"
|
|
849
|
+
metric_max_value: 0
|
|
850
|
+
|
|
851
|
+
# completion_llm:
|
|
852
|
+
# function_name: open_ai_completion
|
|
853
|
+
# include_system_prompt: False
|
|
854
|
+
# kwargs:
|
|
855
|
+
# model_name: gpt-3.5-turbo
|
|
856
|
+
# api_key_name: OPENAI_API_KEY
|
|
857
|
+
# n: 2
|
|
858
|
+
|
|
859
|
+
grader_llm:
|
|
860
|
+
function_name: open_ai_completion
|
|
861
|
+
kwargs:
|
|
862
|
+
model_name: o3-mini
|
|
863
|
+
api_key_name: OPENAI_API_KEY
|
|
864
|
+
|