python-flexeval 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flexeval/__about__.py +1 -1
- flexeval/classes/dataset.py +12 -72
- flexeval/classes/eval_set_run.py +18 -7
- flexeval/classes/jsonview.py +10 -5
- flexeval/classes/message.py +11 -5
- flexeval/classes/metric.py +0 -8
- flexeval/classes/thread.py +0 -2
- flexeval/classes/tool_call.py +0 -2
- flexeval/classes/turn.py +7 -5
- flexeval/completions.py +8 -5
- flexeval/compute_metrics.py +45 -32
- flexeval/configuration/evals.yaml +2 -25
- flexeval/data_loader.py +219 -317
- flexeval/db_utils.py +11 -2
- flexeval/dependency_graph.py +3 -3
- flexeval/eval_schema.json +0 -18
- flexeval/function_types.py +2 -13
- flexeval/metrics/save.py +12 -8
- flexeval/run_utils.py +163 -17
- flexeval/runner.py +6 -14
- flexeval/schema/config_schema.py +12 -0
- flexeval/schema/eval_schema.py +3 -0
- flexeval/schema/evalrun_schema.py +41 -10
- {python_flexeval-0.3.0.dist-info → python_flexeval-0.4.0.dist-info}/METADATA +3 -3
- python_flexeval-0.4.0.dist-info/RECORD +49 -0
- {python_flexeval-0.3.0.dist-info → python_flexeval-0.4.0.dist-info}/WHEEL +1 -1
- python_flexeval-0.3.0.dist-info/RECORD +0 -49
- {python_flexeval-0.3.0.dist-info → python_flexeval-0.4.0.dist-info}/entry_points.txt +0 -0
- {python_flexeval-0.3.0.dist-info → python_flexeval-0.4.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -49,7 +49,6 @@ multimodels-langgraph-rubric-dependencies:
|
|
|
49
49
|
rubric:
|
|
50
50
|
# check if student requested a plot in their last message
|
|
51
51
|
- name: is_request_for_plot
|
|
52
|
-
# context_only: false
|
|
53
52
|
metric_level: Turn
|
|
54
53
|
depends_on:
|
|
55
54
|
- name: is_role
|
|
@@ -134,7 +133,6 @@ test-multimodels-langgraph-rubric-dependencies:
|
|
|
134
133
|
|
|
135
134
|
rubric:
|
|
136
135
|
- name: is_request_for_plot
|
|
137
|
-
# context_only: false
|
|
138
136
|
metric_level: Turn
|
|
139
137
|
depends_on:
|
|
140
138
|
- name: is_role
|
|
@@ -181,7 +179,6 @@ test-multimodels-langgraph:
|
|
|
181
179
|
|
|
182
180
|
rubric:
|
|
183
181
|
- name: no_plot_after_student_requested
|
|
184
|
-
context_only: false
|
|
185
182
|
depends_on:
|
|
186
183
|
- name: is_role
|
|
187
184
|
type: function
|
|
@@ -267,7 +264,6 @@ issues-fixing-rubrics:
|
|
|
267
264
|
metric_min_value: 1
|
|
268
265
|
|
|
269
266
|
- name: no_plot_after_student_requested
|
|
270
|
-
context_only: false
|
|
271
267
|
depends_on:
|
|
272
268
|
- name: is_role
|
|
273
269
|
type: function
|
|
@@ -277,7 +273,6 @@ issues-fixing-rubrics:
|
|
|
277
273
|
metric_min_value: 1
|
|
278
274
|
|
|
279
275
|
- name: desmos_code_rendered_on_toolcall
|
|
280
|
-
context_only: false
|
|
281
276
|
metric_level: Turn
|
|
282
277
|
depends_on:
|
|
283
278
|
- name: count_tool_calls
|
|
@@ -326,7 +321,6 @@ test-multiple-rubrics-files:
|
|
|
326
321
|
|
|
327
322
|
rubric:
|
|
328
323
|
- name: plot_matches_description
|
|
329
|
-
context_only: false
|
|
330
324
|
metric_level: Turn
|
|
331
325
|
depends_on:
|
|
332
326
|
- name: count_tool_calls
|
|
@@ -348,7 +342,6 @@ test-multiple-rubrics-files:
|
|
|
348
342
|
metric_min_value: 1
|
|
349
343
|
|
|
350
344
|
- name: plot_matches_description_example_project
|
|
351
|
-
context_only: false
|
|
352
345
|
metric_level: Turn
|
|
353
346
|
depends_on:
|
|
354
347
|
- name: count_tool_calls
|
|
@@ -379,17 +372,12 @@ test-new:
|
|
|
379
372
|
function:
|
|
380
373
|
- name: string_length
|
|
381
374
|
|
|
382
|
-
- name: openai_moderation_api #run for every turn by default
|
|
383
|
-
context_only: false #this is the default
|
|
384
|
-
|
|
385
375
|
- name: openai_moderation_api
|
|
386
|
-
|
|
387
|
-
|
|
376
|
+
|
|
388
377
|
rubric:
|
|
389
378
|
- name: yeasayer_completion
|
|
390
379
|
depends_on:
|
|
391
380
|
- name: openai_moderation_api
|
|
392
|
-
context_only: true
|
|
393
381
|
metric_min_value: 0.1
|
|
394
382
|
|
|
395
383
|
completion_llm:
|
|
@@ -451,11 +439,7 @@ example:
|
|
|
451
439
|
|
|
452
440
|
- name: count_messages_per_role
|
|
453
441
|
|
|
454
|
-
- name: openai_moderation_api #run for every turn by default
|
|
455
|
-
context_only: false #this is the default
|
|
456
|
-
|
|
457
442
|
- name: openai_moderation_api
|
|
458
|
-
context_only: true
|
|
459
443
|
|
|
460
444
|
- name: flesch_kincaid_grade
|
|
461
445
|
|
|
@@ -463,11 +447,9 @@ example:
|
|
|
463
447
|
- name: yeasayer_completion
|
|
464
448
|
depends_on:
|
|
465
449
|
- name: openai_moderation_api
|
|
466
|
-
context_only: true
|
|
467
450
|
metric_min_value: 0.1
|
|
468
451
|
|
|
469
452
|
- name: is_request_for_plot
|
|
470
|
-
context_only: true
|
|
471
453
|
depends_on:
|
|
472
454
|
- name: is_role
|
|
473
455
|
type: function
|
|
@@ -798,7 +780,6 @@ plotting_some_rubrics:
|
|
|
798
780
|
|
|
799
781
|
rubric:
|
|
800
782
|
- name: is_request_for_plot
|
|
801
|
-
context_only: false
|
|
802
783
|
metric_level: Turn
|
|
803
784
|
depends_on:
|
|
804
785
|
- name: is_role
|
|
@@ -808,7 +789,6 @@ plotting_some_rubrics:
|
|
|
808
789
|
metric_min_value: 1
|
|
809
790
|
|
|
810
791
|
- name: is_student_acting_as_tutor
|
|
811
|
-
context_only: false
|
|
812
792
|
metric_level: Turn
|
|
813
793
|
depends_on:
|
|
814
794
|
- name: is_role
|
|
@@ -818,14 +798,12 @@ plotting_some_rubrics:
|
|
|
818
798
|
metric_min_value: 1
|
|
819
799
|
|
|
820
800
|
- name: is_pedagogically_appropriate_plot
|
|
821
|
-
context_only: false
|
|
822
801
|
metric_level: Turn
|
|
823
802
|
depends_on:
|
|
824
803
|
- name: count_tool_calls
|
|
825
804
|
metric_min_value: 1
|
|
826
|
-
|
|
805
|
+
|
|
827
806
|
- name: plot_matches_description
|
|
828
|
-
context_only: false
|
|
829
807
|
metric_level: Turn
|
|
830
808
|
depends_on:
|
|
831
809
|
- name: count_tool_calls
|
|
@@ -837,7 +815,6 @@ plotting_some_rubrics:
|
|
|
837
815
|
metric_max_value: 0
|
|
838
816
|
|
|
839
817
|
- name: plot_bounds_are_sufficiently_large
|
|
840
|
-
context_only: false
|
|
841
818
|
metric_level: Turn
|
|
842
819
|
depends_on:
|
|
843
820
|
- name: count_tool_calls
|