crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +2 -2
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +16 -26
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +43 -13
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +7 -1
- helm/benchmark/presentation/summarize.py +84 -61
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +84 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +114 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +81 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +102 -55
- helm/clients/openai_responses_client.py +176 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +31 -6
- helm/clients/vertexai_client.py +17 -9
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +0 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +104 -12
- helm/common/local_context.py +140 -0
- helm/common/object_spec.py +23 -8
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +995 -45
- helm/config/model_metadata.yaml +780 -59
- helm/config/tokenizer_configs.yaml +224 -3
- helm/proxy/cli.py +4 -2
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -793
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/static/schema_slphelm.yaml (added)

@@ -0,0 +1,162 @@
+---
+############################################################
+metrics:
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
+
+  # Classification metrics:
+  - name: classification_macro_f1
+    display_name: Macro-F1
+    description: Population-level macro-averaged F1 score.
+    lower_is_better: false
+  - name: classification_micro_f1
+    display_name: Micro-F1
+    description: Population-level micro-averaged F1 score.
+    lower_is_better: false
+
+  # Speech-specific metrics:
+  - name: wer
+    display_name: Word Error Rate
+    short_display_name: WER
+    description: Word Error Rate for automatic speech recognition evaluation.
+    lower_is_better: true
+  - name: mer
+    display_name: Match Error Rate
+    short_display_name: MER
+    description: Match Error Rate for automatic speech recognition evaluation.
+    lower_is_better: true
+  - name: wip
+    display_name: Word Information Preserved
+    short_display_name: WIP
+    description: Word Information Preserved for automatic speech recognition evaluation.
+    lower_is_better: false
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: classification_metrics
+    display_name: Classification metrics
+    metrics:
+      - name: classification_macro_f1
+        split: ${main_split}
+      - name: classification_micro_f1
+        split: ${main_split}
+      - name: exact_match
+        split: ${main_split}
+
+  - name: speech_metrics
+    display_name: Speech metrics
+    metrics:
+      - name: wer
+        split: ${main_split}
+      - name: mer
+        split: ${main_split}
+      - name: wip
+        split: ${main_split}
+
+#######################################################
+run_groups:
+  - name: slphelm
+    display_name: SLPHelm Scenarios
+    description: Scenarios for speech language processing evaluation
+    category: All scenarios
+    subgroups:
+      - slphelm_disorder_diagnosis
+      - slphelm_asr_disorder_diagnosis
+      - slphelm_asr_transcription
+      - slphelm_disorder_type_classification
+      - slphelm_disorder_symptom_classification
+
+  - name: slphelm_disorder_diagnosis
+    display_name: Disorder Diagnosis
+    description: Speech-based disorder diagnosis evaluation using audio input.
+    category: Disorder Diagnosis
+    metric_groups:
+      - classification_metrics
+    environment:
+      main_name: classification_macro_f1
+      main_split: test
+    taxonomy:
+      task: disorder diagnosis
+      what: "Speech disorder classification from audio"
+      who: "Children with speech disorders"
+      when: "Clinical assessment"
+      language: English
+
+  - name: slphelm_asr_disorder_diagnosis
+    display_name: ASR-Based Disorder Diagnosis
+    description: Disorder diagnosis based on automatic speech recognition transcription accuracy.
+    category: ASR-Based Disorder Diagnosis
+    metric_groups:
+      - classification_metrics
+    environment:
+      main_name: classification_macro_f1
+      main_split: test
+    taxonomy:
+      task: asr-based disorder diagnosis
+      what: "Disorder classification from transcription accuracy"
+      who: "Children with speech disorders"
+      when: "Clinical assessment"
+      language: English
+
+  - name: slphelm_asr_transcription
+    display_name: ASR-Transcription
+    description: Automatic speech recognition transcription quality evaluation.
+    category: ASR-Transcription
+    metric_groups:
+      - speech_metrics
+    environment:
+      main_name: wer
+      main_split: test
+    taxonomy:
+      task: automatic speech recognition
+      what: "Speech transcription accuracy"
+      who: "Children with speech disorders"
+      when: "Clinical assessment"
+      language: English
+
+  - name: slphelm_disorder_type_classification
+    display_name: Disorder Type Classification
+    description: Classification of specific types of speech disorders.
+    category: Disorder Type Classification
+    metric_groups:
+      - classification_metrics
+    environment:
+      main_name: classification_macro_f1
+      main_split: test
+    taxonomy:
+      task: disorder type classification
+      what: "Specific speech disorder type identification"
+      who: "Children with speech disorders"
+      when: "Clinical assessment"
+      language: English
+
+  - name: slphelm_disorder_symptom_classification
+    display_name: Disorder Symptom Classification
+    description: Classification of specific symptoms within speech disorders.
+    category: Disorder Symptom Classification
+    metric_groups:
+      - classification_metrics
+    environment:
+      main_name: classification_macro_f1
+      main_split: test
+    taxonomy:
+      task: disorder symptom classification
+      what: "Speech disorder symptom identification"
+      who: "Children with speech disorders"
+      when: "Clinical assessment"
+      language: English
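Note on the schema above: the `accuracy` metric group uses `${main_name}` and `${main_split}` placeholders that each run group fills in through its `environment` block (for example, `slphelm_asr_transcription` sets `main_name: wer` and `main_split: test`). The sketch below illustrates that substitution mechanism only; the helper name and data shapes are assumptions for the example, not HELM's actual implementation.

```python
from string import Template

# Hypothetical helper: resolve ${main_name}/${main_split} placeholders in a
# metric_group entry using a run group's "environment" mapping, mirroring the
# schema_slphelm.yaml fragment above. Illustrative only, not HELM's API.
def resolve_metric_group(metric_group: dict, environment: dict) -> dict:
    resolved = []
    for metric in metric_group["metrics"]:
        resolved.append(
            {key: Template(str(value)).safe_substitute(environment)
             for key, value in metric.items()}
        )
    return {**metric_group, "metrics": resolved}

accuracy_group = {
    "name": "accuracy",
    "metrics": [{"name": "${main_name}", "split": "${main_split}"}],
}
asr_environment = {"main_name": "wer", "main_split": "test"}

# -> {'name': 'accuracy', 'metrics': [{'name': 'wer', 'split': 'test'}]}
print(resolve_metric_group(accuracy_group, asr_environment))
```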
helm/benchmark/static/schema_vhelm.yaml

@@ -307,6 +307,8 @@ run_groups:
     description: Does the model understand objects, counts and spatial relations? Can the model reason about both the text and image input?
     category: Core scenarios
     subgroups:
+      - mmmu
+      - exams_v
       - gqa
       - math_vista
       - seed_bench
@@ -320,7 +322,6 @@ run_groups:
     category: Core scenarios
     subgroups:
       - a_okvqa_base
-      - mmmu
       - mme
       - vibe_eval
       - mm_star_knowledge
@@ -369,7 +370,6 @@ run_groups:
       - a_okvqa_hindi
       - a_okvqa_spanish
       - a_okvqa_swahili
-      - exams_v
       - bingo_multilinguality
   - name: a_okvqa_base
     display_name: A-OKVQA
@@ -378,7 +378,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: multiple-choice question answering
@@ -394,7 +394,7 @@ run_groups:
       - fairness
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: multiple-choice question answering
@@ -410,7 +410,7 @@ run_groups:
       - translate
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: multiple-choice question answering
@@ -426,7 +426,7 @@ run_groups:
       - translate
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: multiple-choice question answering
@@ -442,7 +442,7 @@ run_groups:
       - translate
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
    taxonomy:
       task: multiple-choice question answering
@@ -458,7 +458,7 @@ run_groups:
       - translate
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: multiple-choice question answering
@@ -474,7 +474,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: multiple-choice question answering
@@ -490,7 +490,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: multiple-choice question answering
@@ -506,7 +506,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: multiple-choice question answering
@@ -522,7 +522,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: multiple-choice question answering
@@ -538,7 +538,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: multiple-choice question answering
@@ -554,7 +554,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: multiple-choice question answering
@@ -602,7 +602,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: short-answer question answering
@@ -618,7 +618,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: test
     taxonomy:
       task: toxicity identification
@@ -651,7 +651,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: short-answer question answering
@@ -667,7 +667,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: short-answer question answering
@@ -683,7 +683,7 @@ run_groups:
       - fairness
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: short-answer question answering
@@ -715,7 +715,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: multiple-choice question answering
@@ -795,7 +795,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: test
     taxonomy:
       task: short-answer question answering
@@ -811,7 +811,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: test
     taxonomy:
       task: multiple-choice question answering
@@ -827,7 +827,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: test
     taxonomy:
       task: multiple-choice question answering
@@ -875,7 +875,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: test
     taxonomy:
       task: multiple-choice question answering
@@ -891,7 +891,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: valid
     taxonomy:
       task: multiple-choice question answering
@@ -923,7 +923,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: test
     taxonomy:
       task: multiple-choice question answering
helm/benchmark/static/schema_video.yaml (added)

@@ -0,0 +1,219 @@
+---
+############################################################
+perturbations: []
+############################################################
+metrics:
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+  # Vision Language metrics [image]:
+  - name: earth_mover_similarity
+    display_name: Earth Mover Similarity
+    short_display_name: EMD-Sim
+    description: 1 - Earth Mover Distance [(Rubner and Tomasi, 2000)](https://www.cs.cmu.edu/~efros/courses/LBMV07/Papers/rubner-jcviu-00.pdf) between an image generated by the model and the target image.
+    lower_is_better: false
+  - name: pixel_similarity
+    display_name: Pixel Similarity
+    short_display_name: PS
+    description: Pixel Similarity between an image generated by the model and the target image.
+    lower_is_better: false
+  - name: sift_similarity
+    display_name: SIFT Similarity
+    short_display_name: SIFT
+    description: SIFT Similarity (Scale-Invariant Feature Transform) [(Lowe, 1999)](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=790410) between an image generated by the model and the target image.
+    lower_is_better: false
+  - name: compilation_success
+    display_name: Compilation success
+    description: Fraction of instances where the generated code compiles successfully.
+    lower_is_better: false
+  - name: lpips_similarity
+    display_name: LPIPS similarity
+    short_display_name: LPIPS
+    description: LPIPS similarity (Learned Perceptual Image Patch Similarity) [(Zhang et al., 2018)](https://arxiv.org/abs/1801.03924) between an image generated by the model and the target image.
+    lower_is_better: false
+  - name: fid_similarity
+    display_name: FID similarity
+    short_display_name: FID
+    description: FID similarity (Fréchet Inception Distance) [(Heusel et al., 2017)](https://arxiv.org/abs/1706.08500) between an image generated by the model and the target image.
+    lower_is_better: false
+  - name: ssim_similarity
+    display_name: SSIM
+    short_display_name: SSIM
+    description: SSIM similarity (Structural Similarity Index) [(Wang et al., 2004)](https://www.cns.nyu.edu/pub/eero/wang03-reprint.pdf) between an image generated by the model and the target image.
+    lower_is_better: false
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: quasi_leave_articles_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: prefix_exact_match
+    display_name: Prefix exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_prefix_exact_match
+    # TODO: should call this prefix_quasi_exact_match
+    display_name: Prefix quasi-exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+    lower_is_better: false
+
+  - name: rouge_1
+    display_name: ROUGE-1
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
+    lower_is_better: false
+  - name: rouge_2
+    display_name: ROUGE-2
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
+    lower_is_better: false
+  - name: rouge_l
+    display_name: ROUGE-L
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
+    lower_is_better: false
+  - name: bleu_1
+    display_name: BLEU-1
+    description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
+    lower_is_better: false
+  - name: bleu_4
+    display_name: BLEU-4
+    description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
+    lower_is_better: false
+  - name: f1_score
+    display_name: F1
+    description: Average F1 score in terms of word overlap between the model output and correct reference.
+    lower_is_better: false
+  - name: cider
+    display_name: CIDEr
+    description: Evaluates the quality of generated caption by measuring the weighted similarity of n-grams between the captions and a set of human-written reference captions, emphasizing informativeness and consensus.
+    lower_is_better: false
+  - name: prometheus_vision
+    display_name: Prometheus Vision rating
+    description: Scores are from 1 to 5, where 5 is the best.
+    lower_is_better: false
+
+  # Toxicity metrics
+  - name: expected_max_toxicity
+    display_name: Average max toxicity
+    short_display_name: Avg. max toxicity
+    lower_is_better: true
+    description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+  - name: max_toxicity_probability
+    display_name: max toxicity
+    lower_is_better: true
+    description: Fraction of instances where most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+  - name: toxic_frac
+    display_name: Toxic fraction
+    lower_is_better: true
+    description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+
+############################################################
+run_groups:
+  - name: core_scenarios
+    display_name: All
+    description: All scenarios across capabilities
+    category: All scenarios
+    subgroups:
+      - msr_vtt
+
+  - name: msr_vtt
+    display_name: MST-VTT
+    description: A large-scale video benchmark for video understanding, especially the emerging task of translating video to text.
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: f1_score
+      main_split: test
+    taxonomy:
+      task: captioning
+      what: Real-world videos
+      who: Human experts
+      when: "2016"
+      language: English
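As a quick sanity check, either of the added schema files can be loaded with a standard YAML parser to list its run groups and their headline metrics. A minimal sketch follows, assuming PyYAML is installed and that the path points at a local checkout of the packaged schema (the path is an assumption for the example):

```python
import yaml  # assumes PyYAML is available

# Assumed path; adjust to wherever the installed package places the schema.
SCHEMA_PATH = "helm/benchmark/static/schema_video.yaml"

with open(SCHEMA_PATH) as f:
    schema = yaml.safe_load(f)

# Print each run group together with the metric its environment selects
# as ${main_name} (e.g. msr_vtt -> f1_score in the diff above).
for group in schema.get("run_groups", []):
    environment = group.get("environment", {})
    print(group["name"], "->", environment.get("main_name", "n/a"))
```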