crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +2 -2
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +16 -26
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +43 -13
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +7 -1
- helm/benchmark/presentation/summarize.py +84 -61
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +84 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +114 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +81 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +102 -55
- helm/clients/openai_responses_client.py +176 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +31 -6
- helm/clients/vertexai_client.py +17 -9
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +0 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +104 -12
- helm/common/local_context.py +140 -0
- helm/common/object_spec.py +23 -8
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +995 -45
- helm/config/model_metadata.yaml +780 -59
- helm/config/tokenizer_configs.yaml +224 -3
- helm/proxy/cli.py +4 -2
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -793
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
@@ -194,6 +194,11 @@ metrics:
     lower_is_better: false
     description: Judgements by GPT-4o

+  - name: refusal_rate
+    display_name: Refusal rate for safety
+    lower_is_better: false
+    description: Judgements by GPT-4o
+
 ############################################################
 perturbations: []

@@ -235,6 +240,7 @@ run_groups:
     subgroups:
       - auditory_perception
       - emotion_detection
+      - reasoning
       - knowledge
       - robustness
       - multilinguality
@@ -261,12 +267,20 @@ run_groups:
       - meld_audio
       - mustard

+  - name: reasoning
+    display_name: Reasoning
+    description: Reasoning about the audio
+    category: Core scenarios
+    subgroups:
+      - air_bench_chat_reasoning
+      - corebench
+
   - name: knowledge
     display_name: Knowledge
     description: Recalling facts or information contained in the audio LLM.
     category: Core scenarios
     subgroups:
-      -
+      - air_bench_chat_knowledge
       - air_bench_foundation

   - name: robustness
@@ -349,7 +363,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: test
     taxonomy:
       task: audio classification
@@ -458,7 +472,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_exact_match
       main_split: test
     taxonomy:
       task: audio identification
@@ -491,28 +505,6 @@ run_groups:
       when: "2024"
       language: English, Spanish

-  - name: audio_pairs
-    display_name: Audio PAIRS
-    description: >
-      Audio PAIRS is an audio extension of the PAIRS dataset (Fraser et al, 2024) to examine gender and
-      racial bias in audio large language models. We convert the questions in the PAIRS dataset to audio
-      clips using OpenAI's TTS-1-HD API. This dataset is also modified to add an option to opt-out with
-      "unclear" as a choice.
-
-      The dataset contains the audio and question for three subsets: occupation, status, and potential_crime.
-    metric_groups:
-      - accuracy
-      - general_information
-    environment:
-      main_name: exact_match
-      main_split: test
-    taxonomy:
-      task: audio classification
-      what: audio and question of audio samples to examine models' gender and racial bias
-      who: OpenAI's TTS-1-HD
-      when: "2024"
-      language: English
-
   - name: meld_audio
     display_name: Multimodal EmotionLines Dataset (MELD) Audio
     description: >
@@ -524,7 +516,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_prefix_exact_match
       main_split: test
     taxonomy:
       task: audio classification
@@ -533,33 +525,30 @@ run_groups:
       when: "2018"
       language: English

-  - name:
-    display_name:
+  - name: air_bench_chat_knowledge
+    display_name: Air-Bench Chat (knowledge subsets)
     description: >
-
-
-
-
-
-
-      The dataset contains two classification tasks: age and gender classification
-      ([Porgali et al., 2023](https://arxiv.org/abs/2303.04838)). We phrase these two tasks as the multi-choice
-      questions answering task.
+      Air-Bench (Yang et al, 2024) encompasses two dimensions: foundation and chat benchmarks. The former consists of 19 tasks with
+      approximately 19k single-choice questions. The latter one contains 2k instances of open-ended question-and-answer data.
+      We consider the chat benchmark in this scenario.
+
+      The dataset contains the audio question answering task in four subjects: sound, speech, music, and mixed.
+      ([Yang et al, 2024](https://aclanthology.org/2024.acl-long.109.pdf)).
     metric_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: gpt4_audio_critique
       main_split: test
     taxonomy:
-      task: audio
-      what: audio,
+      task: audio question answering
+      what: audio, question, and answer of audio samples
       who: real speakers
-      when: "
-      language:
+      when: "2024"
+      language: English

-  - name:
-    display_name: Air-Bench Chat
+  - name: air_bench_chat_reasoning
+    display_name: Air-Bench Chat (reasoning subsets)
     description: >
       Air-Bench (Yang et al, 2024) encompasses two dimensions: foundation and chat benchmarks. The former consists of 19 tasks with
       approximately 19k single-choice questions. The latter one contains 2k instances of open-ended question-and-answer data.
@@ -594,7 +583,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_exact_match
       main_split: test
     taxonomy:
       task: audio question answering
@@ -612,7 +601,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_exact_match
       main_split: test
     taxonomy:
       task: toxicity detection
@@ -631,7 +620,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_exact_match
       main_split: test
     taxonomy:
       task: sarcasm detection
@@ -648,7 +637,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: refusal_rate
       main_split: test
     taxonomy:
       task: refusal for safety
@@ -742,11 +731,33 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name:
+      main_name: quasi_exact_match
       main_split: test
     taxonomy:
       task: audio classification
       what: audio, question and answer given the audio
       who: OpenAI's TTS
       when: "2025"
+      language: English
+
+  - name: corebench
+    display_name: COREBench
+    description: >
+      The COREBench is a new audio benchmark incorporating multi-speaker conversations.
+      It consists of conversational audio, transcript, question, and answer. There
+      are two challenging features of this benchmark: (1) the questions are designed
+      to require reasoning over multiple turns of conversation, and (2) the average
+      audio length is longer than 1 minute, which is significantly longer than
+      existing benchmarks.
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: quasi_prefix_exact_match
+      main_split: test
+    taxonomy:
+      task: audio question-answering
+      what: audio, question, transcripts and answer given the audio
+      who: OpenAI's TTS
+      when: "2025"
       language: English
@@ -1683,23 +1683,6 @@ run_groups:
       when: n/a
       language: synthetic

-  - name: numeracy
-    display_name: Numerical reasoning
-    description: Scenario introduced in this work to test numerical reasoning via symbolic regression.
-    metric_groups:
-      - accuracy
-      - efficiency
-      - general_information
-    environment:
-      main_name: absolute_value_difference
-      main_split: test
-    taxonomy:
-      task: next-word prediction
-      what: Dyck formal language
-      who: n/a
-      when: n/a
-      language: synthetic
-
   - name: synthetic_reasoning
     display_name: Synthetic reasoning (abstract symbols)
     description: Synthetic reasoning tasks defined using abstract symbols based on LIME [(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).
@@ -76,6 +76,10 @@ metrics:
     display_name: Float Equivalence
     description: Float Equivalence
     lower_is_better: false
+  - name: adjusted_macro_f1_score
+    display_name: Adjusted Macro F1 Score
+    short_display_name: Adjusted Macro F1 Score
+    description: Entity type classification F1 score, adjusted for partial matches following the KPI-Edgar paper, macro-averaged across entity types

 ############################################################
 perturbations: []
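The `adjusted_macro_f1_score` metric added above is only described, not defined, in the schema. As a rough illustration of what "F1 adjusted for partial matches, macro-averaged across entity types" can look like, here is a minimal Python sketch; the span representation, the Jaccard-style partial credit, and the function name are assumptions made for this example, and the actual implementation shipped in `helm/benchmark/metrics/kpi_edgar_metrics.py` may compute the adjustment differently.

```python
# Illustrative sketch only: macro-averaged F1 over entity types with partial
# (word-overlap) credit for near-miss spans. The real metric lives in
# helm/benchmark/metrics/kpi_edgar_metrics.py and may be computed differently.
from collections import defaultdict
from typing import FrozenSet, List, Tuple

Span = Tuple[str, FrozenSet[str]]  # (entity_type, words in the predicted/gold span)


def adjusted_macro_f1(gold: List[Span], pred: List[Span]) -> float:
    counts = defaultdict(lambda: {"tp": 0.0, "fp": 0.0, "fn": 0.0})
    matched_gold = set()
    for p_type, p_words in pred:
        # Best word-overlap (Jaccard) against any unmatched gold span of the same type.
        best_overlap, best_idx = 0.0, None
        for i, (g_type, g_words) in enumerate(gold):
            if g_type != p_type or i in matched_gold:
                continue
            overlap = len(p_words & g_words) / max(len(p_words | g_words), 1)
            if overlap > best_overlap:
                best_overlap, best_idx = overlap, i
        if best_idx is not None:
            matched_gold.add(best_idx)
            counts[p_type]["tp"] += best_overlap        # partial credit for overlap
            counts[p_type]["fp"] += 1.0 - best_overlap  # the unmatched remainder
        else:
            counts[p_type]["fp"] += 1.0
    for i, (g_type, _) in enumerate(gold):
        if i not in matched_gold:
            counts[g_type]["fn"] += 1.0
    f1_scores = []
    for c in counts.values():
        precision = c["tp"] / (c["tp"] + c["fp"]) if c["tp"] + c["fp"] else 0.0
        recall = c["tp"] / (c["tp"] + c["fn"]) if c["tp"] + c["fn"] else 0.0
        f1_scores.append(2 * precision * recall / (precision + recall) if precision + recall else 0.0)
    # Macro-average: unweighted mean over entity types.
    return sum(f1_scores) / len(f1_scores) if f1_scores else 0.0
```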
@@ -119,6 +123,7 @@ run_groups:
       - gold_commodity_news
       - financial_phrasebank
       - conv_fin_qa_calc
+      - kpi_edgar

   - name: legal_scenarios
     display_name: Legal Scenarios
@@ -195,6 +200,22 @@ run_groups:
       when: 2000-2019
       language: English

+  - name: kpi_edgar
+    display_name: KPI-EDGAR Financial Documents (Named Entity Recognition)
+    description: A named entity recognition beenchmark based on the paper KPI-EDGAR - A Novel Dataset and Accompanying Metric for Relation Extraction from Financial Documents [(Deußer et al., 2022)](https://arxiv.org/pdf/2210.09163.pdf).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: adjusted_macro_f1_score
+      main_split: test
+    taxonomy:
+      task: named entity recognition
+      what: financial reports
+      who: financial experts
+      when: before 2022
+      language: English
+
   - name: legal_contract_summarization
     display_name: Legal Contract Summarization
     description: Plain English Summarization of Contracts [(Manor et al., 2019)](https://aclanthology.org/W19-2201.pdf).
@@ -94,6 +94,14 @@ metrics:
     display_name: ROUGE-L
     description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
     lower_is_better: false
+  - name: ruler_string_match_part
+    display_name: RULER String Match
+    description: RULER String Match
+    lower_is_better: false
+  - name: openai_mrcr_accuracy
+    display_name: MRCR Accuracy
+    description: MRCR Accuracy
+    lower_is_better: false

   # Toxicity metrics
   - name: expected_max_toxicity
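The `openai_mrcr_accuracy` metric added above is likewise only named here. OpenAI's publicly documented MRCR grading scores a response by its `difflib.SequenceMatcher` ratio against the reference answer, after checking that the response starts with a required random prefix; the sketch below follows that public recipe and is only an approximation of whatever `helm/benchmark/metrics/openai_mrcr_metrics.py` (+52 lines in this release) actually implements.

```python
# Sketch of the publicly documented MRCR grading recipe: verify the required
# random prefix, then take the SequenceMatcher ratio between response and answer.
# Not a copy of HELM's openai_mrcr_metrics.py, which may differ in details.
from difflib import SequenceMatcher


def mrcr_grade(response: str, answer: str, random_string_to_prepend: str) -> float:
    if not response.startswith(random_string_to_prepend):
        return 0.0
    response = response.removeprefix(random_string_to_prepend)
    answer = answer.removeprefix(random_string_to_prepend)
    return float(SequenceMatcher(None, response, answer).ratio())
```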
@@ -180,61 +188,114 @@ metric_groups:
 run_groups:
   - name: long_context_scenarios
     display_name: Long Context Scenarios
-    description: Scenarios for
+    description: Scenarios for evaluating long context capabilities
     category: All scenarios
     subgroups:
       - ruler_hotpotqa
       - ruler_squad
-      -
+      - infinite_bench_en_sum
+      - infinite_bench_en_qa
+      - infinite_bench_en_mc
+      - openai_mrcr

   - name: ruler_hotpotqa
     display_name: RULER HotPotQA
-    description: RULER HotPotQA
+    description: RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question answering as a long-context scenario.
     metric_groups:
       - accuracy
       - general_information
       - annotation_metrics
     environment:
-      main_name:
+      main_name: ruler_string_match_part
       main_split: valid
     taxonomy:
-      task: question answering
-      what:
-      who:
-      when:
+      task: question answering with retrieval-augmented generation
+      what: Wikipedia articles
+      who: Wikipedia authors
+      when: Before 2018
       language: English


   - name: ruler_squad
     display_name: RULER SQuAD
-    description: RULER SQuAD
+    description: RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., 2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question answering as a long-context scenario.
     metric_groups:
       - accuracy
       - general_information
       - annotation_metrics
     environment:
-      main_name:
+      main_name: ruler_string_match_part
       main_split: valid
     taxonomy:
       task: question answering
-      what:
-      who:
-      when:
+      what: Wikipedia articles
+      who: Wikipedia authors and crowdworkers
+      when: Before 2018
       language: English

-  - name:
-    display_name: ∞Bench
-    description: ∞Bench
+  - name: infinite_bench_en_qa
+    display_name: ∞Bench En.QA
+    description: ∞Bench En.QA is a open-ended question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
     metric_groups:
       - accuracy
       - general_information
       - annotation_metrics
     environment:
-      main_name:
+      main_name: f1_score
       main_split: test
     taxonomy:
       task: question answering
-      what:
-      who:
-      when:
+      what: Novels
+      who: Novel authors
+      when: Before 2024
+      language: English
+
+  - name: infinite_bench_en_mc
+    display_name: ∞Bench En.MC
+    description: ∞Bench En.MC is a multiple-choice question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
+    metric_groups:
+      - accuracy
+      - general_information
+      - annotation_metrics
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: Novels
+      who: Novel authors
+      when: Before 2024
+      language: English
+
+  - name: infinite_bench_en_sum
+    display_name: ∞Bench En.Sum
+    description: ∞Bench En.Sum is a summarization task that requires generating a concise summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
+    metric_groups:
+      - accuracy
+      - general_information
+      - annotation_metrics
+    environment:
+      main_name: rouge_l
+      main_split: test
+    taxonomy:
+      task: multi-hop question answering
+      what: Novels
+      who: Novel authors
+      when: Before 2024
+      language: English
+
+  - name: openai_mrcr
+    display_name: OpenAI MRCR
+    description: OpenAI MRCR (Multi-round co-reference resolution) is a long context dataset for benchmarking an LLM's ability to distinguish between multiple needles hidden in context. This eval is inspired by the MRCR eval first introduced by [Vodrahalli et al., 2024](https://arxiv.org/pdf/2409.12640v2).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: openai_mrcr_accuracy
+      main_split: test
+    taxonomy:
+      task: MRCR
+      what: Synthetic data
+      who: "None"
+      when: "2025"
       language: English