crfm-helm 0.5.5 → 0.5.7 (py3-none-any.whl)

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

This version of crfm-helm might be problematic.

Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
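The file-level listing above can be reproduced locally by comparing the RECORD manifests of the two wheels. Below is a minimal sketch using only the Python standard library; the wheel filenames are assumptions based on the versions being compared, not paths taken from this page.

import zipfile

def wheel_files(path: str) -> dict[str, str]:
    """Map each file listed in the wheel's RECORD manifest to its recorded hash."""
    with zipfile.ZipFile(path) as wheel:
        record_name = next(n for n in wheel.namelist() if n.endswith(".dist-info/RECORD"))
        lines = wheel.read(record_name).decode("utf-8").splitlines()
    entries = {}
    for line in lines:
        if not line.strip():
            continue
        # RECORD rows are "path,hash,size"; split from the right so paths survive.
        name, digest, _size = line.rsplit(",", 2)
        entries[name] = digest
    return entries

# Assumed local filenames for the two releases being compared.
old = wheel_files("crfm_helm-0.5.5-py3-none-any.whl")
new = wheel_files("crfm_helm-0.5.7-py3-none-any.whl")

added = sorted(set(new) - set(old))
removed = sorted(set(old) - set(new))
changed = sorted(n for n in set(old) & set(new) if old[n] != new[n])
print(f"added: {len(added)}, removed: {len(removed)}, changed: {len(changed)}")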
helm/benchmark/static/schema_audio.yaml

@@ -194,6 +194,11 @@ metrics:
     lower_is_better: false
     description: Judgements by GPT-4o
 
+  - name: refusal_rate
+    display_name: Refusal rate for safety
+    lower_is_better: false
+    description: Judgements by GPT-4o
+
 ############################################################
 perturbations: []
 
@@ -235,6 +240,7 @@ run_groups:
     subgroups:
       - auditory_perception
       - emotion_detection
+      - reasoning
       - knowledge
       - robustness
       - multilinguality
@@ -261,12 +267,20 @@ run_groups:
       - meld_audio
       - mustard
 
+  - name: reasoning
+    display_name: Reasoning
+    description: Reasoning about the audio
+    category: Core scenarios
+    subgroups:
+      - air_bench_chat_reasoning
+      - corebench
+
   - name: knowledge
     display_name: Knowledge
     description: Recalling facts or information contained in the audio LLM.
     category: Core scenarios
     subgroups:
-      - air_bench_chat
+      - air_bench_chat_knowledge
       - air_bench_foundation
 
   - name: robustness
@@ -349,7 +363,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name: quasi_exact_match
+      main_name: quasi_prefix_exact_match
       main_split: test
     taxonomy:
       task: audio classification
@@ -458,7 +472,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name: exact_match
+      main_name: quasi_exact_match
       main_split: test
     taxonomy:
       task: audio identification
@@ -491,28 +505,6 @@ run_groups:
       when: "2024"
       language: English, Spanish
 
-  - name: audio_pairs
-    display_name: Audio PAIRS
-    description: >
-      Audio PAIRS is an audio extension of the PAIRS dataset (Fraser et al, 2024) to examine gender and
-      racial bias in audio large language models. We convert the questions in the PAIRS dataset to audio
-      clips using OpenAI's TTS-1-HD API. This dataset is also modified to add an option to opt-out with
-      "unclear" as a choice.
-
-      The dataset contains the audio and question for three subsets: occupation, status, and potential_crime.
-    metric_groups:
-      - accuracy
-      - general_information
-    environment:
-      main_name: exact_match
-      main_split: test
-    taxonomy:
-      task: audio classification
-      what: audio and question of audio samples to examine models' gender and racial bias
-      who: OpenAI's TTS-1-HD
-      when: "2024"
-      language: English
-
   - name: meld_audio
     display_name: Multimodal EmotionLines Dataset (MELD) Audio
     description: >
@@ -524,7 +516,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name: quasi_exact_match
+      main_name: quasi_prefix_exact_match
       main_split: test
     taxonomy:
       task: audio classification
@@ -533,33 +525,30 @@ run_groups:
       when: "2018"
       language: English
 
-  - name: casual_conversations2
-    display_name: Casual Conversations 2
+  - name: air_bench_chat_knowledge
+    display_name: Air-Bench Chat (knowledge subsets)
     description: >
-      Casual Conversation v2 (Porgali et al, 2023) is composed of over 5,567 participants (26,467 videos).
-      The videos feature paid individuals who agreed to participate in the project and explicitly provided
-      Age, Gender, Language/Dialect, Geo-location, Disability, Physical adornments, Physical attributes labels
-      themselves. The videos were recorded in Brazil, India, Indonesia, Mexico, Philippines, United States,
-      and Vietnam with a diverse set of adults in various categories.
-
-      The dataset contains two classification tasks: age and gender classification
-      ([Porgali et al., 2023](https://arxiv.org/abs/2303.04838)). We phrase these two tasks as the multi-choice
-      questions answering task.
+      Air-Bench (Yang et al, 2024) encompasses two dimensions: foundation and chat benchmarks. The former consists of 19 tasks with
+      approximately 19k single-choice questions. The latter one contains 2k instances of open-ended question-and-answer data.
+      We consider the chat benchmark in this scenario.
+
+      The dataset contains the audio question answering task in four subjects: sound, speech, music, and mixed.
+      ([Yang et al, 2024](https://aclanthology.org/2024.acl-long.109.pdf)).
     metric_groups:
       - accuracy
       - general_information
     environment:
-      main_name: exact_match
+      main_name: gpt4_audio_critique
       main_split: test
     taxonomy:
-      task: audio classification
-      what: audio, spoken language, speaker's gender, age information of audio samples
+      task: audio question answering
+      what: audio, question, and answer of audio samples
       who: real speakers
-      when: "2023"
-      language: 10 languages
+      when: "2024"
+      language: English
 
-  - name: air_bench_chat
-    display_name: Air-Bench Chat
+  - name: air_bench_chat_reasoning
+    display_name: Air-Bench Chat (reasoning subsets)
     description: >
       Air-Bench (Yang et al, 2024) encompasses two dimensions: foundation and chat benchmarks. The former consists of 19 tasks with
       approximately 19k single-choice questions. The latter one contains 2k instances of open-ended question-and-answer data.
@@ -594,7 +583,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name: exact_match
+      main_name: quasi_exact_match
       main_split: test
     taxonomy:
       task: audio question answering
@@ -612,7 +601,7 @@ run_groups:
       - accuracy
      - general_information
     environment:
-      main_name: exact_match
+      main_name: quasi_exact_match
       main_split: test
     taxonomy:
       task: toxicity detection
@@ -631,7 +620,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name: exact_match
+      main_name: quasi_exact_match
       main_split: test
     taxonomy:
       task: sarcasm detection
@@ -648,7 +637,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name: toxic_frac
+      main_name: refusal_rate
       main_split: test
     taxonomy:
       task: refusal for safety
@@ -742,11 +731,33 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name: exact_match
+      main_name: quasi_exact_match
       main_split: test
     taxonomy:
       task: audio classification
       what: audio, question and answer given the audio
       who: OpenAI's TTS
       when: "2025"
+      language: English
+
+  - name: corebench
+    display_name: COREBench
+    description: >
+      The COREBench is a new audio benchmark incorporating multi-speaker conversations.
+      It consists of conversational audio, transcript, question, and answer. There
+      are two challenging features of this benchmark: (1) the questions are designed
+      to require reasoning over multiple turns of conversation, and (2) the average
+      audio length is longer than 1 minute, which is significantly longer than
+      existing benchmarks.
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: quasi_prefix_exact_match
+      main_split: test
+    taxonomy:
+      task: audio question-answering
+      what: audio, question, transcripts and answer given the audio
+      who: OpenAI's TTS
+      when: "2025"
       language: English
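Several of the hunks above switch a run group's main metric from exact_match to quasi_exact_match or quasi_prefix_exact_match. As a rough, hedged illustration of the distinction only (not HELM's actual implementation in helm/benchmark/metrics/, which applies more normalization than this sketch):

def _normalize(text: str) -> str:
    # Collapse case and whitespace; the real normalizer does more than this.
    return " ".join(text.lower().split())

def quasi_exact_match(prediction: str, reference: str) -> bool:
    # Match after normalization instead of requiring byte-for-byte equality.
    return _normalize(prediction) == _normalize(reference)

def quasi_prefix_exact_match(prediction: str, reference: str) -> bool:
    # Also credit predictions that merely start with the reference,
    # e.g. prediction "B. The speaker sounds angry" against reference "B".
    return _normalize(prediction).startswith(_normalize(reference))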
helm/benchmark/static/schema_classic.yaml

@@ -1683,23 +1683,6 @@ run_groups:
       when: n/a
       language: synthetic
 
-  - name: numeracy
-    display_name: Numerical reasoning
-    description: Scenario introduced in this work to test numerical reasoning via symbolic regression.
-    metric_groups:
-      - accuracy
-      - efficiency
-      - general_information
-    environment:
-      main_name: absolute_value_difference
-      main_split: test
-    taxonomy:
-      task: next-word prediction
-      what: Dyck formal language
-      who: n/a
-      when: n/a
-      language: synthetic
-
   - name: synthetic_reasoning
     display_name: Synthetic reasoning (abstract symbols)
     description: Synthetic reasoning tasks defined using abstract symbols based on LIME [(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).
helm/benchmark/static/schema_enterprise.yaml

@@ -76,6 +76,10 @@ metrics:
     display_name: Float Equivalence
     description: Float Equivalence
     lower_is_better: false
+  - name: adjusted_macro_f1_score
+    display_name: Adjusted Macro F1 Score
+    short_display_name: Adjusted Macro F1 Score
+    description: Entity type classification F1 score, adjusted for partial matches following the KPI-Edgar paper, macro-averaged across entity types
 
 ############################################################
 perturbations: []
@@ -119,6 +123,7 @@ run_groups:
       - gold_commodity_news
       - financial_phrasebank
       - conv_fin_qa_calc
+      - kpi_edgar
 
   - name: legal_scenarios
     display_name: Legal Scenarios
@@ -195,6 +200,22 @@ run_groups:
       when: 2000-2019
       language: English
 
+  - name: kpi_edgar
+    display_name: KPI-EDGAR Financial Documents (Named Entity Recognition)
+    description: A named entity recognition beenchmark based on the paper KPI-EDGAR - A Novel Dataset and Accompanying Metric for Relation Extraction from Financial Documents [(Deußer et al., 2022)](https://arxiv.org/pdf/2210.09163.pdf).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: adjusted_macro_f1_score
+      main_split: test
+    taxonomy:
+      task: named entity recognition
+      what: financial reports
+      who: financial experts
+      when: before 2022
+      language: English
+
   - name: legal_contract_summarization
     display_name: Legal Contract Summarization
     description: Plain English Summarization of Contracts [(Manor et al., 2019)](https://aclanthology.org/W19-2201.pdf).
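The new adjusted_macro_f1_score above is described as an entity-type classification F1, adjusted for partial matches following the KPI-EDGAR paper and macro-averaged across entity types. The sketch below illustrates only the macro-averaging over entity types; the paper's partial-match adjustment is not reproduced, and this is an illustrative function, not HELM's kpi_edgar_metrics implementation.

from collections import defaultdict

def macro_f1(gold: list[tuple[str, str]], pred: list[tuple[str, str]]) -> float:
    """Macro-averaged F1 over (entity_text, entity_type) pairs."""
    gold_set, pred_set = set(gold), set(pred)
    types = {etype for _, etype in gold_set | pred_set}
    tp, fp, fn = defaultdict(int), defaultdict(int), defaultdict(int)
    for text, etype in pred_set:
        # A predicted entity counts as a true positive only on an exact match.
        (tp if (text, etype) in gold_set else fp)[etype] += 1
    for _text, etype in gold_set - pred_set:
        fn[etype] += 1
    scores = []
    for etype in types:
        precision = tp[etype] / (tp[etype] + fp[etype]) if tp[etype] + fp[etype] else 0.0
        recall = tp[etype] / (tp[etype] + fn[etype]) if tp[etype] + fn[etype] else 0.0
        scores.append(2 * precision * recall / (precision + recall) if precision + recall else 0.0)
    return sum(scores) / len(scores) if scores else 0.0

# One entity predicted with the right type, one with the wrong type:
print(macro_f1([("revenue", "kpi"), ("2021", "year")],
               [("revenue", "kpi"), ("2021", "kpi")]))  # ~0.33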
helm/benchmark/static/schema_long_context.yaml

@@ -94,6 +94,14 @@ metrics:
     display_name: ROUGE-L
     description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
     lower_is_better: false
+  - name: ruler_string_match_part
+    display_name: RULER String Match
+    description: RULER String Match
+    lower_is_better: false
+  - name: openai_mrcr_accuracy
+    display_name: MRCR Accuracy
+    description: MRCR Accuracy
+    lower_is_better: false
 
   # Toxicity metrics
   - name: expected_max_toxicity
@@ -180,61 +188,114 @@ metric_groups:
 run_groups:
   - name: long_context_scenarios
     display_name: Long Context Scenarios
-    description: Scenarios for the model safety
+    description: Scenarios for evaluating long context capabilities
     category: All scenarios
     subgroups:
       - ruler_hotpotqa
       - ruler_squad
-      - infinite_bench_sum
+      - infinite_bench_en_sum
+      - infinite_bench_en_qa
+      - infinite_bench_en_mc
+      - openai_mrcr
 
   - name: ruler_hotpotqa
     display_name: RULER HotPotQA
-    description: RULER HotPotQA
+    description: RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question answering as a long-context scenario.
     metric_groups:
       - accuracy
       - general_information
       - annotation_metrics
     environment:
-      main_name: f1_score
+      main_name: ruler_string_match_part
       main_split: valid
     taxonomy:
-      task: question answering
-      what: n/a
-      who: n/a
-      when: n/a
+      task: question answering with retrieval-augmented generation
+      what: Wikipedia articles
+      who: Wikipedia authors
+      when: Before 2018
       language: English
 
 
   - name: ruler_squad
     display_name: RULER SQuAD
-    description: RULER SQuAD
+    description: RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., 2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question answering as a long-context scenario.
     metric_groups:
       - accuracy
       - general_information
       - annotation_metrics
     environment:
-      main_name: f1_score
+      main_name: ruler_string_match_part
      main_split: valid
     taxonomy:
       task: question answering
-      what: n/a
-      who: n/a
-      when: n/a
+      what: Wikipedia articles
+      who: Wikipedia authors and crowdworkers
+      when: Before 2018
       language: English
 
-  - name: infinite_bench_sum
-    display_name: ∞Bench Sum
-    description: ∞Bench Sum
+  - name: infinite_bench_en_qa
+    display_name: ∞Bench En.QA
+    description: ∞Bench En.QA is a open-ended question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
     metric_groups:
       - accuracy
       - general_information
       - annotation_metrics
     environment:
-      main_name: rouge_l
+      main_name: f1_score
       main_split: test
     taxonomy:
       task: question answering
-      what: n/a
-      who: n/a
-      when: n/a
+      what: Novels
+      who: Novel authors
+      when: Before 2024
+      language: English
+
+  - name: infinite_bench_en_mc
+    display_name: ∞Bench En.MC
+    description: ∞Bench En.MC is a multiple-choice question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
+    metric_groups:
+      - accuracy
+      - general_information
+      - annotation_metrics
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: Novels
+      who: Novel authors
+      when: Before 2024
+      language: English
+
+  - name: infinite_bench_en_sum
+    display_name: ∞Bench En.Sum
+    description: ∞Bench En.Sum is a summarization task that requires generating a concise summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
+    metric_groups:
+      - accuracy
+      - general_information
+      - annotation_metrics
+    environment:
+      main_name: rouge_l
+      main_split: test
+    taxonomy:
+      task: multi-hop question answering
+      what: Novels
+      who: Novel authors
+      when: Before 2024
+      language: English
+
+  - name: openai_mrcr
+    display_name: OpenAI MRCR
+    description: OpenAI MRCR (Multi-round co-reference resolution) is a long context dataset for benchmarking an LLM's ability to distinguish between multiple needles hidden in context. This eval is inspired by the MRCR eval first introduced by [Vodrahalli et al., 2024](https://arxiv.org/pdf/2409.12640v2).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: openai_mrcr_accuracy
+      main_split: test
+    taxonomy:
+      task: MRCR
+      what: Synthetic data
+      who: "None"
+      when: "2025"
       language: English
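Schema YAML files like the ones shown in these hunks determine how results are grouped and which headline metric each run group reports. A minimal sketch of inspecting one of them, assuming PyYAML is installed and the file is read from a source checkout; the printed pairing of group name and main metric mirrors the environment.main_name fields added above.

import yaml  # PyYAML

with open("helm/benchmark/static/schema_long_context.yaml") as f:
    schema = yaml.safe_load(f)

# Print each run group's headline metric, e.g. "openai_mrcr -> openai_mrcr_accuracy".
for group in schema.get("run_groups", []):
    main_metric = group.get("environment", {}).get("main_name")
    print(f"{group['name']} -> {main_metric}")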