crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff shows the changes between these publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +134 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +26 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +17 -17
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +31 -19
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -11
- helm/clients/vertexai_client.py +8 -2
- helm/config/model_deployments.yaml +75 -1
- helm/config/model_metadata.yaml +70 -2
- helm/config/tokenizer_configs.yaml +19 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/wildbench_scenario.py

@@ -2,11 +2,13 @@ import datasets
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
     TEST_SPLIT,
     Input,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -81,3 +83,19 @@ class WildBenchScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="WildBench",
+            description=self.description,
+            main_metric="wildbench_score_rescaled",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="instruction following",
+                what="GPT-judged instruction following with instructions collected from real-user conversations",
+                who="real-world users",
+                when="2024",
+                language="English",
+            ),
+        )
helm/benchmark/scenarios/wmt_14_scenario.py

@@ -1,5 +1,6 @@
 from typing import List, Any
 from datasets import load_dataset
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import htrack_block
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -106,3 +108,13 @@ class WMT14Scenario(Scenario):
                 )
             )
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="wmt_14",
+            display_name="WMT 2014",
+            description="WMT 2014 is a collection of machine translation datasets.",
+            taxonomy=TaxonomyInfo(task="machine translation", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="bleu_4",
+            main_split="test",
+        )
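Both scenario hunks above follow the same pattern: the module imports TaxonomyInfo and ScenarioMetadata, and the scenario gains a get_metadata() method that reports its display name, main metric, main split, and taxonomy. The following is a minimal sketch of that pattern for a hypothetical scenario; the class and its field values are illustrative only, and the keyword arguments are assumed to match the ScenarioMetadata and TaxonomyInfo constructors exactly as they appear in the hunks.

# Minimal sketch of the new metadata hook, modeled on the hunks above.
# "MyToyScenario" and its field values are hypothetical; only the import paths
# and keyword arguments come from the diff itself.
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import Scenario, ScenarioMetadata


class MyToyScenario(Scenario):
    name = "my_toy_scenario"
    description = "Hypothetical scenario used only to illustrate the pattern."
    tags = ["toy"]

    def get_instances(self, output_path: str):
        # A real scenario would build and return its evaluation instances here.
        return []

    def get_metadata(self) -> ScenarioMetadata:
        # Same shape as WildBenchScenario.get_metadata / WMT14Scenario.get_metadata.
        return ScenarioMetadata(
            name=self.name,
            display_name="My Toy Scenario",
            description=self.description,
            main_metric="exact_match",
            main_split="test",
            taxonomy=TaxonomyInfo(
                task="question answering",
                what="n/a",
                who="n/a",
                when="n/a",
                language="English",
            ),
        )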
helm/benchmark/static/schema_arabic.yaml

@@ -92,6 +92,12 @@ metrics:
     short_display_name: PEM
     description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
     lower_is_better: false
+  - name: alrage_score
+    # TODO: should call this prefix_quasi_exact_match
+    display_name: ALRAGE Score
+    short_display_name: Score
+    description: Score of the output judged by GPT-4o.
+    lower_is_better: false
 
 ############################################################
 perturbations: []
@@ -134,17 +140,20 @@ run_groups:
   - name: arabic_scenarios
     display_name: Arabic Scenarios
     description: Arabic Scenarios
-    category:
+    category: Scenarios
     subgroups:
-      - mmmlu
-      - arabic_mmlu
       - alghafa
-      -
+      - arabic_mmlu
+      - arabic_exams
+      - madinah_qa
       - aratrust
+      - alrage
+      - mbzuai_human_translated_arabic_mmlu
 
-  - name:
-    display_name:
-
+  - name: mbzuai_human_translated_arabic_mmlu
+    display_name: MBZUAI Human-Translated Arabic MMLU
+    short_display_name: Translated MMLU
+    description: A translation from MBZUAI by human translators of the Massive Multitask Language Understanding benchmark.
     metric_groups:
       - accuracy
       - efficiency
@@ -160,8 +169,8 @@ run_groups:
       language: Arabic
 
   - name: arabic_mmlu
-    display_name:
-    description:
+    display_name: ArabicMMLU
+    description: ArabicMMLU
     metric_groups:
       - accuracy
       - efficiency
@@ -193,9 +202,9 @@ run_groups:
       when: "before 2023"
      language: Arabic
 
-  - name:
-    display_name: EXAMS
-    description: EXAMS
+  - name: arabic_exams
+    display_name: Arabic EXAMS
+    description: Arabic EXAMS
     metric_groups:
       - accuracy
       - efficiency
@@ -226,3 +235,37 @@ run_groups:
       who: "academic exams writers and takers"
       when: "before 2024"
       language: Arabic
+
+  - name: alrage
+    display_name: ALRAGE
+    description: ALRAGE
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: alrage_score
+      main_split: test
+    taxonomy:
+      task: "openbook (RAG) open-ended question answering"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: Arabic
+
+  - name: madinah_qa
+    display_name: MadinahQA
+    description: Arabic language competency benchmark
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "question answering"
+      what: "academic questions about Arabic language"
+      who: "academic exams writers and takers"
+      when: "before 2024"
+      language: Arabic
helm/benchmark/static/schema_long_context.yaml

@@ -194,7 +194,7 @@ run_groups:
       - ruler_hotpotqa
       - ruler_squad
       - infinite_bench_en_sum
-      - infinite_bench_en_qa
+      # - infinite_bench_en_qa
       - infinite_bench_en_mc
       - openai_mrcr
 
@@ -233,22 +233,22 @@ run_groups:
       when: Before 2018
       language: English
 
-  - name: infinite_bench_en_qa
-    display_name: ∞Bench En.QA
-    description: ∞Bench En.QA is a open-ended question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
-    metric_groups:
-      - accuracy
-      - general_information
-      - annotation_metrics
-    environment:
-      main_name: f1_score
-      main_split: test
-    taxonomy:
-      task: question answering
-      what: Novels
-      who: Novel authors
-      when: Before 2024
-      language: English
+  # - name: infinite_bench_en_qa
+  #   display_name: ∞Bench En.QA
+  #   description: ∞Bench En.QA is a open-ended question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
+  #   metric_groups:
+  #     - accuracy
+  #     - general_information
+  #     - annotation_metrics
+  #   environment:
+  #     main_name: f1_score
+  #     main_split: test
+  #   taxonomy:
+  #     task: question answering
+  #     what: Novels
+  #     who: Novel authors
+  #     when: Before 2024
+  #     language: English
 
   - name: infinite_bench_en_mc
     display_name: ∞Bench En.MC
helm/benchmark/static/schema_medhelm.yaml

@@ -484,6 +484,8 @@ run_groups:
       - ehrshot
       - head_qa
       - medbullets
+      - med_qa
+      - med_mcqa
       - medalign
       - shc_ptbm_med
       - shc_sei_med
@@ -657,6 +659,40 @@ run_groups:
       when: Any
       language: English
 
+  - name: med_qa
+    display_name: MedQA
+    description: MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
+  - name: med_mcqa
+    display_name: MedMCQA
+    description: MedMCQA is a "multiple-choice question answering (MCQA) dataset designed to address real-world medical entrance exam questions ([Flores et al. 2020](https://arxiv.org/abs/2203.14371)).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
   - name: medalign
     display_name: MedAlign
     short_display_name: MedAlign
helm/benchmark/static/schema_slp.yaml

@@ -0,0 +1,219 @@
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: classification_macro_f1
+    display_name: Macro-F1
+    description: Population-level macro-averaged F1 score.
+    lower_is_better: false
+  - name: classification_micro_f1
+    display_name: Micro-F1
+    description: Population-level micro-averaged F1 score.
+    lower_is_better: false
+  - name: wer_score
+    display_name: Word Error Rate
+    description: Transcription error rate.
+    lower_is_better: true
+  - name: mer_score
+    display_name: Character Error Rate
+    description: Character error rate.
+    lower_is_better: true
+
+############################################################
+perturbations: []
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    hide_win_rates: true
+    metrics:
+      - name: exact_match
+        split: ${main_split}
+      - name: classification_macro_f1
+        split: ${main_split}
+      - name: classification_micro_f1
+        split: ${main_split}
+
+  - name: transcription_accuracy
+    display_name: Transcription Accuracy
+    hide_win_rates: true
+    metrics:
+      - name: wer_score
+        split: ${main_split}
+      - name: mer_score
+        split: ${main_split}
+
+  - name: efficiency
+    display_name: Efficiency
+    metrics:
+      - name: inference_runtime
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    hide_win_rates: true
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+############################################################
+
+run_groups:
+  - name: slp
+    display_name: SLP Scenarios
+    description: SLP-language scenarios
+    category: All scenarios
+    subgroups:
+      - disorder_diagnosis
+      - transcription
+      - symptom_diagnosis
+      - disorder_type_diagnosis
+
+
+  - name: disorder_diagnosis
+    display_name: Disorder Diagnosis Accuracy
+    description: >
+      Macro-averaged accuracy on disorder diagnosis for pediatric speech disorder.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: classification_micro_f1
+      main_split: test
+    taxonomy:
+      task: classification
+      what: n/a
+      who: n/a
+      when: "?"
+      language: English
+
+  - name: transcription
+    display_name: Transcription Accuracy
+    description: >
+      Model transcription accuracy on understanding disordered pediatric speech
+    metric_groups:
+      - transcription_accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: wer_score
+      main_split: test
+    taxonomy:
+      task: transcription
+      what: disordered pediatric speech
+      who: n/a
+      when: "?"
+      language: English
+
+  - name: symptom_diagnosis
+    display_name: Symptom Diagnosis Accuracy
+    description: >
+      Macro-averaged accuracy on symptom diagnosis for pediatric speech disorder.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: classification_micro_f1
+      main_split: test
+    taxonomy:
+      task: classification
+      what: n/a
+      who: n/a
+      when: "?"
+      language: English
+
+  - name: disorder_type_diagnosis
+    display_name: Disorder Type Diagnosis Accuracy
+    description: >
+      Macro-averaged accuracy on disorder type diagnosis for pediatric speech disorder.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: classification_micro_f1
+      main_split: test
+    taxonomy:
+      task: classification
+      what: n/a
+      who: n/a
+      when: "?"
+      language: English