crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (311)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/classification_metrics.py +19 -1
  27. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  28. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  29. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  30. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  31. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  32. helm/benchmark/metrics/comet_metric.py +1 -1
  33. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  34. helm/benchmark/metrics/copyright_metrics.py +1 -1
  35. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  36. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  37. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  38. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  39. helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
  40. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  41. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  42. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  43. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  44. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  45. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  46. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  47. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  48. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  49. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  50. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  51. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  52. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  53. helm/benchmark/metrics/medec_metrics.py +25 -2
  54. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  55. helm/benchmark/metrics/metric.py +25 -0
  56. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  57. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  58. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  59. helm/benchmark/metrics/summac/model_summac.py +3 -3
  60. helm/benchmark/metrics/summarization_metrics.py +129 -1
  61. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  62. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  63. helm/benchmark/model_deployment_registry.py +11 -19
  64. helm/benchmark/presentation/create_plots.py +11 -2
  65. helm/benchmark/presentation/schema.py +10 -22
  66. helm/benchmark/presentation/summarize.py +189 -14
  67. helm/benchmark/presentation/taxonomy_info.py +20 -0
  68. helm/benchmark/presentation/test_create_plots.py +4 -1
  69. helm/benchmark/run.py +7 -1
  70. helm/benchmark/run_expander.py +4 -0
  71. helm/benchmark/run_specs/arabic_run_specs.py +191 -0
  72. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  73. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  74. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  75. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  76. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  77. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  78. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  79. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  80. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  81. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  82. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  83. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  84. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  85. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  86. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  87. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  88. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  89. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  90. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  91. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  92. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  93. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  94. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  95. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  96. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  97. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  98. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  99. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  100. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  101. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  102. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  103. helm/benchmark/scenarios/bold_scenario.py +15 -0
  104. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  105. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  106. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  107. helm/benchmark/scenarios/clear_scenario.py +23 -0
  108. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  109. helm/benchmark/scenarios/code_scenario.py +28 -0
  110. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  111. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  112. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  113. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  114. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  115. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  116. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  117. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  118. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  119. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  120. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  121. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  122. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  123. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  124. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  125. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  126. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  127. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  128. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  129. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  130. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  131. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  132. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  133. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  134. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  135. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  136. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  137. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  138. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  139. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  140. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  141. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  142. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  143. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  144. helm/benchmark/scenarios/ice_scenario.py +21 -1
  145. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  146. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  147. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  148. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  149. helm/benchmark/scenarios/koala_scenario.py +21 -1
  150. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  151. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  152. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  153. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  154. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  155. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  156. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  157. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  158. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  159. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  160. helm/benchmark/scenarios/math_scenario.py +47 -20
  161. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  162. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  163. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  164. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  165. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  166. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  167. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  168. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  169. helm/benchmark/scenarios/medec_scenario.py +23 -0
  170. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  171. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  172. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  173. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  174. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  175. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  176. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  177. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  178. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  179. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  180. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  181. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  182. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  183. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  184. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  185. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  186. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  187. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  188. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  189. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  190. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  191. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  192. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  193. helm/benchmark/scenarios/quac_scenario.py +14 -0
  194. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  195. helm/benchmark/scenarios/raft_scenario.py +15 -0
  196. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  197. helm/benchmark/scenarios/scenario.py +31 -0
  198. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  199. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  200. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  201. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  202. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  203. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  204. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  205. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  206. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  207. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  208. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  209. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  210. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  211. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  212. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  213. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  214. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  215. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  216. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  217. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  218. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  219. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  220. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  221. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  222. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  223. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  224. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  225. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  226. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  227. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  228. helm/benchmark/slurm_jobs.py +1 -2
  229. helm/benchmark/slurm_runner.py +8 -1
  230. helm/benchmark/static/schema_arabic.yaml +271 -0
  231. helm/benchmark/static/schema_classic.yaml +0 -17
  232. helm/benchmark/static/schema_long_context.yaml +24 -6
  233. helm/benchmark/static/schema_medhelm.yaml +36 -0
  234. helm/benchmark/static/schema_slp.yaml +219 -0
  235. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  236. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  237. helm/benchmark/static_build/index.html +2 -2
  238. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  239. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  240. helm/clients/audio_language/llama_omni/constants.py +9 -0
  241. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  242. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  243. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  244. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  245. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  246. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  247. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  248. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  249. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  250. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  251. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  252. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  253. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  254. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  255. helm/clients/audio_language/llama_omni/utils.py +202 -0
  256. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  257. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  258. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  259. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  260. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  261. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  262. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  263. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  264. helm/clients/huggingface_client.py +2 -2
  265. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  266. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  267. helm/clients/openai_client.py +33 -20
  268. helm/clients/openai_responses_client.py +34 -8
  269. helm/clients/openrouter_client.py +31 -0
  270. helm/clients/test_huggingface_client.py +3 -3
  271. helm/clients/test_openrouter_client.py +69 -0
  272. helm/clients/together_client.py +48 -13
  273. helm/clients/vertexai_client.py +19 -11
  274. helm/clients/vllm_client.py +43 -7
  275. helm/clients/vllm_granite_thinking_client.py +56 -0
  276. helm/common/critique_request.py +0 -1
  277. helm/common/hierarchical_logger.py +83 -34
  278. helm/common/object_spec.py +23 -8
  279. helm/common/test_logging.py +94 -0
  280. helm/config/model_deployments.yaml +525 -172
  281. helm/config/model_metadata.yaml +185 -10
  282. helm/config/tokenizer_configs.yaml +100 -2
  283. helm/proxy/cli.py +1 -1
  284. helm/proxy/example_queries.py +8 -8
  285. helm/proxy/retry.py +5 -0
  286. helm/proxy/server.py +2 -1
  287. helm/proxy/static/index.css +4 -0
  288. helm/proxy/static/index.js +7 -1
  289. helm/tokenizers/grok_tokenizer.py +2 -0
  290. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  291. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  292. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  293. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  294. helm/benchmark/metrics/medalign_metrics.py +0 -14
  295. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  296. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  297. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  298. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  299. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  300. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  301. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  302. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  303. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  304. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  305. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  306. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  307. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  308. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  309. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  310. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  311. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/test_healtha_br_scenario.py
@@ -0,0 +1,57 @@
+ import pytest
+ from tempfile import TemporaryDirectory
+
+ from helm.benchmark.scenarios.healthqa_br_scenario import HEALTHQA_BR_Scenario
+ from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
+
+
+ @pytest.mark.scenarios
+ def test_healthqa_br_instance():
+     scenario = HEALTHQA_BR_Scenario()
+     with TemporaryDirectory() as tmpdir:
+         instances = scenario.get_instances(tmpdir)
+
+         instance = instances[35]
+
+         assert instance.split == TEST_SPLIT
+
+         assert instance.input.text.startswith("Homem de 22 anos de idade procura a Unidade Básica")
+
+         assert instance.references == [
+             Reference(
+                 output=Output(
+                     text="administração de relaxante muscular, colocando o paciente em posição de Trendelenburg, com "
+                     "tentativa de redução do volume."
+                 ),
+                 tags=[],
+             ),
+             Reference(
+                 output=Output(
+                     text="encaminhamento do paciente ao Serviço de Urgência do Hospital com o pedido de avaliação "
+                     "imediata do cirurgião."
+                 ),
+                 tags=[CORRECT_TAG],
+             ),
+             Reference(
+                 output=Output(
+                     text="tentativa de redução manual do aumento de volume da região inguinescrotal para a cavidade "
+                     "abdominal."
+                 ),
+                 tags=[],
+             ),
+             Reference(
+                 output=Output(
+                     text="transiluminação do escroto para tentar diferenciar hérnia inguinal de hidrocele comunicante."
+                 ),
+                 tags=[],
+             ),
+             Reference(
+                 output=Output(text="prescrição de antiemético e solicitação de ecografia da região inguinescrotal."),
+                 tags=[],
+             ),
+         ]
+
+         correct_refs = [ref for ref in instance.references if CORRECT_TAG in ref.tags]
+         assert len(correct_refs) == 1
+
+         assert instance.references[1].is_correct
helm/benchmark/scenarios/the_pile_scenario.py
@@ -5,9 +5,10 @@ import sys
  import requests
  from typing import Dict, List

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.general import ensure_file_downloaded
  from helm.common.hierarchical_logger import hlog, htrack, htrack_block
- from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata


  class ThePileScenario(Scenario):
@@ -146,3 +147,14 @@ class ThePileScenario(Scenario):
          instances = [instances[i] for i in indices]

          return instances
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="the_pile",
+             display_name="The Pile",
+             description="The Pile corpus for measuring lanugage model performance across various "
+             "domains [(Gao et al., 2020)](https://arxiv.org/pdf/2101.00027.pdf).",
+             taxonomy=TaxonomyInfo(task="language modeling", what="?", when="?", who="?", language="English, code"),
+             main_metric="bits_per_byte",
+             main_split="test",
+         )
helm/benchmark/scenarios/truthful_qa_scenario.py
@@ -2,6 +2,7 @@ import csv
  import os
  from typing import List, Dict, Any

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.general import ensure_file_downloaded, ensure_directory_exists
  from helm.benchmark.scenarios.scenario import (
      Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
      CORRECT_TAG,
      Input,
      Output,
+     ScenarioMetadata,
  )


@@ -154,3 +156,15 @@ class TruthfulQAScenario(Scenario):
          valid_instances: List[Instance] = get_split_instances(VALID_SPLIT, data[split_k:])

          return train_instances + valid_instances
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="truthful_qa",
+             display_name="TruthfulQA",
+             description="The TruthfulQA benchmarking for measuring model truthfulness and commonsense "
+             "knowledge in question answering [(Lin et al., "
+             "2022)](https://aclanthology.org/2022.acl-long.229/).",
+             taxonomy=TaxonomyInfo(task="question answering", what="?", when="?", who="?", language="English"),
+             main_metric="exact_match",
+             main_split="valid",
+         )
helm/benchmark/scenarios/twitter_aae_scenario.py
@@ -2,9 +2,10 @@ import csv
  import os
  from typing import List

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.general import ensure_file_downloaded
  from helm.common.hierarchical_logger import hlog
- from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata

  CODALAB_URI_TEMPLATE: str = (
      "https://worksheets.codalab.org/rest/bundles/0x31485f8c37ad481fb9f4e9bf7ccff6e5/contents/blob/"
@@ -56,3 +57,21 @@ class TwitterAAEScenario(Scenario):
              instances.append(instance)

          return instances
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="twitter_aae",
+             display_name="TwitterAAE",
+             description="The TwitterAAE corpus of [Blodgett et al. "
+             "(2016)](https://aclanthology.org/D16-1120/) for measuring language model "
+             "performance in tweets as a function of speaker dialect.",
+             taxonomy=TaxonomyInfo(
+                 task="language modeling",
+                 what="?",
+                 when="?",
+                 who="?",
+                 language="English (AAE-aligned and White-aligned)",
+             ),
+             main_metric="bits_per_byte",
+             main_split="test",
+         )
helm/benchmark/scenarios/vicuna_scenario.py
@@ -2,8 +2,9 @@ import json
  import os
  from typing import List

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.general import ensure_file_downloaded
- from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata


  class VicunaScenario(Scenario):
@@ -47,3 +48,22 @@ class VicunaScenario(Scenario):
              )
              instances.append(instance)
          return instances
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="vicuna",
+             display_name="Vicuna",
+             short_display_name="Vicuna",
+             description="The set of prompts used by the "
+             "[Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate "
+             "instruction-following models.",
+             taxonomy=TaxonomyInfo(
+                 task="open-ended instruction following",
+                 what="Instructions for LLMs",
+                 when="Before 2023",
+                 who="Unknown",
+                 language="English",
+             ),
+             main_metric="Helpfulness",
+             main_split="test",
+         )
helm/benchmark/scenarios/wikifact_scenario.py
@@ -2,6 +2,7 @@ import os
  from typing import List, Dict
  import json

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.general import ensure_directory_exists, ensure_file_downloaded, flatten_list
  from helm.common.hierarchical_logger import hlog
  from helm.benchmark.scenarios.scenario import (
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
      CORRECT_TAG,
      Input,
      Output,
+     ScenarioMetadata,
  )

  PID_TO_NAME = {
@@ -183,3 +185,21 @@ class WIKIFactScenario(Scenario):
              instances.append(instance)

          return instances
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="wikifact",
+             display_name="WikiFact",
+             description="Scenario introduced in this work, inspired by [Petroni et al. "
+             "(2019)](https://aclanthology.org/D19-1250/), to more extensively test factual "
+             "knowledge.",
+             taxonomy=TaxonomyInfo(
+                 task="knowledge base completion",
+                 what="entity-relation-entity triples in natural language form",
+                 when="?",
+                 who="automatically generated from templates",
+                 language="structured English",
+             ),
+             main_metric="quasi_exact_match",
+             main_split="test",
+         )
helm/benchmark/scenarios/wildbench_scenario.py
@@ -2,11 +2,13 @@ import datasets
  import os
  from typing import List

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.benchmark.scenarios.scenario import (
      Scenario,
      Instance,
      TEST_SPLIT,
      Input,
+     ScenarioMetadata,
  )
  from helm.common.general import ensure_directory_exists

@@ -81,3 +83,19 @@ class WildBenchScenario(Scenario):
              instances.append(instance)

          return instances
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name=self.name,
+             display_name="WildBench",
+             description=self.description,
+             main_metric="wildbench_score_rescaled",
+             main_split="test",
+             taxonomy=TaxonomyInfo(
+                 task="instruction following",
+                 what="GPT-judged instruction following with instructions collected from real-user conversations",
+                 who="real-world users",
+                 when="2024",
+                 language="English",
+             ),
+         )
helm/benchmark/scenarios/wmt_14_scenario.py
@@ -1,5 +1,6 @@
  from typing import List, Any
  from datasets import load_dataset
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.hierarchical_logger import htrack_block
  from helm.benchmark.scenarios.scenario import (
      Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
      CORRECT_TAG,
      Input,
      Output,
+     ScenarioMetadata,
  )


@@ -106,3 +108,13 @@ class WMT14Scenario(Scenario):
                  )
              )
          return instances
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="wmt_14",
+             display_name="WMT 2014",
+             description="WMT 2014 is a collection of machine translation datasets.",
+             taxonomy=TaxonomyInfo(task="machine translation", what="n/a", when="n/a", who="n/a", language="English"),
+             main_metric="bleu_4",
+             main_split="test",
+         )
helm/benchmark/slurm_jobs.py
@@ -13,7 +13,6 @@ except ModuleNotFoundError as e:


  class SlurmJobState:
-     # TODO: Convert to StrEnum after upgrading to Python 3.11
      # Non-exhaustive list of Slurm job states.
      # See: https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES

@@ -81,7 +80,7 @@ def get_slurm_job_state(job_id: int) -> str:
      except subprocess.CalledProcessError as e:
          # Default CalledProcessError message doesn't have output, so re-raise here to include the output.
          raise Exception(f"{str(e)} output: {e.output}")
-     search_result = re.search("JobState=(\w+)", scontrol_output.decode())
+     search_result = re.search(r"JobState=(\w+)", scontrol_output.decode())
      if not search_result:
          raise Exception(f"Could not extract JobState from scontrol: {scontrol_output.decode()}")
      return search_result.group(1)
helm/benchmark/slurm_runner.py
@@ -26,7 +26,7 @@ from helm.benchmark.slurm_jobs import (
      FAILURE_SLURM_JOB_STATES,
  )
  from helm.common.general import ensure_directory_exists
- from helm.common.hierarchical_logger import hlog, htrack_block
+ from helm.common.hierarchical_logger import hlog, htrack_block, setup_default_logging

  from helm.benchmark.runner_config_registry import RUNNER_CONFIG

@@ -343,7 +343,14 @@ def main():
          help="Path to the RunSpec JSON file",
          required=True,
      )
+     parser.add_argument(
+         "--log-config",
+         type=str,
+         default=None,
+         help="PATH to a YAML file to customize logging",
+     )
      args = parser.parse_args()
+     setup_default_logging(args.log_config)

      # Deserialize SlurmRunner and RunSpec from the given files, then run the RunSpec with the SlurmRunner.
      with open(args.slurm_runner_spec_path, "r") as f:
helm/benchmark/static/schema_arabic.yaml
@@ -0,0 +1,271 @@
+ ---
+ # Schema for Arabic scenarios
+ ############################################################
+ metrics:
+   # Infrastructure metrics:
+   - name: num_perplexity_tokens
+     display_name: '# tokens'
+     description: Average number of tokens in the predicted output (for language modeling, the input too).
+   - name: num_bytes
+     display_name: '# bytes'
+     description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+   - name: num_references
+     display_name: '# ref'
+     description: Number of references.
+   - name: num_train_trials
+     display_name: '# trials'
+     description: Number of trials, where in each trial we choose an independent, random set of training instances.
+   - name: estimated_num_tokens_cost
+     display_name: 'cost'
+     description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+   - name: num_prompt_tokens
+     display_name: '# prompt tokens'
+     description: Number of tokens in the prompt.
+   - name: num_prompt_characters
+     display_name: '# prompt chars'
+     description: Number of characters in the prompt.
+   - name: num_completion_tokens
+     display_name: '# completion tokens'
+     description: Actual number of completion tokens (over all completions).
+   - name: num_output_tokens
+     display_name: '# output tokens'
+     description: Actual number of output tokens.
+   - name: max_num_output_tokens
+     display_name: 'Max output tokens'
+     description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+   - name: num_requests
+     display_name: '# requests'
+     description: Number of distinct API requests.
+   - name: num_instances
+     display_name: '# eval'
+     description: Number of evaluation instances.
+   - name: num_train_instances
+     display_name: '# train'
+     description: Number of training instances (e.g., in-context examples).
+   - name: prompt_truncated
+     display_name: truncated
+     description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+   - name: finish_reason_length
+     display_name: finish b/c length
+     description: Fraction of instances where the the output was terminated because of the max tokens limit.
+   - name: finish_reason_stop
+     display_name: finish b/c stop
+     description: Fraction of instances where the the output was terminated because of the stop sequences.
+   - name: finish_reason_endoftext
+     display_name: finish b/c endoftext
+     description: Fraction of instances where the the output was terminated because the end of text token was generated.
+   - name: finish_reason_unknown
+     display_name: finish b/c unknown
+     description: Fraction of instances where the the output was terminated for unknown reasons.
+   - name: num_completions
+     display_name: '# completions'
+     description: Number of completions.
+   - name: predicted_index
+     display_name: Predicted index
+     description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+   - name: inference_runtime
+     display_name: Observed inference runtime (s)
+     short_display_name: Observed inference time (s)
+     lower_is_better: true
+     description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment).
+
+   # Accuracy metrics:
+   - name: exact_match
+     display_name: Exact match
+     short_display_name: EM
+     description: Fraction of instances that the predicted output matches a correct reference exactly.
+     lower_is_better: false
+   - name: quasi_exact_match
+     display_name: Quasi-exact match
+     short_display_name: EM
+     description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+     lower_is_better: false
+   - name: prefix_exact_match
+     display_name: Prefix exact match
+     short_display_name: PEM
+     description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+     lower_is_better: false
+   - name: quasi_prefix_exact_match
+     # TODO: should call this prefix_quasi_exact_match
+     display_name: Prefix quasi-exact match
+     short_display_name: PEM
+     description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+     lower_is_better: false
+   - name: alrage_score
+     # TODO: should call this prefix_quasi_exact_match
+     display_name: ALRAGE Score
+     short_display_name: Score
+     description: Score of the output judged by GPT-4o.
+     lower_is_better: false
+
+ ############################################################
+ perturbations: []
+
+ ############################################################
+ metric_groups:
+   - name: accuracy
+     display_name: Accuracy
+     aggregation_strategies:
+       - mean
+     metrics:
+       - name: ${main_name}
+         split: ${main_split}
+
+   - name: efficiency
+     display_name: Efficiency
+     aggregation_strategies:
+       - mean
+     metrics:
+       - name: inference_runtime
+         split: ${main_split}
+
+   - name: general_information
+     display_name: General information
+     hide_win_rates: true
+     metrics:
+       - name: num_instances
+         split: ${main_split}
+       - name: num_train_instances
+         split: ${main_split}
+       - name: prompt_truncated
+         split: ${main_split}
+       - name: num_prompt_tokens
+         split: ${main_split}
+       - name: num_output_tokens
+         split: ${main_split}
+
+ ############################################################
+ run_groups:
+   - name: arabic_scenarios
+     display_name: Arabic Scenarios
+     description: Arabic Scenarios
+     category: Scenarios
+     subgroups:
+       - alghafa
+       - arabic_mmlu
+       - arabic_exams
+       - madinah_qa
+       - aratrust
+       - alrage
+       - mbzuai_human_translated_arabic_mmlu
+
+   - name: mbzuai_human_translated_arabic_mmlu
+     display_name: MBZUAI Human-Translated Arabic MMLU
+     short_display_name: Translated MMLU
+     description: A translation from MBZUAI by human translators of the Massive Multitask Language Understanding benchmark.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: multiple-choice question answering
+       what: math, science, history, etc.
+       who: various online sources
+       when: before 2021
+       language: Arabic
+
+   - name: arabic_mmlu
+     display_name: ArabicMMLU
+     description: ArabicMMLU
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: "question answering"
+       what: "academic questions across various disciplines"
+       who: "academic exams writers and takers"
+       when: "before 2024"
+       language: Arabic
+
+   - name: alghafa
+     display_name: AlGhafa
+     description: AlGhafa
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: "multiple choice question answering"
+       what: Various
+       who: Various
+       when: "before 2023"
+       language: Arabic
+
+   - name: arabic_exams
+     display_name: Arabic EXAMS
+     description: Arabic EXAMS
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: "multiple choice question answering"
+       what: High school examinations
+       who: High school examinations writers and test-takers
+       when: before 2020
+       language: Arabic
+
+   - name: aratrust
+     display_name: AraTrust
+     description: AraTrust
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: "question answering"
+       what: "academic questions across various disciplines"
+       who: "academic exams writers and takers"
+       when: "before 2024"
+       language: Arabic
+
+   - name: alrage
+     display_name: ALRAGE
+     description: ALRAGE
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: alrage_score
+       main_split: test
+     taxonomy:
+       task: "openbook (RAG) open-ended question answering"
+       what: "?"
+       who: "?"
+       when: "?"
+       language: Arabic
+
+   - name: madinah_qa
+     display_name: MadinahQA
+     description: Arabic language competency benchmark
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: "question answering"
+       what: "academic questions about Arabic language"
+       who: "academic exams writers and takers"
+       when: "before 2024"
+       language: Arabic
helm/benchmark/static/schema_classic.yaml
@@ -1683,23 +1683,6 @@ run_groups:
        when: n/a
        language: synthetic

-   - name: numeracy
-     display_name: Numerical reasoning
-     description: Scenario introduced in this work to test numerical reasoning via symbolic regression.
-     metric_groups:
-       - accuracy
-       - efficiency
-       - general_information
-     environment:
-       main_name: absolute_value_difference
-       main_split: test
-     taxonomy:
-       task: next-word prediction
-       what: Dyck formal language
-       who: n/a
-       when: n/a
-       language: synthetic
-
    - name: synthetic_reasoning
      display_name: Synthetic reasoning (abstract symbols)
      description: Synthetic reasoning tasks defined using abstract symbols based on LIME [(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).
helm/benchmark/static/schema_long_context.yaml
@@ -194,7 +194,8 @@ run_groups:
        - ruler_hotpotqa
        - ruler_squad
        - infinite_bench_en_sum
-       - infinite_bench_en_qa
+       # - infinite_bench_en_qa
+       - infinite_bench_en_mc
        - openai_mrcr

    - name: ruler_hotpotqa
@@ -232,18 +233,35 @@ run_groups:
        when: Before 2018
        language: English

-   - name: infinite_bench_en_qa
-     display_name: ∞Bench En.QA
-     description: ∞Bench En.QA is a question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
+   # - name: infinite_bench_en_qa
+   #   display_name: ∞Bench En.QA
+   #   description: ∞Bench En.QA is a open-ended question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
+   #   metric_groups:
+   #     - accuracy
+   #     - general_information
+   #     - annotation_metrics
+   #   environment:
+   #     main_name: f1_score
+   #     main_split: test
+   #   taxonomy:
+   #     task: question answering
+   #     what: Novels
+   #     who: Novel authors
+   #     when: Before 2024
+   #     language: English
+
+   - name: infinite_bench_en_mc
+     display_name: ∞Bench En.MC
+     description: ∞Bench En.MC is a multiple-choice question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
      metric_groups:
        - accuracy
        - general_information
        - annotation_metrics
      environment:
-       main_name: f1_score
+       main_name: exact_match
        main_split: test
      taxonomy:
-       task: question answering
+       task: multiple-choice question answering
        what: Novels
        who: Novel authors
        when: Before 2024