crfm-helm 0.5.6 → 0.5.10 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0

helm/benchmark/static/schema_arabic.yaml
@@ -0,0 +1,271 @@
+ ---
+ # Schema for Arabic scenarios
+ ############################################################
+ metrics:
+   # Infrastructure metrics:
+   - name: num_perplexity_tokens
+     display_name: '# tokens'
+     description: Average number of tokens in the predicted output (for language modeling, the input too).
+   - name: num_bytes
+     display_name: '# bytes'
+     description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+   - name: num_references
+     display_name: '# ref'
+     description: Number of references.
+   - name: num_train_trials
+     display_name: '# trials'
+     description: Number of trials, where in each trial we choose an independent, random set of training instances.
+   - name: estimated_num_tokens_cost
+     display_name: 'cost'
+     description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+   - name: num_prompt_tokens
+     display_name: '# prompt tokens'
+     description: Number of tokens in the prompt.
+   - name: num_prompt_characters
+     display_name: '# prompt chars'
+     description: Number of characters in the prompt.
+   - name: num_completion_tokens
+     display_name: '# completion tokens'
+     description: Actual number of completion tokens (over all completions).
+   - name: num_output_tokens
+     display_name: '# output tokens'
+     description: Actual number of output tokens.
+   - name: max_num_output_tokens
+     display_name: 'Max output tokens'
+     description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+   - name: num_requests
+     display_name: '# requests'
+     description: Number of distinct API requests.
+   - name: num_instances
+     display_name: '# eval'
+     description: Number of evaluation instances.
+   - name: num_train_instances
+     display_name: '# train'
+     description: Number of training instances (e.g., in-context examples).
+   - name: prompt_truncated
+     display_name: truncated
+     description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+   - name: finish_reason_length
+     display_name: finish b/c length
+     description: Fraction of instances where the output was terminated because of the max tokens limit.
+   - name: finish_reason_stop
+     display_name: finish b/c stop
+     description: Fraction of instances where the output was terminated because of the stop sequences.
+   - name: finish_reason_endoftext
+     display_name: finish b/c endoftext
+     description: Fraction of instances where the output was terminated because the end of text token was generated.
+   - name: finish_reason_unknown
+     display_name: finish b/c unknown
+     description: Fraction of instances where the output was terminated for unknown reasons.
+   - name: num_completions
+     display_name: '# completions'
+     description: Number of completions.
+   - name: predicted_index
+     display_name: Predicted index
+     description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+   - name: inference_runtime
+     display_name: Observed inference runtime (s)
+     short_display_name: Observed inference time (s)
+     lower_is_better: true
+     description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment).
+
+   # Accuracy metrics:
+   - name: exact_match
+     display_name: Exact match
+     short_display_name: EM
+     description: Fraction of instances that the predicted output matches a correct reference exactly.
+     lower_is_better: false
+   - name: quasi_exact_match
+     display_name: Quasi-exact match
+     short_display_name: EM
+     description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+     lower_is_better: false
+   - name: prefix_exact_match
+     display_name: Prefix exact match
+     short_display_name: PEM
+     description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+     lower_is_better: false
+   - name: quasi_prefix_exact_match
+     # TODO: should call this prefix_quasi_exact_match
+     display_name: Prefix quasi-exact match
+     short_display_name: PEM
+     description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+     lower_is_better: false
+   - name: alrage_score
+     # TODO: should call this prefix_quasi_exact_match
+     display_name: ALRAGE Score
+     short_display_name: Score
+     description: Score of the output judged by GPT-4o.
+     lower_is_better: false
+
+ ############################################################
+ perturbations: []
+
+ ############################################################
+ metric_groups:
+   - name: accuracy
+     display_name: Accuracy
+     aggregation_strategies:
+       - mean
+     metrics:
+       - name: ${main_name}
+         split: ${main_split}
+
+   - name: efficiency
+     display_name: Efficiency
+     aggregation_strategies:
+       - mean
+     metrics:
+       - name: inference_runtime
+         split: ${main_split}
+
+   - name: general_information
+     display_name: General information
+     hide_win_rates: true
+     metrics:
+       - name: num_instances
+         split: ${main_split}
+       - name: num_train_instances
+         split: ${main_split}
+       - name: prompt_truncated
+         split: ${main_split}
+       - name: num_prompt_tokens
+         split: ${main_split}
+       - name: num_output_tokens
+         split: ${main_split}
+
+ ############################################################
+ run_groups:
+   - name: arabic_scenarios
+     display_name: Arabic Scenarios
+     description: Scenarios for evaluating Arabic-language capabilities.
+     category: Scenarios
+     subgroups:
+       - alghafa
+       - arabic_mmlu
+       - arabic_exams
+       - madinah_qa
+       - aratrust
+       - alrage
+       - mbzuai_human_translated_arabic_mmlu
+
+   - name: mbzuai_human_translated_arabic_mmlu
+     display_name: MBZUAI Human-Translated Arabic MMLU
+     short_display_name: Translated MMLU
+     description: A human translation by MBZUAI of the Massive Multitask Language Understanding (MMLU) benchmark.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: multiple-choice question answering
+       what: math, science, history, etc.
+       who: various online sources
+       when: before 2021
+       language: Arabic
+
+   - name: arabic_mmlu
+     display_name: ArabicMMLU
+     description: ArabicMMLU, a multiple-choice question answering benchmark built from Arabic academic exams.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: "question answering"
+       what: "academic questions across various disciplines"
+       who: "academic exams writers and takers"
+       when: "before 2024"
+       language: Arabic
+
+   - name: alghafa
+     display_name: AlGhafa
+     description: AlGhafa, a suite of Arabic multiple-choice question answering tasks.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: "multiple choice question answering"
+       what: Various
+       who: Various
+       when: "before 2023"
+       language: Arabic
+
+   - name: arabic_exams
+     display_name: Arabic EXAMS
+     description: The Arabic subset of the EXAMS benchmark of high school examination questions.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: "multiple choice question answering"
+       what: High school examinations
+       who: High school examinations writers and test-takers
+       when: before 2020
+       language: Arabic
+
+   - name: aratrust
+     display_name: AraTrust
+     description: AraTrust, a benchmark for evaluating the trustworthiness of language models in Arabic.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: "question answering"
+       what: "academic questions across various disciplines"
+       who: "academic exams writers and takers"
+       when: "before 2024"
+       language: Arabic
+
+   - name: alrage
+     display_name: ALRAGE
+     description: ALRAGE, an open-ended retrieval-augmented generation (RAG) question answering benchmark for Arabic, scored by an LLM judge.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: alrage_score
+       main_split: test
+     taxonomy:
+       task: "openbook (RAG) open-ended question answering"
+       what: "?"
+       who: "?"
+       when: "?"
+       language: Arabic
+
+   - name: madinah_qa
+     display_name: MadinahQA
+     description: Arabic language competency benchmark.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: "question answering"
+       what: "academic questions about Arabic language"
+       who: "academic exams writers and takers"
+       when: "before 2024"
+       language: Arabic
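
A note for readers unfamiliar with HELM schemas: the `${main_name}` and `${main_split}` placeholders in the `metric_groups` section are filled in per run group from that group's `environment` block. As an illustrative expansion (not a literal file in the package), the `accuracy` metric group resolves as follows for the `alrage` run group defined above:

```yaml
# Illustrative expansion only: the templated 'accuracy' metric group as it
# resolves for the 'alrage' run group, whose environment block above sets
# main_name: alrage_score and main_split: test.
metric_groups:
  - name: accuracy
    display_name: Accuracy
    aggregation_strategies:
      - mean
    metrics:
      - name: alrage_score   # substituted for ${main_name}
        split: test          # substituted for ${main_split}
```

For the other Arabic run groups, which all set `main_name: exact_match`, the same group resolves to `exact_match` over the `test` split.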

helm/benchmark/static/schema_classic.yaml
@@ -1683,23 +1683,6 @@ run_groups:
        when: n/a
        language: synthetic

-   - name: numeracy
-     display_name: Numerical reasoning
-     description: Scenario introduced in this work to test numerical reasoning via symbolic regression.
-     metric_groups:
-       - accuracy
-       - efficiency
-       - general_information
-     environment:
-       main_name: absolute_value_difference
-       main_split: test
-     taxonomy:
-       task: next-word prediction
-       what: Dyck formal language
-       who: n/a
-       when: n/a
-       language: synthetic
-
    - name: synthetic_reasoning
      display_name: Synthetic reasoning (abstract symbols)
      description: Synthetic reasoning tasks defined using abstract symbols based on LIME [(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).

helm/benchmark/static/schema_long_context.yaml
@@ -191,15 +191,15 @@ run_groups:
      description: Scenarios for evaluating long context capabilities
      category: All scenarios
      subgroups:
-       - ruler_hotpotqa
        - ruler_squad
+       - ruler_hotpotqa
+       - infinite_bench_en_mc
        - infinite_bench_en_sum
-       - infinite_bench_en_qa
        - openai_mrcr

-   - name: ruler_hotpotqa
-     display_name: RULER HotPotQA
-     description: RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question answering as a long-context scenario.
+   - name: ruler_squad
+     display_name: RULER SQuAD
+     description: RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., 2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate single-hop question answering as a long-context scenario.
      metric_groups:
        - accuracy
        - general_information
@@ -208,16 +208,15 @@ run_groups:
        main_name: ruler_string_match_part
        main_split: valid
      taxonomy:
-       task: question answering with retrieval-augmented generation
+       task: question answering
        what: Wikipedia articles
-       who: Wikipedia authors
+       who: Wikipedia authors and crowdworkers
        when: Before 2018
        language: English

-
-   - name: ruler_squad
-     display_name: RULER SQuAD
-     description: RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., 2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question answering as a long-context scenario.
+   - name: ruler_hotpotqa
+     display_name: RULER HotPotQA
+     description: RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate multi-hop question answering as a long-context scenario.
      metric_groups:
        - accuracy
        - general_information
@@ -226,24 +225,24 @@ run_groups:
        main_name: ruler_string_match_part
        main_split: valid
      taxonomy:
-       task: question answering
+       task: question answering with retrieval-augmented generation
        what: Wikipedia articles
-       who: Wikipedia authors and crowdworkers
+       who: Wikipedia authors
        when: Before 2018
        language: English

-   - name: infinite_bench_en_qa
-     display_name: ∞Bench En.QA
-     description: ∞Bench En.QA is a question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
+   - name: infinite_bench_en_mc
+     display_name: ∞Bench En.MC
+     description: ∞Bench En.MC is a multiple-choice question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
      metric_groups:
        - accuracy
        - general_information
        - annotation_metrics
      environment:
-       main_name: f1_score
+       main_name: exact_match
        main_split: test
      taxonomy:
-       task: question answering
+       task: multiple-choice question answering
        what: Novels
        who: Novel authors
        when: Before 2024

helm/benchmark/static/schema_medhelm.yaml
@@ -484,6 +484,8 @@ run_groups:
        - ehrshot
        - head_qa
        - medbullets
+       - med_qa
+       - med_mcqa
        - medalign
        - shc_ptbm_med
        - shc_sei_med
@@ -657,6 +659,40 @@ run_groups:
        when: Any
        language: English

+   - name: med_qa
+     display_name: MedQA
+     description: MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: question answering
+       what: n/a
+       who: n/a
+       when: n/a
+       language: English
+
+   - name: med_mcqa
+     display_name: MedMCQA
+     description: MedMCQA is a "multiple-choice question answering (MCQA) dataset designed to address real-world medical entrance exam questions" ([Pal et al. 2022](https://arxiv.org/abs/2203.14371)).
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: valid
+     taxonomy:
+       task: question answering
+       what: n/a
+       who: n/a
+       when: n/a
+       language: English
+
    - name: medalign
      display_name: MedAlign
      short_display_name: MedAlign
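
The two schema_medhelm.yaml hunks above show the usual two-step pattern for surfacing a new scenario in a schema: the group name is appended to a parent group's `subgroups` list, and the group itself is defined under `run_groups`. A minimal sketch of that pattern; all names here (`medhelm_scenarios`, `my_scenario`, `My Scenario`) are placeholders, not identifiers from the package:

```yaml
# Minimal sketch of the two-part schema change; names are placeholders.
run_groups:
  - name: medhelm_scenarios       # an existing parent group (illustrative name)
    subgroups:
      - my_scenario               # step 1: register the new subgroup
  - name: my_scenario             # step 2: define the run group itself
    display_name: My Scenario
    description: One-sentence description shown in the frontend.
    metric_groups:
      - accuracy
      - efficiency
      - general_information
    environment:
      main_name: exact_match      # headline metric for this group
      main_split: test            # split whose results are displayed (med_mcqa uses valid)
    taxonomy:
      task: question answering
      language: English
```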

helm/benchmark/static/schema_slp.yaml
@@ -0,0 +1,219 @@
+ ############################################################
+ metrics:
+   # Infrastructure metrics:
+   - name: num_perplexity_tokens
+     display_name: '# tokens'
+     description: Average number of tokens in the predicted output (for language modeling, the input too).
+   - name: num_bytes
+     display_name: '# bytes'
+     description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+   - name: num_references
+     display_name: '# ref'
+     description: Number of references.
+   - name: num_train_trials
+     display_name: '# trials'
+     description: Number of trials, where in each trial we choose an independent, random set of training instances.
+   - name: estimated_num_tokens_cost
+     display_name: 'cost'
+     description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+   - name: num_prompt_tokens
+     display_name: '# prompt tokens'
+     description: Number of tokens in the prompt.
+   - name: num_prompt_characters
+     display_name: '# prompt chars'
+     description: Number of characters in the prompt.
+   - name: num_completion_tokens
+     display_name: '# completion tokens'
+     description: Actual number of completion tokens (over all completions).
+   - name: num_output_tokens
+     display_name: '# output tokens'
+     description: Actual number of output tokens.
+   - name: max_num_output_tokens
+     display_name: 'Max output tokens'
+     description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+   - name: num_requests
+     display_name: '# requests'
+     description: Number of distinct API requests.
+   - name: num_instances
+     display_name: '# eval'
+     description: Number of evaluation instances.
+   - name: num_train_instances
+     display_name: '# train'
+     description: Number of training instances (e.g., in-context examples).
+   - name: prompt_truncated
+     display_name: truncated
+     description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+   - name: finish_reason_length
+     display_name: finish b/c length
+     description: Fraction of instances where the output was terminated because of the max tokens limit.
+   - name: finish_reason_stop
+     display_name: finish b/c stop
+     description: Fraction of instances where the output was terminated because of the stop sequences.
+   - name: finish_reason_endoftext
+     display_name: finish b/c endoftext
+     description: Fraction of instances where the output was terminated because the end of text token was generated.
+   - name: finish_reason_unknown
+     display_name: finish b/c unknown
+     description: Fraction of instances where the output was terminated for unknown reasons.
+   - name: num_completions
+     display_name: '# completions'
+     description: Number of completions.
+   - name: predicted_index
+     display_name: Predicted index
+     description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+   # Accuracy metrics:
+   - name: exact_match
+     display_name: Exact match
+     short_display_name: EM
+     description: Fraction of instances that the predicted output matches a correct reference exactly.
+     lower_is_better: false
+   - name: classification_macro_f1
+     display_name: Macro-F1
+     description: Population-level macro-averaged F1 score.
+     lower_is_better: false
+   - name: classification_micro_f1
+     display_name: Micro-F1
+     description: Population-level micro-averaged F1 score.
+     lower_is_better: false
+   - name: wer_score
+     display_name: Word Error Rate
+     description: Transcription word error rate.
+     lower_is_better: true
+   - name: mer_score
+     display_name: Character Error Rate
+     description: Character error rate.
+     lower_is_better: true
+
+ ############################################################
+ perturbations: []
+
+ ############################################################
+ metric_groups:
+   - name: accuracy
+     display_name: Accuracy
+     hide_win_rates: true
+     metrics:
+       - name: exact_match
+         split: ${main_split}
+       - name: classification_macro_f1
+         split: ${main_split}
+       - name: classification_micro_f1
+         split: ${main_split}
+
+   - name: transcription_accuracy
+     display_name: Transcription Accuracy
+     hide_win_rates: true
+     metrics:
+       - name: wer_score
+         split: ${main_split}
+       - name: mer_score
+         split: ${main_split}
+
+   - name: efficiency
+     display_name: Efficiency
+     metrics:
+       - name: inference_runtime
+         split: ${main_split}
+
+   - name: general_information
+     display_name: General information
+     hide_win_rates: true
+     metrics:
+       - name: num_instances
+         split: ${main_split}
+       - name: num_train_instances
+         split: ${main_split}
+       - name: prompt_truncated
+         split: ${main_split}
+       - name: num_prompt_tokens
+         split: ${main_split}
+       - name: num_output_tokens
+         split: ${main_split}
+
+ ############################################################
+
+ run_groups:
+   - name: slp
+     display_name: SLP Scenarios
+     description: Speech-language pathology (SLP) scenarios for disordered pediatric speech.
+     category: All scenarios
+     subgroups:
+       - disorder_diagnosis
+       - transcription
+       - symptom_diagnosis
+       - disorder_type_diagnosis
+
+
+   - name: disorder_diagnosis
+     display_name: Disorder Diagnosis Accuracy
+     description: >
+       Macro-averaged accuracy on disorder diagnosis for pediatric speech disorder.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: classification_micro_f1
+       main_split: test
+     taxonomy:
+       task: classification
+       what: n/a
+       who: n/a
+       when: "?"
+       language: English
+
+   - name: transcription
+     display_name: Transcription Accuracy
+     description: >
+       Model transcription accuracy on disordered pediatric speech.
+     metric_groups:
+       - transcription_accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: wer_score
+       main_split: test
+     taxonomy:
+       task: transcription
+       what: disordered pediatric speech
+       who: n/a
+       when: "?"
+       language: English
+
+   - name: symptom_diagnosis
+     display_name: Symptom Diagnosis Accuracy
+     description: >
+       Macro-averaged accuracy on symptom diagnosis for pediatric speech disorder.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: classification_micro_f1
+       main_split: test
+     taxonomy:
+       task: classification
+       what: n/a
+       who: n/a
+       when: "?"
+       language: English
+
+   - name: disorder_type_diagnosis
+     display_name: Disorder Type Diagnosis Accuracy
+     description: >
+       Macro-averaged accuracy on disorder type diagnosis for pediatric speech disorder.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: classification_micro_f1
+       main_split: test
+     taxonomy:
+       task: classification
+       what: n/a
+       who: n/a
+       when: "?"
+       language: English
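
Both newly added schema files (`schema_arabic.yaml` above and `schema_slp.yaml` here) follow the same top-level layout as HELM's existing schemas. A stripped-down skeleton with illustrative values, showing the four sections and how they reference one another:

```yaml
# Skeleton of a HELM schema file; values are illustrative, not from the package.
metrics:                       # every metric that a metric_group may reference
  - name: exact_match
    display_name: Exact match
    lower_is_better: false
perturbations: []              # both new schemas define no perturbations
metric_groups:                 # reusable bundles, templated on the environment
  - name: accuracy
    display_name: Accuracy
    metrics:
      - name: ${main_name}     # filled in per run group
        split: ${main_split}
run_groups:                    # scenario groups rendered in the frontend
  - name: example_group        # placeholder name
    display_name: Example Group
    metric_groups:
      - accuracy
    environment:
      main_name: exact_match
      main_split: test
```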