crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/run_specs/lmkt_run_specs.py (new file)
@@ -0,0 +1,144 @@
+"""Run spec functions for Vietnam WVS cultural alignment evaluation."""
+
+from helm.benchmark.adaptation.common_adapter_specs import (
+    get_generation_adapter_spec,
+)
+from helm.benchmark.metrics.common_metric_specs import (
+    get_exact_match_metric_specs,
+    get_f1_metric_specs,
+    get_open_ended_generation_metric_specs,
+)
+from helm.benchmark.metrics.lmkt_metric_specs import get_semantic_similarity_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+INSTRUCTIONS = {
+    "cultural_value_understanding_wvs": {
+        "en": {
+            "instructions": "Please respond as the {country} persona described below.",
+            "input_noun": "Question",
+            "output_noun": "Answer",
+        },
+        "vi": {
+            "instructions": "Vui lòng trả lời như một người {country} được mô tả bên dưới.",
+            "input_noun": "Câu hỏi",
+            "output_noun": "Trả lời",
+        },
+    },
+    "social_norm_application_normad": {
+        "en": {
+            "instructions": "Please respond as the {country} persona described below.",
+            "input_noun": "Situation",
+            "output_noun": "Response",
+        },
+        "vi": {
+            "instructions": "Vui lòng trả lời như một người {country} được mô tả bên dưới.",
+            "input_noun": "Tình huống",
+            "output_noun": "Phản hồi",
+        },
+    },
+    "social_norm_explanation_normad": {
+        "en": {
+            "instructions": "Please respond as the {country} persona described below.",
+            "input_noun": "Situation",
+            "output_noun": "Explanation",
+        },
+        "vi": {
+            "instructions": "Vui lòng trả lời như một người {country} được mô tả bên dưới.",
+            "input_noun": "Tình huống",
+            "output_noun": "Giải thích",
+        },
+    },
+}
+
+COUNTRIES = {
+    "US": "United States",
+    "VN": "Vietnam",
+}
+
+
+@run_spec_function("cultural_value_understanding_wvs")
+def get_cultural_value_understanding_wvs_spec(language: str, country: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.lmkt_scenarios.CulturalValueUnderstandingWVSScenario",
+        args={
+            "language": language,
+            "num_personas": 300,
+            "num_question_variants": 4,
+            "include_few_shot_examples": True,
+        },
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=INSTRUCTIONS["cultural_value_understanding_wvs"][language]["instructions"].format(
+            country=COUNTRIES[country]
+        ),
+        input_noun=INSTRUCTIONS["cultural_value_understanding_wvs"][language]["input_noun"],
+        output_noun=INSTRUCTIONS["cultural_value_understanding_wvs"][language]["output_noun"],
+        max_tokens=3,
+        stop_sequences=[],
+    )
+
+    return RunSpec(
+        name="cultural_value_understanding_wvs",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_f1_metric_specs(),
+        groups=["lmkt", "cultural_value_understanding_wvs"],
+    )
+
+
+@run_spec_function("social_norm_application_normad")
+def get_social_norm_application_normad_spec(language: str, country: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.lmkt_scenarios.SocialNormApplicationNormADScenario",
+        args={
+            "language": language,
+        },
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=INSTRUCTIONS["social_norm_application_normad"][language]["instructions"].format(
+            country=COUNTRIES[country]
+        ),
+        input_noun=INSTRUCTIONS["social_norm_application_normad"][language]["input_noun"],
+        output_noun=INSTRUCTIONS["social_norm_application_normad"][language]["output_noun"],
+        max_tokens=5,
+        stop_sequences=[],
+    )
+
+    return RunSpec(
+        name="social_norm_application_normad",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_f1_metric_specs(),
+        groups=["lmkt", "social_norm_application_normad"],
+    )
+
+
+@run_spec_function("social_norm_explanation_normad")
+def get_social_norm_explanation_normad_spec(language: str, country: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.lmkt_scenarios.SocialNormExplanationNormADScenario",
+        args={
+            "language": language,
+        },
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=INSTRUCTIONS["social_norm_explanation_normad"][language]["instructions"].format(
+            country=COUNTRIES[country]
+        ),
+        input_noun=INSTRUCTIONS["social_norm_explanation_normad"][language]["input_noun"],
+        output_noun=INSTRUCTIONS["social_norm_explanation_normad"][language]["output_noun"],
+        max_tokens=128,
+        stop_sequences=[],
+    )
+
+    return RunSpec(
+        name="social_norm_explanation_normad",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_open_ended_generation_metric_specs() + get_semantic_similarity_metric_specs(),
+        groups=["lmkt", "social_norm_explanation_normad"],
+    )
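Each of the three functions above is registered with @run_spec_function and returns a complete RunSpec, so it can be exercised directly. A minimal usage sketch (the module path is inferred from the file listing above; the language and country values are just examples, not defaults from this release):

    from helm.benchmark.run_specs.lmkt_run_specs import get_cultural_value_understanding_wvs_spec

    # Build the RunSpec for the English prompts with the United States persona.
    run_spec = get_cultural_value_understanding_wvs_spec(language="en", country="US")
    print(run_spec.name)                     # cultural_value_understanding_wvs
    print(run_spec.adapter_spec.max_tokens)  # 3, per the adapter spec defined above
    print(run_spec.groups)                   # ['lmkt', 'cultural_value_understanding_wvs']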
helm/benchmark/run_specs/long_context_run_specs.py
@@ -1,4 +1,9 @@
-from helm.benchmark.adaptation.adapter_spec import ADAPT_CHAT, ADAPT_GENERATION, AdapterSpec
+from helm.benchmark.adaptation.adapter_spec import (
+    ADAPT_CHAT,
+    ADAPT_GENERATION,
+    ADAPT_MULTIPLE_CHOICE_JOINT,
+    AdapterSpec,
+)
 from helm.benchmark.metrics.common_metric_specs import (
     get_exact_match_metric_specs,
     get_open_ended_generation_metric_specs,
@@ -29,6 +34,27 @@ def _get_long_context_generation_adapter_spec(max_tokens: int) -> AdapterSpec:
     )


+def _get_long_context_multiple_choice_adapter_spec(max_tokens: int) -> AdapterSpec:
+    return AdapterSpec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        global_prefix="",
+        global_suffix="",
+        instructions="Read the passage and answer the following question. Respond with only a single letter corresponding to your choice. Do not include a period in your answer.\n\n",  # noqa: E501
+        input_prefix="",
+        input_suffix="\n",
+        reference_prefix="A. ",
+        reference_suffix="\n",
+        output_prefix="\nAnswer the question above based on the passage. Respond with only a single letter corresponding to your choice. Do not include a period in your answer.\n",  # noqa: E501
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        temperature=0.0,
+        max_tokens=max_tokens,
+        stop_sequences=[],
+    )
+
+
 @run_spec_function("ruler_hotpotqa")
 def get_ruler_hotpotqa_spec(max_num_words: int = 131072) -> RunSpec:
     scenario_spec = ScenarioSpec(
@@ -96,6 +122,27 @@ def get_infinite_bench_en_qa_spec(max_num_words: int = 131072) -> RunSpec:
     )


+@run_spec_function("infinite_bench_en_mc")
+def get_infinite_bench_en_mc_spec(max_num_words: int = 131072) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.infinite_bench_en_mc_scenario.InfiniteBenchEnMCScenario",
+        args={
+            "max_num_words": max_num_words,
+        },
+    )
+
+    adapter_spec = _get_long_context_multiple_choice_adapter_spec(max_tokens=40)
+    metric_specs = get_exact_match_metric_specs()
+
+    return RunSpec(
+        name=f"infinite_bench_en_mc:max_num_words={max_num_words}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["infinite_bench_en_mc"],
+    )
+
+
 @run_spec_function("infinite_bench_en_sum")
 def get_infinite_bench_en_sum_spec(max_num_words: int = 131072) -> RunSpec:

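The new _get_long_context_multiple_choice_adapter_spec feeds HELM's joint multiple-choice adapter (ADAPT_MULTIPLE_CHOICE_JOINT) with empty prefixes and zero in-context examples. A hedged illustration, written as comments in the same module, of the zero-shot prompt shape its fields imply (an assumption about how the adapter assembles the prompt, not output captured from this release):

    # Rough prompt shape implied by the AdapterSpec fields above (illustrative only):
    #
    #   Read the passage and answer the following question. Respond with only a single
    #   letter corresponding to your choice. Do not include a period in your answer.
    #
    #   <instance passage and question>
    #   A. <first reference>
    #   B. <second reference>
    #   ...
    #
    #   Answer the question above based on the passage. Respond with only a single letter
    #   corresponding to your choice. Do not include a period in your answer.
    spec = _get_long_context_multiple_choice_adapter_spec(max_tokens=40)
    assert spec.method == ADAPT_MULTIPLE_CHOICE_JOINT
    assert spec.max_train_instances == 0 and spec.temperature == 0.0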
helm/benchmark/run_specs/medhelm/__init__.py (new empty file; no content to display)
helm/benchmark/run_specs/medhelm/benchmark_config.py (new file)
@@ -0,0 +1,219 @@
+import yaml
+import json
+import re
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Union
+from abc import ABC
+
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.metrics.common_metric_specs import (
+    get_basic_metric_specs,
+    get_exact_match_metric_specs,
+    get_summarization_metric_specs,
+)
+from helm.common.gpu_utils import get_torch_device_name
+
+
+SUMMARIZATION_METRICS = {
+    "rouge_1",
+    "rouge_2",
+    "rouge_l",
+    "BERTScore-P",
+    "BERTScore-R",
+    "BERTScore-F",
+}
+
+
+@dataclass(frozen=True)
+class MetricConfig(ABC):
+    """Base class for all metric configurations"""
+
+    name: str
+
+
+@dataclass(frozen=True)
+class SimpleMetricConfig(MetricConfig):
+    """Configuration for simple string-based metrics like 'exact_match'"""
+
+    pass
+
+
+@dataclass(frozen=True)
+class JuryMetricConfig(MetricConfig):
+    """Configuration for jury-based metrics with multiple judges"""
+
+    prompt_file: str
+    judges: List[AnnotatorModelInfo]
+
+
+@dataclass(frozen=True)
+class BenchmarkConfig:
+    """
+    A benchmark configuration is an immutable data structure that holds
+    the configuration for a specific benchmark, including prompt, dataset and metric
+    """
+
+    name: str
+    """Name of the benchmark"""
+
+    description: str
+    """Description of the benchmark"""
+
+    prompt_file: str
+    """Path to the prompt file. This prompt will be used for all instances of the benchmark."""
+
+    dataset_file: str
+    """Path to the dataset file. This dataset will be used to populate the context in the prompt."""
+
+    main_metric: Union[SimpleMetricConfig, JuryMetricConfig]
+    """The main metric for the benchmark"""
+
+    metrics: List[Union[SimpleMetricConfig, JuryMetricConfig]]
+    """List of structured metric configurations for the benchmark"""
+
+    max_tokens: int = 1024
+    """Maximum number of tokens to generate in the response"""
+
+    def get_metric_specs(self) -> List[MetricSpec]:
+        """Get the metric specifications for the benchmark"""
+        metric_specs: List[MetricSpec] = []
+        summarization = False
+        for metric in self.metrics:
+            if metric.name == "exact_match":
+                metric_specs.extend(get_exact_match_metric_specs())
+
+            elif metric.name == "jury_score":
+                if not isinstance(metric, JuryMetricConfig):
+                    raise AssertionError("Metric 'jury_score' must be a JuryMetricConfig")
+                annotator_models = {judge.model_deployment: judge for judge in metric.judges}
+                metric_specs.append(
+                    MetricSpec(
+                        class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+                        args={
+                            "metric_name": "jury_score",
+                            "scenario_name": self.name,
+                            "annotator_models": annotator_models,
+                        },
+                    )
+                )
+                if len(self.metrics) == 1:
+                    metric_specs.extend(get_basic_metric_specs([]))
+
+            elif metric.name in SUMMARIZATION_METRICS:
+                if not summarization:
+                    summarization = True
+                    metric_args = {
+                        "task": self.name,
+                        "device": get_torch_device_name(),
+                        "bertscore_model": "distilbert-base-uncased",
+                        "rescale_with_baseline": False,
+                    }
+                    metric_specs.extend(get_summarization_metric_specs(metric_args))
+            else:
+                raise ValueError(f"Unknown metric name: {metric.name}")
+        return metric_specs
+
+    def _get_annotation_criteria(self, prompt_template: str) -> Dict[str, List[str]]:
+        criteria_tag = re.compile(r"<rubric_criteria>\s*(\{.*?\})\s*</rubric_criteria>", re.DOTALL)
+        m = criteria_tag.search(prompt_template)
+        if not m:
+            raise ValueError("No <rubric_criteria>{...}</rubric_criteria> block found in prompt_template.")
+        raw = json.loads(m.group(1))
+        # normalize to Dict[str, Set[str]]
+        return {k: list(v) for k, v in raw.items()}
+
+    def get_annotator_specs(self) -> List[AnnotatorSpec]:
+        """Convert jury metrics to AnnotatorSpec objects"""
+        annotator_specs = []
+        # return annotator_specs
+        for metric in self.metrics:
+            if isinstance(metric, JuryMetricConfig):
+                with open(metric.prompt_file, "r") as f:
+                    prompt_template = f.read()
+                annotator_models = {judge.model_deployment: judge for judge in metric.judges}
+                annotator_criteria = self._get_annotation_criteria(prompt_template)
+                # Create a generic annotator spec - you may need to customize the class_name
+                # based on your specific use case
+                annotator_specs.append(
+                    AnnotatorSpec(
+                        class_name="helm.benchmark.annotation.model_as_judge.LLMAsJuryAnnotator",
+                        args={
+                            "name": self.name,
+                            "prompt_template": prompt_template,
+                            "annotation_criteria": annotator_criteria,
+                            "annotator_models": annotator_models,
+                        },
+                    )
+                )
+
+        return annotator_specs
+
+
+def _convert_metrics(raw_metrics: List[Dict[str, Any]]) -> List[MetricConfig]:
+    """
+    Convert raw metrics from YAML into structured MetricConfig objects.
+    """
+    converted_metrics: List[MetricConfig] = []
+
+    for metric in raw_metrics:
+        if not isinstance(metric, dict) or "name" not in metric:
+            raise ValueError(
+                f"Invalid metric format: {metric}. Each metric must be a dict with at least a 'name' field."
+            )
+
+        metric_name = metric["name"]
+
+        if metric_name == "jury_score":
+            if "prompt_file" not in metric or "judges" not in metric:
+                raise ValueError(f"jury_score metric requires 'prompt_file' and 'judges': {metric}")
+
+            judges = [
+                AnnotatorModelInfo(
+                    model_name=j["model_name"],
+                    model_deployment=j["name"],
+                )
+                for j in metric["judges"]
+            ]
+
+            converted_metrics.append(
+                JuryMetricConfig(name=metric_name, prompt_file=metric["prompt_file"], judges=judges)
+            )
+        else:
+            converted_metrics.append(SimpleMetricConfig(name=metric_name))
+
+    return converted_metrics
+
+
+def _structure_benchmark_config(data: Dict[str, Any], cls) -> BenchmarkConfig:
+    """
+    Custom structure function for BenchmarkConfig that handles metrics conversion
+    """
+    if "metrics" in data:
+        data = data.copy()  # Don't modify the original
+        raw_metrics = data["metrics"]
+        data["metrics"] = _convert_metrics(raw_metrics)
+        data["main_metric"] = data["metrics"][0]
+    else:
+        raise ValueError("No metrics specified.")
+
+    return BenchmarkConfig(
+        name=data["name"],
+        description=data["description"],
+        prompt_file=data["prompt_file"],
+        dataset_file=data["dataset_file"],
+        main_metric=data["main_metric"],
+        metrics=data["metrics"],
+        max_tokens=data.get("max_tokens", 1024),
+    )
+
+
+def get_benchmark_config_from_path(path: str) -> BenchmarkConfig:
+    """Load and parse benchmark configuration from YAML file"""
+    with open(path) as f:
+        config = yaml.safe_load(f)
+
+    benchmark_config = _structure_benchmark_config(config, BenchmarkConfig)
+    return benchmark_config
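Since get_benchmark_config_from_path simply passes the yaml.safe_load result to _structure_benchmark_config, the expected YAML shape can be read off that function: top-level name, description, prompt_file, dataset_file, optional max_tokens, and a metrics list whose first entry becomes main_metric, with jury_score entries requiring prompt_file and judges. A hedged sketch using an equivalent in-memory dict (every path and judge identifier below is a hypothetical placeholder, not taken from this release):

    # Sketch only: this dict mirrors what yaml.safe_load would return for a MedHELM benchmark config.
    example_config = {
        "name": "example_benchmark",                        # hypothetical benchmark name
        "description": "Illustrative MedHELM-style benchmark.",
        "prompt_file": "prompts/example_prompt.txt",        # hypothetical path
        "dataset_file": "datasets/example_dataset.csv",     # hypothetical path
        "max_tokens": 512,
        "metrics": [
            {
                "name": "jury_score",
                "prompt_file": "prompts/example_judge_prompt.txt",  # hypothetical path
                "judges": [
                    {"name": "example/judge-deployment", "model_name": "example/judge-model"},
                ],
            },
            {"name": "exact_match"},
        ],
    }

    benchmark_config = _structure_benchmark_config(example_config, BenchmarkConfig)
    metric_specs = benchmark_config.get_metric_specs()  # LLMJuryMetric spec plus exact-match specs
    # get_annotator_specs() would additionally read the judge prompt file, which must contain a
    # <rubric_criteria>{...}</rubric_criteria> block for _get_annotation_criteria to parse.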