crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those package versions.
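For readers who want to reproduce this comparison locally, a minimal sketch follows (assuming pip and network access; the "wheels" directory name and the printed summary are illustrative choices, not part of the release): it downloads both published wheels and compares the file listings recorded in their .dist-info/RECORD entries.

    # Sketch only: download both published wheels and diff their RECORD file lists.
    import pathlib
    import subprocess
    import zipfile

    for version in ("0.5.6", "0.5.10"):
        subprocess.run(
            ["pip", "download", f"crfm-helm=={version}", "--no-deps", "-d", "wheels"],
            check=True,
        )

    def record_entries(wheel_path):
        # Every wheel carries a .dist-info/RECORD listing its files as "path,hash,size".
        with zipfile.ZipFile(wheel_path) as wheel:
            record = next(n for n in wheel.namelist() if n.endswith(".dist-info/RECORD"))
            return {line.split(",")[0] for line in wheel.read(record).decode().splitlines() if line}

    old, new = (
        record_entries(next(pathlib.Path("wheels").glob(f"crfm_helm-{v}-*.whl")))
        for v in ("0.5.6", "0.5.10")
    )
    print("added files:", len(new - old), "removed files:", len(old - new))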

Potentially problematic release: this version of crfm-helm has been flagged as possibly problematic.

Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
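Among the additions above are several new run-spec and scenario modules (for example arabic_run_specs.py, codeinsights_run_specs.py, healthqa_br_run_specs.py) along with new clients such as openrouter_client.py. These register entries that are exercised through the same helm-run console script the wheel already ships. A rough, hedged sketch of the usual invocation is below; the run entry shown is the long-standing MMLU quickstart example, since the exact entry names registered by the new modules are not visible in this listing, and "v1" is a placeholder suite name.

    # Sketch only: invoke the helm-run console script installed by the wheel.
    import subprocess

    subprocess.run(
        [
            "helm-run",
            "--run-entries", "mmlu:subject=anatomy,model=openai/gpt2",
            "--suite", "v1",
            "--max-eval-instances", "10",
        ],
        check=True,
    )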
helm/benchmark/static_build/assets/index-qOFpOyHb.js
@@ -0,0 +1,10 @@
+ import{r as o,a as Fe,L as b,O as Be,d as ze,u as re,f as be,h as Ue,H as We,i as qe,j as P,R as Ve}from"./react-BteFIppM.js";import{g as B,b as q,m as te,s as ve,a as Je,d as he,y as Ge,c as ue,e as ie,l as le}from"./tremor-DR4fE7ko.js";import"./recharts-DxuQtTOs.js";(function(){const t=document.createElement("link").relList;if(t&&t.supports&&t.supports("modulepreload"))return;for(const r of document.querySelectorAll('link[rel="modulepreload"]'))n(r);new MutationObserver(r=>{for(const l of r)if(l.type==="childList")for(const i of l.addedNodes)i.tagName==="LINK"&&i.rel==="modulepreload"&&n(i)}).observe(document,{childList:!0,subtree:!0});function a(r){const l={};return r.integrity&&(l.integrity=r.integrity),r.referrerPolicy&&(l.referrerPolicy=r.referrerPolicy),r.crossOrigin==="use-credentials"?l.credentials="include":r.crossOrigin==="anonymous"?l.credentials="omit":l.credentials="same-origin",l}function n(r){if(r.ep)return;r.ep=!0;const l=a(r);fetch(r.href,l)}})();var we={exports:{}},X={};/**
+ * @license React
+ * react-jsx-runtime.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */var Qe=o,Ke=Symbol.for("react.element"),Ze=Symbol.for("react.fragment"),Xe=Object.prototype.hasOwnProperty,Ye=Qe.__SECRET_INTERNALS_DO_NOT_USE_OR_YOU_WILL_BE_FIRED.ReactCurrentOwner,es={key:!0,ref:!0,__self:!0,__source:!0};function ye(s,t,a){var n,r={},l=null,i=null;a!==void 0&&(l=""+a),t.key!==void 0&&(l=""+t.key),t.ref!==void 0&&(i=t.ref);for(n in t)Xe.call(t,n)&&!es.hasOwnProperty(n)&&(r[n]=t[n]);if(s&&s.defaultProps)for(n in t=s.defaultProps,t)r[n]===void 0&&(r[n]=t[n]);return{$$typeof:Ke,type:s,key:l,ref:i,props:r,_owner:Ye.current}}X.Fragment=Ze;X.jsx=ye;X.jsxs=ye;we.exports=X;var e=we.exports,ae={},xe=Fe;ae.createRoot=xe.createRoot,ae.hydrateRoot=xe.hydrateRoot;function ss({title:s,titleId:t,...a},n){return o.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},a),s?o.createElement("title",{id:t},s):null,o.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M3.75 6.75h16.5M3.75 12h16.5m-16.5 5.25h16.5"}))}const Ne=o.forwardRef(ss);function ts({title:s,titleId:t,...a},n){return o.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},a),s?o.createElement("title",{id:t},s):null,o.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9 12.75L11.25 15 15 9.75M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const as=o.forwardRef(ts);function ns({title:s,titleId:t,...a},n){return o.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},a),s?o.createElement("title",{id:t},s):null,o.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9.75 9.75l4.5 4.5m0-4.5l-4.5 4.5M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const rs=o.forwardRef(ns),Le=""+new URL("crfm-logo-Du4T1uWZ.png",import.meta.url).href,Me=""+new URL("helm-logo-simple-DzOhNN41.png",import.meta.url).href;function is({title:s,titleId:t,...a},n){return o.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},a),s?o.createElement("title",{id:t},s):null,o.createElement("path",{fillRule:"evenodd",d:"M12 2.25a.75.75 0 01.75.75v11.69l3.22-3.22a.75.75 0 111.06 1.06l-4.5 4.5a.75.75 0 01-1.06 0l-4.5-4.5a.75.75 0 111.06-1.06l3.22 3.22V3a.75.75 0 01.75-.75zm-9 13.5a.75.75 0 01.75.75v2.25a1.5 1.5 0 001.5 1.5h13.5a1.5 1.5 0 001.5-1.5V16.5a.75.75 0 011.5 0v2.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V16.5a.75.75 0 01.75-.75z",clipRule:"evenodd"}))}const ls=o.forwardRef(is);function os({title:s,titleId:t,...a},n){return o.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},a),s?o.createElement("title",{id:t},s):null,o.createElement("path",{fillRule:"evenodd",d:"M15.75 2.25H21a.75.75 0 01.75.75v5.25a.75.75 0 01-1.5 0V4.81L8.03 17.03a.75.75 0 01-1.06-1.06L19.19 3.75h-3.44a.75.75 0 010-1.5zm-10.5 4.5a1.5 1.5 0 00-1.5 1.5v10.5a1.5 1.5 0 001.5 1.5h10.5a1.5 1.5 0 001.5-1.5V10.5a.75.75 0 011.5 0v8.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V8.25a3 3 0 013-3h8.25a.75.75 0 010 1.5H5.25z",clipRule:"evenodd"}))}const cs=o.forwardRef(os);function ds({title:s,titleId:t,...a},n){return 
o.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},a),s?o.createElement("title",{id:t},s):null,o.createElement("path",{fillRule:"evenodd",d:"M15 3.75a.75.75 0 01.75-.75h4.5a.75.75 0 01.75.75v4.5a.75.75 0 01-1.5 0V5.56l-3.97 3.97a.75.75 0 11-1.06-1.06l3.97-3.97h-2.69a.75.75 0 01-.75-.75zm-12 0A.75.75 0 013.75 3h4.5a.75.75 0 010 1.5H5.56l3.97 3.97a.75.75 0 01-1.06 1.06L4.5 5.56v2.69a.75.75 0 01-1.5 0v-4.5zm11.47 11.78a.75.75 0 111.06-1.06l3.97 3.97v-2.69a.75.75 0 011.5 0v4.5a.75.75 0 01-.75.75h-4.5a.75.75 0 010-1.5h2.69l-3.97-3.97zm-4.94-1.06a.75.75 0 010 1.06L5.56 19.5h2.69a.75.75 0 010 1.5h-4.5a.75.75 0 01-.75-.75v-4.5a.75.75 0 011.5 0v2.69l3.97-3.97a.75.75 0 011.06 0z",clipRule:"evenodd"}))}const ms=o.forwardRef(ds);function hs({title:s,titleId:t,...a},n){return o.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},a),s?o.createElement("title",{id:t},s):null,o.createElement("path",{fillRule:"evenodd",d:"M12.53 16.28a.75.75 0 01-1.06 0l-7.5-7.5a.75.75 0 011.06-1.06L12 14.69l6.97-6.97a.75.75 0 111.06 1.06l-7.5 7.5z",clipRule:"evenodd"}))}const Ee=o.forwardRef(hs);function us({title:s,titleId:t,...a},n){return o.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},a),s?o.createElement("title",{id:t},s):null,o.createElement("path",{fillRule:"evenodd",d:"M11.47 4.72a.75.75 0 011.06 0l3.75 3.75a.75.75 0 01-1.06 1.06L12 6.31 8.78 9.53a.75.75 0 01-1.06-1.06l3.75-3.75zm-3.75 9.75a.75.75 0 011.06 0L12 17.69l3.22-3.22a.75.75 0 111.06 1.06l-3.75 3.75a.75.75 0 01-1.06 0l-3.75-3.75a.75.75 0 010-1.06z",clipRule:"evenodd"}))}const Se=o.forwardRef(us);function xs({title:s,titleId:t,...a},n){return o.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},a),s?o.createElement("title",{id:t},s):null,o.createElement("path",{fillRule:"evenodd",d:"M10.5 3.75a6.75 6.75 0 100 13.5 6.75 6.75 0 000-13.5zM2.25 10.5a8.25 8.25 0 1114.59 5.28l4.69 4.69a.75.75 0 11-1.06 1.06l-4.69-4.69A8.25 8.25 0 012.25 10.5z",clipRule:"evenodd"}))}const fs=o.forwardRef(xs);function oe(s,t){return t?t==="home"?"https://crfm.stanford.edu/helm/":s?`https://crfm.stanford.edu/helm/${t}/${s}/`:`https://crfm.stanford.edu/helm/${t}/latest/`:"#"}function ke(){const[s,t]=o.useState([]),[a,n]=o.useState();return o.useEffect(()=>{if(a&&a.title&&a.title!=="All Leaderboards"){const r=a.title==="Lite"||a.title==="Classic"?"HELM "+a.title:a.title;document.title=r+" - Holistic Evaluation of Language Models (HELM)"}},[a]),o.useEffect(()=>{fetch("https://crfm.stanford.edu/helm/project_metadata.json").then(r=>r.json()).then(r=>{if(t(r),window.PROJECT_ID){const l=r.find(i=>i.id===window.PROJECT_ID);n(l)}else{const l=r.find(i=>i.id==="lite");n(l)}}).catch(r=>{console.error("Error fetching JSON:",r)})},[]),a===void 0||a.title===void 0?null:e.jsxs("div",{className:"dropdown z-50",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"btn normal-case bg-white font-bold p-2 border-0 text-lg block whitespace-nowrap z-40","aria-haspopup":"true","aria-controls":"menu",children:[a.title," ",e.jsx(Ee,{fill:"black",color:"black",className:"text w-4 h-4 inline"})]}),e.jsx("ul",{tabIndex:0,className:"-translate-x-36 dropdown-content z-[1] menu p-1 
shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:s.map((r,l)=>e.jsx("li",{className:"z-40",children:e.jsxs("a",{href:oe(void 0,r.id),className:"block",role:"menuitem",children:[e.jsx("strong",{className:a.title===r.title?"underline":"",children:r.title}),": ",r.description]})},l))})]})}function R(s){return s.startsWith("http://")||s.startsWith("https://")?s:`${window.BENCHMARK_OUTPUT_BASE_URL.replace(/\/$/,"")}/${s.replace(/^\//,"").split("/").map(t=>encodeURIComponent(t)).join("/")}`}function V(){return window.RELEASE?`/releases/${window.RELEASE}`:`/runs/${window.SUITE}`}async function ps(s){try{return await(await fetch(R(`${V()}/summary.json`),{signal:s})).json()}catch(t){return console.log(t),{release:void 0,suites:void 0,suite:void 0,date:""}}}function gs(){const[s,t]=o.useState({release:void 0,suites:void 0,suite:void 0,date:""}),[a,n]=o.useState();o.useEffect(()=>{fetch("https://crfm.stanford.edu/helm/project_metadata.json").then(h=>h.json()).then(h=>{if(window.PROJECT_ID){const x=h.find(p=>p.id===window.PROJECT_ID);n(x)}else{const x=h.find(p=>p.id==="lite");n(x)}}).catch(h=>{console.error("Error fetching JSON:",h)})},[]),o.useEffect(()=>{const h=new AbortController;async function x(){const p=await ps(h.signal);t(p)}return x(),()=>h.abort()},[]);const r=a!==void 0&&a.releases!==void 0?a.releases:["v1.0.0"],l=s.release||s.suite||null;if(!l)return null;const i=`Release ${l} (${s.date})`;if(r.length<=1)return e.jsx("div",{children:i});const u=r.indexOf(l),d=u<0?e.jsx(B,{color:"blue",children:"preview"}):u===0?e.jsx(B,{color:"blue",children:"latest"}):e.jsx(B,{color:"yellow",children:"stale"});return e.jsxs("div",{className:"dropdown",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"normal-case bg-white border-0 block whitespace-nowrap","aria-haspopup":"true","aria-controls":"menu",children:[i," ",d," ",e.jsx(Ee,{fill:"black",color:"black",className:"inline text w-4 h-4"})]}),e.jsx("ul",{tabIndex:0,className:"dropdown-content z-[50] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:["latest"].concat(r).map(h=>e.jsx("li",{children:e.jsx("a",{href:oe(h,a?a.id:"lite"),className:"block",role:"menuitem",children:h})},h))})]})}function js(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsxs("div",{className:"dropdown md:hidden mr-4",children:[e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(Ne,{className:"w-16 h-16"})}),e.jsxs("ul",{tabIndex:0,className:"menu menu-lg dropdown-content mt-3 z-50 p-2 bg-base-100 shadow",children:[e.jsx("li",{children:e.jsx(b,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(b,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(b,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(b,{to:"runs",className:"whitespace-nowrap",children:"Predictions"})}),e.jsx("li",{children:e.jsx(b,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})})]})]})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx("a",{href:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:Le,className:"object-contain"})}),e.jsx(b,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:Me,className:"object-contain"})}),e.jsx(ke,{})]}),e.jsx("div",{className:"flex-none hidden md:block",children:e.jsxs("ul",{className:"flex flex-row gap-6 
px-1",children:[e.jsx("li",{children:e.jsx(b,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(b,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(b,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(b,{to:"runs",children:"Predictions"})}),e.jsx("li",{children:e.jsx(b,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})}),e.jsx("li",{className:"hidden lg:flex",children:e.jsx(gs,{})})]})})]})}function bs(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsx("div",{className:"dropdown md:hidden mr-4",children:e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(Ne,{className:"w-16 h-16"})})})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx(b,{to:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:Le,className:"object-contain"})}),e.jsx(b,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:Me,className:"object-contain"})}),e.jsx(ke,{})]})]})}function vs(){return e.jsxs(e.Fragment,{children:[window.PROJECT_ID==="home"?e.jsx(bs,{}):e.jsx(js,{}),e.jsx("main",{className:"p-8 pt-0",children:e.jsx("div",{className:"mx-auto max-w-[1500]px",children:e.jsx(Be,{})})})]})}async function F(s){try{return await(await fetch(R(`${V()}/schema.json`),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),{adapter:[],metric_groups:[],metrics:[],models:[],perturbations:[],run_groups:[]}}}function ws({href:s,children:t}){return e.jsx("a",{href:s,className:"link link-primary link-hover",target:"_blank",rel:"noreferrer",children:t})}function z({value:s}){return e.jsx("span",{children:e.jsx(ze,{components:{a:ws},children:s})})}function O({title:s,subtitle:t,markdown:a=!1}){return e.jsxs("header",{className:"m-4 ml-0",children:[e.jsx("h1",{className:"text-4xl",children:s}),a&&t!==void 0?e.jsx("h2",{className:"mt-2 text-neutral",children:e.jsx(z,{value:t})}):t!==void 0&&e.jsx("h2",{className:"mt-2 text-neutral",children:t})]})}const ys={open:"green",limited:"yellow",closed:"red"},Ns={open:"Open",limited:"Limited",closed:"Closed"};function Ls({level:s}){return e.jsx(B,{color:ys[s],children:Ns[s]})}function $(){return e.jsx("div",{className:"w-full",children:e.jsx("div",{className:"block mx-auto my-24 loading loading-spinner loading-lg"})})}function Ms(){const[s,t]=o.useState([]);o.useEffect(()=>{const i=new AbortController;async function u(){const d=await F(i.signal);t(d.models)}return u(),()=>i.abort()},[]);const[a,n,r]=s.reduce((i,u)=>{switch(u.access){case"open":i[0]+=1;break;case"limited":i[1]+=1;break;case"closed":i[2]+=1;break}return i},[0,0,0]),l=Object.values(s.reduce((i,u)=>{const d=u.creator_organization;return i[d]===void 0?(i[d]={name:d,models:1},i):(i[d].models+=1,i)},{}));return s.length===0?e.jsx($,{}):e.jsxs(e.Fragment,{children:[e.jsx(O,{title:"Models"}),e.jsxs("div",{className:"overflow-x-auto 
mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Creator"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Description"}),e.jsx("th",{children:"Access"})]})}),e.jsx("tbody",{children:s.map(i=>e.jsxs("tr",{children:[e.jsx("td",{className:"text-lg",children:i.creator_organization}),e.jsxs("td",{children:[e.jsx("span",{className:"text-xl",children:i.display_name}),e.jsx("br",{}),e.jsx("span",{children:i.name})]}),e.jsx("td",{children:e.jsx(z,{value:i.description})}),e.jsx("td",{children:e.jsx(Ls,{level:i.access})})]}))})]}),e.jsx(O,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-3 grid-cols-1 gap-8",children:[e.jsxs(q,{className:"flex flex-col justify-between",children:[e.jsx(te,{children:"Models"}),e.jsx(ve,{className:"text-6xl md:!text-[96px]",children:s.length}),e.jsx(Je,{values:[a,n,r],colors:["green","yellow","red"]}),e.jsx(he,{categories:["Open","Limited","Closed"],colors:["green","yellow","red"]})]}),e.jsxs(q,{className:"md:col-span-2",children:[e.jsx(te,{children:"Creator Organizations"}),e.jsxs("div",{className:"flex justify-between mt-4",children:[e.jsx(Ge,{data:l,category:"models",index:"name",variant:"pie",className:"basis-5/12"}),e.jsx(he,{categories:l.map(i=>i.name),className:"basis-7/12"})]})]})]})]})]})}function Q({to:s,children:t,inTable:a=!1,title:n=""}){return a?e.jsx(b,{className:"link link-hover",to:s,title:n,children:t}):e.jsx(b,{className:"link link-primary link-hover",to:s,children:t})}function Es(){const[s,t]=o.useState([]),[a,n]=o.useState({});o.useEffect(()=>{const i=new AbortController;async function u(){const d=await F(i.signal);t(d.run_groups.filter(h=>!h.todo&&h.taxonomy&&!h.display_name.includes("CLEVA"))),n(Object.fromEntries(d.metrics.map(h=>[h.name,h])))}return u(),()=>i.abort()},[]);const r=i=>{var x;const u=(x=i.environment)==null?void 0:x.main_name,d=u?a[u]:void 0;if(d===void 0)return"";const h=d.display_name||d.short_display_name||d.name;return d.description?`${h} – ${d.description}`:h},l=Object.values(s.reduce((i,u)=>{var h;const d=((h=u.taxonomy)==null?void 0:h.task)||"Unknown";return i[d]===void 0?(i[d]={name:d,value:1},i):(i[d].value+=1,i)},{}));return s.length===0?e.jsx($,{}):(console.log(s),e.jsxs(e.Fragment,{children:[e.jsx(O,{title:"Scenarios",subtitle:"A scenario represents a use case and consists of a dataset of instances."}),e.jsxs("div",{className:"overflow-x-auto mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Scenario"}),e.jsx("th",{children:"Task"}),e.jsx("th",{children:"What"}),e.jsx("th",{children:"Who"}),e.jsx("th",{children:"When"}),e.jsx("th",{children:"Language"}),e.jsx("th",{children:"Description"}),e.jsx("th",{children:"Metric"})]})}),e.jsx("tbody",{children:s.map(i=>{var u,d,h,x,p;return e.jsxs("tr",{children:[e.jsxs("td",{children:[e.jsx(Q,{to:`/leaderboard/${i.name}`,children:e.jsx("span",{className:"text-lg",children:i.display_name})}),e.jsx("span",{className:"block",children:i.name})]}),e.jsx("td",{children:((u=i.taxonomy)==null?void 0:u.task)||""}),e.jsx("td",{children:((d=i.taxonomy)==null?void 0:d.what)||""}),e.jsx("td",{children:((h=i.taxonomy)==null?void 0:h.who)||""}),e.jsx("td",{children:((x=i.taxonomy)==null?void 0:x.when)||""}),e.jsx("td",{children:((p=i.taxonomy)==null?void 
0:p.language)||""}),e.jsx("td",{children:e.jsx(z,{value:i.short_description||i.description})}),e.jsx("td",{children:e.jsx(z,{value:r(i)})})]})})})]}),e.jsx(O,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-4 gap-8",children:[e.jsxs(q,{className:"flex flex-col",children:[e.jsx(te,{children:"Total scenarios"}),e.jsx(ve,{className:"mx-auto my-6 md:mt-16 !text-[72px] md:!text-[96px]",children:s.length})]}),e.jsx(q,{className:"col-span-3",children:e.jsxs("div",{className:"grid md:grid-cols-2 gap-x-12",children:[e.jsx(ue,{data:l.slice(0,Math.floor(l.length/2))}),e.jsx(ue,{data:l.slice(Math.ceil(l.length/2))})]})})]})]})]}))}function Ss(){return R(`${V()}/groups.json`)}async function Re(s){try{return await(await fetch(Ss(),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),[]}}function ce({children:s}){return e.jsx("div",{role:"navigation",className:"tabs flex-nowrap border-b-2 border-gray-2 overflow-x-auto overflow-y-hidden",children:s})}function K({active:s=!1,onClick:t=()=>{},size:a="md",children:n}){return e.jsx("div",{onClick:t,className:`whitespace-nowrap text-${a} mb-[-2px] text-md tab tab-bordered${s?" border-2 border-grey-500 rounded":" border-none"}`,children:n})}function ks({title:s,titleId:t,...a},n){return o.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 20 20",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},a),s?o.createElement("title",{id:t},s):null,o.createElement("path",{fillRule:"evenodd",d:"M4.25 5.5a.75.75 0 00-.75.75v8.5c0 .414.336.75.75.75h8.5a.75.75 0 00.75-.75v-4a.75.75 0 011.5 0v4A2.25 2.25 0 0112.75 17h-8.5A2.25 2.25 0 012 14.75v-8.5A2.25 2.25 0 014.25 4h5a.75.75 0 010 1.5h-5z",clipRule:"evenodd"}),o.createElement("path",{fillRule:"evenodd",d:"M6.194 12.753a.75.75 0 001.06.053L16.5 4.44v2.81a.75.75 0 001.5 0v-4.5a.75.75 0 00-.75-.75h-4.5a.75.75 0 000 1.5h2.553l-9.056 8.194a.75.75 0 00-.053 1.06z",clipRule:"evenodd"}))}const fe=o.forwardRef(ks);function U(s){return Number.isNaN(Number(s))?String(s):String(Math.round(Number(s)*1e3)/1e3)}function Rs(s){return s.replace(/[.*+?^${}()|[\]\\]/g,"\\$&")}function Z({value:s,title:t,hideIcon:a}){if(typeof s.value=="string"&&s.value.includes("⚠")&&(s.value=s.value.replace("⚠","")),s.value===void 0)return"-";if(s.run_spec_names){const n=(()=>{if(s.run_spec_names.length==1)return"/runs/"+s.run_spec_names[0];if(s.run_spec_names.length>1){const r="/runs/?q="+s.run_spec_names.map(i=>`^${Rs(i)}$`).join("|");return encodeURI(r)}})();return n?e.jsx(Q,{to:n,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center ",children:[U(s.value),!a&&e.jsx(fe,{className:"w-3 h-3 ml-1",stroke:"#cbcbcb",fill:"#cbcbcb"})]})}):t?e.jsx("a",{title:t,children:U(s.value)}):e.jsx(e.Fragment,{children:U(s.value)})}return s.href?e.jsx(Q,{to:s.href,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center",children:[U(s.value),!a&&e.jsx(fe,{className:"w-3 h-3 ml-1",stroke:"#cbcbcb",fill:"#cbcbcb"})]})}):s.markdown?e.jsx(z,{value:String(s.value)}):t?e.jsx("a",{title:t,children:U(s.value)}):e.jsx(e.Fragment,{children:U(s.value)})}function de({schema:s,groupTable:t,numRowsToDisplay:a,sortColumnIndex:n=1,sortable:r=!0,displayColumnIndexes:l=void 0,miniStyle:i=!1}){const[u,d]=o.useState(1),[h,x]=o.useState(Math.min(t.header.length-1,n));function p(c){return c.length>30?c.substring(0,27)+"...":c}const N=c=>{const f=["AIRBench 2024 -","-book"];if(c.value==="Model/adapter")return"Model";if(f.some(m=>c.value.includes(m))){let 
m=c.value;return f.forEach(v=>{m=m.replace(v,"")}),p(m)}else return p(c.value)},w=c=>{if(s){const f=s.models.find(m=>m.display_name===c);if(f){let m=f.description;return m.includes("/")&&(m=m.replace("/","_")),m}}return""},S=c=>{d(c===h?u*-1:c===0?-1:1),x(c)},j=c=>{if(s){const f=s.models.find(m=>m.display_name===c);if(f){let m=f.name;return m.includes("/")&&(m=m.replace("/","_")),m}}return""},A=()=>{const c=t.header[h].lower_is_better,f=u*(c?1:-1),m=t.rows.slice();return m.sort((v,k)=>{var M,_;const y=(M=v[h])==null?void 0:M.value,L=(_=k[h])==null?void 0:_.value;return y!==void 0&&L===void 0?-1:L!==void 0&&y===void 0?1:typeof y=="number"&&typeof L=="number"?(y-L)*f:typeof y=="string"&&typeof L=="string"?f===1?y.localeCompare(L):L.localeCompare(y):0}),a>0?m.slice(0,a):m};function g(c){const f=c.lastIndexOf(" - ");return f===-1?c:c.substring(0,f)+"*"+c.substring(f+1)}const C=c=>{const m=g(c).split("*")[0].trim();if(s){const v=s.run_groups.find(k=>k.display_name===m||k.short_display_name===m);if(v)return v.name}return""};return e.jsxs("table",{className:i?"table w-full":"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsx("tr",{children:t.header.filter((c,f)=>l===void 0||l.includes(f)).map((c,f)=>e.jsx("th",{className:`${f===h?"bg-gray-100":"bg-white"} ${f===0?"left-0 z-40":""} ${c.description?"underline decoration-dashed decoration-gray-300 ":""} whitespace-nowrap px-4 sticky top-0`,title:c.description?c.description:"",children:e.jsxs("div",{className:i?"flex gap-2 items-center":"z-20 flex justify-between items-center min-w-48 w-48 max-w-48 text-wrap",children:[e.jsx("span",{className:"inline-block w-full break-words",children:N(c)}),r?e.jsx("button",{className:"link",onClick:()=>S(f),children:e.jsx(Se,{className:"w-6 h-6"})}):null]})},`$${f}`))})}),e.jsx("tbody",{children:A().map((c,f)=>e.jsx("tr",{children:c.filter((m,v)=>l===void 0||l.includes(v)).map((m,v)=>e.jsx("td",{className:`${v===0?"z-20 text-lg sticky left-0":"z-0"} ${f%2===0?"bg-gray-50":"bg-white"}`,children:v==1?e.jsx("div",{className:`${m&&m.style&&m.style["font-weight"]&&m.style["font-weight"]==="bold"?"font-bold":""}`,children:e.jsx(Z,{value:{...m,href:"/runs/?q="+j(String(c[0].value))},title:`Click value to see all predictions for: ${j(String(c[0].value))}`})}):e.jsx("div",{className:`${m&&m.style&&m.style["font-weight"]&&m.style["font-weight"]==="bold"?"font-bold":""} ${v===0?"underline decoration-dashed decoration-gray-300 z-10":"z-0"}`,children:e.jsx(Z,{value:{...m},title:String(c[0].value)===m.value?w(String(c[0].value)):`Click value to see predictions for ${String(c[0].value)} for ${C(N(t.header[v]))}: ${j(String(c[0].value))}`})})},`${v}`))},`$${c[0].value}`))})]})}function As(){const[s,t]=o.useState(0),[a,n]=o.useState(),[r,l]=o.useState();return o.useEffect(()=>{const i=new AbortController;async function u(){const d=F(i.signal),h=Re(i.signal),x=await d;l(x);const p=await h;n(p)}return u(),()=>i.abort()},[]),a===void 0||r===void 0?e.jsx($,{}):a.length===0?e.jsxs("div",{children:[e.jsx(O,{title:"Results",subtitle:"Groupings of the processes, methods, and metrics involved in evaluating models, particularly in the context of natural language understanding and question answering.",className:"mb-16"}),e.jsx("div",{children:"No groups found."})]}):e.jsxs("div",{children:[e.jsx(O,{title:"Results",subtitle:"Groupings of the processes, methods, and metrics involved in evaluating models, particularly in the context of natural language understanding and question 
answering.",className:"mb-16"}),e.jsxs("div",{children:[a.length>1?e.jsx(ce,{children:a.map((i,u)=>e.jsx(K,{active:u===s,onClick:()=>t(u),children:i.title},u))}):null,e.jsx(de,{schema:r,groupTable:a[s],numRowsToDisplay:-1,sortColumnIndex:1,sortable:!0},`${s}`)]})]})}async function Cs(s){try{return await(await fetch(R(`${V()}/run_specs.json`),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),[]}}function me({currentPage:s,totalPages:t,onNextPage:a,onPrevPage:n,className:r}){let l="join";return r!==void 0&&(l=`join ${r}`),e.jsxs("div",{className:l,children:[e.jsx("button",{onClick:n,className:"join-item btn",children:"«"}),e.jsxs("button",{className:"join-item btn",children:["Page ",s," of ",t]}),e.jsx("button",{onClick:a,className:"join-item btn",children:"»"})]})}const ee=100;function _s(){const[s,t]=re(),[a,n]=o.useState(),[r,l]=o.useState(Number(s.get("page")||1)),[i,u]=o.useState(!0),[d,h]=o.useState(s.get("q")||"");o.useEffect(()=>{const j=new AbortController;async function A(){const g=await Cs(j.signal);n(g)}return A(),()=>j.abort()},[]);const x=j=>{j.preventDefault();const g=j.target.q.value;h(g),t({q:g,page:"1"})};if(a===void 0)return e.jsx($,{});let p=null;if(i)try{p=new RegExp(d)}catch{p=null}const N=a.filter(j=>p?p.test(j.name):j.name.includes(d)),w=N.slice((r-1)*ee,r*ee),S=Math.ceil(N.length/ee);return e.jsxs(e.Fragment,{children:[e.jsx(O,{title:"Predictions",subtitle:"All benchmark predictions"}),e.jsxs("form",{className:"flex mb-8",onSubmit:x,children:[e.jsxs("div",{className:"form-control",children:[e.jsx("input",{type:"text",name:"q",placeholder:"Search",className:"input input-bordered",value:d,onChange:j=>h(j.target.value)}),e.jsxs("label",{className:"label",children:[e.jsxs("span",{className:"label-text-alt flex item-center",children:[e.jsx("input",{type:"checkbox",className:"toggle toggle-xs",checked:i,onChange:()=>u(!i)}),e.jsx("span",{className:"ml-2",children:"Regex"})]}),e.jsx("span",{className:"label-text-alt",children:`${N.length} results`})]})]}),e.jsx("div",{className:"form-control ml-4",children:e.jsx("button",{className:"btn",children:e.jsx(fs,{className:"w-6 h-6"})})})]}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Run"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Groups"}),e.jsx("th",{children:"Adapter method"}),e.jsx("th",{children:"Subject / Task"})]})}),e.jsx("tbody",{children:w.map((j,A)=>e.jsxs("tr",{children:[e.jsx("td",{children:e.jsx(Q,{to:`/runs/${j.name}`,children:j.name})}),e.jsx("td",{children:j.adapter_spec.model}),e.jsx("td",{children:j.groups.join(", ")}),e.jsx("td",{children:j.adapter_spec.method}),e.jsx("td",{children:j.scenario_spec.args.subject||j.scenario_spec.args.task||"-"})]},`${j.name}-${A}`))})]})}),S>0?e.jsx(me,{className:"flex justify-center my-8",onNextPage:()=>{const j=Math.min(r+1,S);l(j),s.set("page",String(j)),t(s)},onPrevPage:()=>{const j=Math.max(r-1,1);l(j),s.set("page",String(j)),t(s)},currentPage:r,totalPages:S}):e.jsx("div",{className:"my-8 text-center",children:"No results"})]})}function H(){return window.SUITE!==void 0?window.SUITE:void 0}async function Ts(s,t,a){try{return await(await fetch(R(`/runs/${a||H()}/${s}/scenario.json`),{signal:t})).json()}catch(n){n instanceof Error&&n.name!=="AbortError"&&console.log(n);return}}function Ae(s,t){return R(`/runs/${t||H()}/${s}/run_spec.json`)}async function Ps(s,t,a){try{return await(await 
fetch(Ae(s,a),{signal:t})).json()}catch(n){n instanceof Error&&n.name!=="AbortError"&&console.log(n);return}}function Is(s,t){return R(`/runs/${t||H()}/${s}/scenario_state.json`)}function Ce(){return window.RELEASE!==void 0?window.RELEASE:void 0}async function Hs(s){try{return await(await fetch(R(`/releases/${Ce()}/runs_to_run_suites.json`),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),{}}}function $s(s,t){return Ce()?s[t]:window.SUITE}function Ds(s){const a={quasi_exact_match:!1,toxic_frac:!0,safety_score:!1,exact_match:!1},n=Object.keys(s);for(const r of n)if(s[r]!==void 0&&a[r]!==void 0)return a[r]?s[r]<.5?[r,!0]:[r,!1]:s[r]>=.5?[r,!0]:[r,!1];return["",!1]}function Os(s){const[t,a]=Ds(s.stats);return t===""?null:a?e.jsx(Fs,{value:`${t.replace(/_/g," ")}: ${s.stats[t]}`}):e.jsx(Bs,{value:`${t.replace(/_/g," ")}: ${s.stats[t]}`})}function Fs({value:s}){return e.jsx(B,{icon:as,color:"green",children:s})}function Bs({value:s}){return e.jsx(B,{icon:rs,color:"red",children:s})}function I({value:s}){const[t,a]=o.useState(!1),[n,r]=o.useState(!1);return e.jsxs(e.Fragment,{children:[e.jsxs("div",{onMouseOver:()=>a(!0),onMouseOut:()=>a(!1),className:"relative",children:[e.jsx("div",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-[36rem] mb-2 whitespace-pre-wrap localize-text-direction",children:s}),t?e.jsx("button",{className:"bg-white absolute p-2 leading-none height-fit min-h-none right-1 bottom-1 shadow",onClick:()=>r(!0),children:e.jsx(ms,{fill:"black",color:"black",className:"text w-4 h-4"})}):null]}),e.jsx("dialog",{open:n,className:"modal p-16 bg-opacity-80 bg-white",onClick:()=>r(!1),children:e.jsx("div",{className:"modal-box max-w-none p-4 whitespace-pre-wrap bg-base-200 localize-text-direction",children:s})})]})}function _e({messages:s}){return e.jsx("div",{children:s.map((t,a)=>e.jsxs("div",{children:[e.jsx("div",{children:t.role}),e.jsx(I,{value:t.content})]},a))})}function Te({mediaObject:s}){if(s.content_type.includes("image")){if(s.location===void 0)return null;const t=R(s.location.replace("benchmark_output/","").replace("prod_env/","../"));return e.jsxs("div",{children:[e.jsx("img",{src:t}),e.jsx("br",{})]})}else if(s.content_type.includes("audio")){if(s.location===void 0)return null;const t=R(s.location.replace(/^.*benchmark_output\//,"").replace("prod_env/","../"));return e.jsx("div",{children:e.jsx("audio",{controls:!0,src:t})})}else if(s.content_type.includes("video")){if(s.location===void 0)return null;const t=R(s.location.replace(/^.*benchmark_output\//,"").replace("prod_env/","../"));return e.jsxs("div",{children:[e.jsx("video",{controls:!0,style:{width:"720px"},children:e.jsx("source",{src:t,type:s.content_type})}),e.jsx("br",{})]})}else return s.text&&s.content_type&&s.content_type==="text/plain"&&s.text.length>1?e.jsxs("div",{children:[s.text,e.jsx("br",{}),e.jsx("br",{})]}):e.jsx("div",{})}function Pe({multimediaObject:s}){return e.jsx("div",{children:s.media_objects.map(t=>e.jsx(Te,{mediaObject:t}))})}function zs(s){return Array.isArray(s)?s.length==0?"[]":`[${s.map(t=>String(t).replace(/\n/,"\\n")).join(", ")}]`:String(s)}function Us({request:s}){return e.jsxs("div",{children:[s.request.prompt.length>0?e.jsxs("div",{children:[e.jsxs("h3",{className:"block text text-gray-400",children:["Prompt (",s.request.prompt.length," Chars)"]}),e.jsx(I,{value:s.request.prompt})]}):s.request.multimodal_prompt?e.jsxs("div",{children:[e.jsx("h3",{className:"block text 
text-gray-400",children:"Prompt"}),e.jsx(Pe,{multimediaObject:s.request.multimodal_prompt})]}):s.request.messages&&s.request.messages.length?e.jsxs("div",{children:[e.jsx("h3",{className:"block text text-gray-400",children:"Prompt"}),e.jsx(_e,{messages:s.request.messages})]}):e.jsx("h3",{className:"block text text-gray-400",children:"Empty Prompt"}),e.jsx(ie,{children:Object.keys(s.request).filter(t=>t!=="prompt").map((t,a)=>e.jsxs(le,{children:[e.jsxs("span",{children:[t,":"]}),s.request&&s.request[t]?e.jsx("span",{children:zs(s.request[t])}):"null"]},a+1))})]})}function Ws(s){return e.jsx("div",{children:s.map((t,a)=>e.jsxs("div",{children:[t.error&&e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:"Error"}),e.jsx(I,{value:t.error})," "]}),t.text&&e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:"Text"}),e.jsx(I,{value:t.text})," "]}),t.media_object&&e.jsx(Te,{mediaObject:t.media_object})]},a))})}function ne(s,t){return Array.isArray(s)?s.flatMap((a,n)=>ne(a,`${t||""}[${n}]`)):s instanceof Object&&s.constructor===Object?Object.entries(s).flatMap(([a,n])=>ne(n,t?`${t}.${a}`:a)):[[t||"",typeof s=="string"?s:JSON.stringify(s)]]}function qs(s){return e.jsx("div",{children:ne(s).map(([t,a])=>e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:t}),e.jsx(I,{value:a})]},t))})}function Vs({predictionAnnotations:s}){return e.jsx("div",{children:s&&s!==void 0?Object.entries(s).map(([t,a])=>e.jsxs("details",{className:"collapse collapse-arrow border rounded-md bg-white my-2",children:[e.jsx("summary",{className:"collapse-title",children:e.jsx(e.Fragment,{children:"View "+t+" annotations"})}),e.jsx("div",{className:"collapse-content",children:Array.isArray(a)?Ws(a):qs(a)})]},t)):null})}function Js({predictions:s,requests:t,metricFieldMap:a}){return s.length<1?null:e.jsx("div",{children:e.jsx("div",{className:"flex flex-wrap justify-start items-start",children:s.map((n,r)=>e.jsxs("div",{className:"w-full",children:[s.length>1?e.jsxs("h2",{children:["Trial ",n.train_trial_index]}):null,e.jsxs("div",{className:"mt-2 w-full",children:[n.thinking_text?e.jsxs(e.Fragment,{children:[e.jsx("h3",{children:e.jsx("span",{className:"mr-4",children:"Thinking"})}),e.jsx(I,{value:n.thinking_text})]}):null,n.base64_images&&n.base64_images.length>0?e.jsxs(e.Fragment,{children:[e.jsx("h3",{className:"mr-4",children:"Prediction image"}),n.base64_images.map(l=>e.jsx("img",{src:"data:image;base64,"+l,alt:"Base64 Image"}))]}):e.jsxs(e.Fragment,{children:[e.jsxs("h3",{children:[e.jsx("span",{className:"mr-4",children:"Prediction raw text"}),e.jsx(Os,{stats:n.stats})]}),e.jsx(I,{value:n.predicted_text}),n.mapped_output?e.jsxs(e.Fragment,{children:[e.jsx("h3",{children:"Prediction mapped output"}),e.jsx(I,{value:String(n.mapped_output)})]}):null]})]}),e.jsx(Vs,{predictionAnnotations:n.annotations}),e.jsxs("div",{className:"mx-1",children:[e.jsx("h3",{children:"Metrics"}),e.jsx(ie,{children:Object.keys(n.stats).map((l,i)=>e.jsxs(le,{children:[a[l]?e.jsx("span",{title:a[l].description,children:a[l].display_name}):e.jsx("span",{children:l}),e.jsx("span",{children:String(n.stats[l])})]},i))})]}),e.jsxs("details",{className:"collapse collapse-arrow border rounded-md bg-white",children:[e.jsx("summary",{className:"collapse-title",children:"Request details"}),e.jsx("div",{className:"collapse-content",children:e.jsx(Us,{request:t[r]})})]})]},r))})})}const Gs="correct";function Qs({references:s}){return 
e.jsxs("span",{children:[e.jsx("h3",{children:"References"}),e.jsx("ul",{children:s.map((t,a)=>e.jsxs("li",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-72 mb-2 whitespace-pre-wrap localize-text-direction",children:[t.output.text,t.tags.map(n=>e.jsx(B,{className:"mx-2",color:n===Gs?"green":void 0,children:n}))]},a))})]})}function Ie(s){return e.jsx(I,{value:s===null?"null":typeof s=="object"?JSON.stringify(s):s.toString()})}function Ks(s){return e.jsx("div",{children:s.map(t=>Ie(t))})}function Zs({extraData:s}){return e.jsxs("details",{className:"collapse collapse-arrow border rounded-md bg-white my-2",children:[e.jsx("summary",{className:"collapse-title",children:"View instance extra data"}),e.jsx("div",{className:"collapse-content",children:Object.entries(s).map(([t,a])=>e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:t}),Array.isArray(a)?Ks(a):Ie(a)]}))})]})}function Xs({instance:s,requests:t,predictions:a,metricFieldMap:n}){return e.jsxs("div",{children:[e.jsx("h3",{children:"Input"}),s.input.multimedia_content!==void 0?e.jsx(Pe,{multimediaObject:s.input.multimedia_content}):s.input.text.includes('<br><img src="data:image;base64')?e.jsx("div",{dangerouslySetInnerHTML:{__html:s.input.text}}):s.input.messages!==void 0&&s.input.messages.length?e.jsx(_e,{messages:s.input.messages}):e.jsx(I,{value:s.input.text}),e.jsx("div",{children:s.references&&s.references.length>0?e.jsx(Qs,{references:s.references}):null}),s.extra_data&&s.extra_data.length?e.jsx(Zs,{extraData:s.extra_data}):null,e.jsx("div",{children:a&&t?e.jsx(Js,{predictions:a,requests:t,metricFieldMap:n}):null})]})}function Y(s){return s.includes("gpqa")||s.includes("ewok")}async function pe(s,t,a,n){const r=h=>Uint8Array.from(atob(h),x=>x.charCodeAt(0)),l=await window.crypto.subtle.importKey("raw",r(t),"AES-GCM",!0,["decrypt"]),i=new Uint8Array([...r(s),...r(n)]),u=r(a),d=await window.crypto.subtle.decrypt({name:"AES-GCM",iv:u},l,i);return new TextDecoder().decode(d)}async function Ys(s,t,a,n){try{const l=await(await fetch(R(`/runs/${a||H()}/${s}/instances.json`),{signal:t})).json();if(Y(s)&&n){const u=await(await fetch(R(`/runs/${a||H()}/${s}/encryption_data.json`),{signal:t})).json();for(const d of l){const h=u[d.input.text];h&&(d.input.text="encrypted",d.input.text=await pe(h.ciphertext,h.key,h.iv,h.tag));for(const x of d.references){const p=u[x.output.text];p&&(x.output.text=await pe(p.ciphertext,p.key,p.iv,p.tag))}}}return l}catch(r){return r instanceof Error&&r.name!=="AbortError"&&console.log(r),[]}}async function et(s,t,a,n){const r=h=>Uint8Array.from(atob(h),x=>x.charCodeAt(0)),l=await window.crypto.subtle.importKey("raw",r(t),"AES-GCM",!0,["decrypt"]),i=new Uint8Array([...r(s),...r(n)]),u=r(a),d=await window.crypto.subtle.decrypt({name:"AES-GCM",iv:u},l,i);return new TextDecoder().decode(d)}async function st(s,t,a,n){try{const l=await(await fetch(R(`/runs/${a||H()}/${s}/display_predictions.json`),{signal:t})).json();if(Y(s)&&n){const u=await(await fetch(R(`/runs/${a||H()}/${s}/encryption_data.json`),{signal:t})).json();for(const d of l){const h=d.predicted_text,x=u[h];if(x)try{d.predicted_text=await et(x.ciphertext,x.key,x.iv,x.tag)}catch(p){console.error(`Failed to decrypt predicted_text for instance_id: ${d.instance_id}`,p)}}}return l}catch(r){return r instanceof Error&&r.name==="AbortError"&&console.log(r),[]}}async function tt(s,t,a,n){const r=h=>Uint8Array.from(atob(h),x=>x.charCodeAt(0)),l=await window.crypto.subtle.importKey("raw",r(t),"AES-GCM",!0,["decrypt"]),i=new 
Uint8Array([...r(s),...r(n)]),u=r(a),d=await window.crypto.subtle.decrypt({name:"AES-GCM",iv:u},l,i);return new TextDecoder().decode(d)}async function at(s,t,a,n){try{const l=await(await fetch(R(`/runs/${a||H()}/${s}/display_requests.json`),{signal:t})).json();if(Y(s)&&n){const u=await(await fetch(R(`/runs/${a||H()}/${s}/encryption_data.json`),{signal:t})).json();for(const d of l){const h=d.request.prompt,x=u[h];if(x)try{d.request.prompt=await tt(x.ciphertext,x.key,x.iv,x.tag)}catch(p){console.error(`Failed to decrypt prompt for instance_id: ${d.instance_id}`,p)}}}return l}catch(r){return r instanceof Error&&r.name!=="AbortError"&&console.log(r),[]}}const J=10;function nt({runName:s,suite:t,metricFieldMap:a,userAgreed:n}){const[r,l]=re(),[i,u]=o.useState([]),[d,h]=o.useState(),[x,p]=o.useState(),[N,w]=o.useState(1);o.useEffect(()=>{const g=new AbortController;async function C(){const c=g.signal,[f,m,v]=await Promise.all([Ys(s,c,t,n),st(s,c,t,n),at(s,c,t,n)]);u(f);const k={};v.forEach(L=>{var T;const M=L.instance_id,_=((T=L.perturbation)==null?void 0:T.name)||"";k[M]===void 0&&(k[M]={}),k[M][_]===void 0&&(k[M][_]=[]),k[M][_].push(L)}),p(k);const y={};m.forEach(L=>{var T;const M=L.instance_id,_=((T=L.perturbation)==null?void 0:T.name)||"";y[M]===void 0&&(y[M]={}),y[M][_]===void 0&&(y[M][_]=[]),y[M][_].push(L)}),h(y)}return C(),()=>g.abort()},[s,t,n]);const S=i.slice((N-1)*J,(N-1)*J+J),j=Math.ceil(i.length/J);o.useEffect(()=>{const g=r.get("instance");if(g&&!window.helmHasScrolledToInstance&&S.length>0){if(S.findIndex(c=>c.id===g)===-1)return;requestAnimationFrame(()=>{const c=document.getElementById(`instance-${g}`);c&&c.scrollIntoView({behavior:"smooth"})}),window.helmHasScrolledToInstance=!0}},[r,N,l,S]);const A=g=>g.perturbation===void 0?`Instance id: ${g.id} [split: ${g.split}]`:`Instance id: ${g.id} [split: ${g.split}][perturbation: ${g.perturbation.name}]`;return d===void 0||x===void 0?e.jsx($,{}):e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"grid gap-8",children:S.map((g,C)=>{var c,f;return e.jsxs("div",{id:"instance-"+g.id,className:"border p-4",children:[e.jsxs("div",{className:"flex items-center justify-between",children:[e.jsx("h3",{className:"text-xl mb-4",children:A(g)}),e.jsx("button",{className:"btn btn-sm normal-case px-2 py-1",onClick:()=>{const m=window.location.href+(window.location.href.includes("?")?"&instance=":"?instance=")+g.id;navigator.clipboard.writeText(m)},children:"Copy Link"})]}),e.jsx(Xs,{instance:g,requests:x[g.id][((c=g.perturbation)==null?void 0:c.name)||""],predictions:d[g.id][((f=g.perturbation)==null?void 0:f.name)||""],metricFieldMap:a},`${g.id}-${C}`)]})})}),e.jsx(me,{className:"flex justify-center my-8",onNextPage:()=>{const g=Math.min(N+1,j);w(g),r.set("instancesPage",String(g)),l(r)},onPrevPage:()=>{const g=Math.max(N-1,1);w(g),r.set("instancesPage",String(g)),l(r)},currentPage:N,totalPages:j})]})}async function rt(s,t,a){try{return await(await fetch(R(`/runs/${a||H()}/${s}/stats.json`),{signal:t})).json()}catch(n){return n instanceof Error&&n.name!=="AbortError"&&console.log(n),[]}}function it({stat:s,metricFieldMap:t}){const a=`${s.name.split!==void 0?` on ${s.name.split}`:""}${s.name.sub_split!==void 0?`/${s.name.sub_split}`:""}${s.name.perturbation!==void 0?` with ${s.name.perturbation.name}`:" original"}`;return t[s.name.name]?e.jsxs("span",{title:t[s.name.name].description,children:[e.jsx("strong",{children:t[s.name.name].display_name||s.name.name}),a]}):e.jsxs("span",{children:[e.jsx("strong",{children:s.name.name}),a]})}const 
G=50,ge=["name","mean","min","max","sum","sum_squared","variance","stddev"];function lt({runName:s,suite:t,metricFieldMap:a}){const[n,r]=re(),[l,i]=o.useState(),[u,d]=o.useState(1),[h,x]=o.useState("");if(o.useEffect(()=>{const w=new AbortController;async function S(){const j=w.signal,A=await rt(s,j,t);i(A)}return S(),()=>w.abort()},[s,t]),l===void 0||l.length===0)return e.jsx($,{});const p=Math.ceil(l.length/G),N=l.slice((u-1)*G,(u-1)*G+G);return e.jsxs("div",{children:[e.jsx("div",{className:"flex justify-start my-4",children:e.jsx("input",{type:"text",className:"input input-bordered w-full max-w-xs",placeholder:"Search for a metric",onChange:w=>x(w.target.value)})}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsx("tr",{children:ge.map(w=>e.jsx("th",{children:w},w))})}),e.jsx("tbody",{children:N.filter(w=>!h||w.name.name.toLowerCase().includes(h.toLowerCase())).map(w=>e.jsx("tr",{children:ge.map(S=>{const j=w[S];return typeof j=="number"?e.jsx("td",{children:j}):e.jsx("td",{children:e.jsx(it,{stat:w,metricFieldMap:a})},S)})}))})]})}),e.jsx(me,{className:"flex justify-center my-8",onNextPage:()=>{const w=Math.min(u+1,p);d(w),n.set("metricsPage",String(w)),r(n)},onPrevPage:()=>{const w=Math.max(u-1,1);d(w),n.set("metricsPage",String(w)),r(n)},currentPage:u,totalPages:p})]})}function ot({runName:s,onAgree:t}){const a=o.useRef(null),n=()=>{a.current!==null&&a.current.value.trim()==="Yes, I agree"?t():alert("Please type 'Yes, I agree' exactly.")},r=s.includes("gpqa")?e.jsx(ct,{}):s.includes("ewok")?e.jsx(dt,{}):null;return e.jsxs("div",{className:"mb-8",children:[r,e.jsxs("p",{className:"mb-4",children:["If you agree to this condition, please type"," ",e.jsx("strong",{children:'"Yes, I agree"'})," in the box below and then click"," ",e.jsx("strong",{children:"Decrypt"}),"."]}),e.jsxs("div",{className:"flex gap-2 mt-2",children:[e.jsx("input",{type:"text",ref:a,className:"input input-bordered",placeholder:'Type "Yes, I agree"'}),e.jsx("button",{onClick:n,className:"btn btn-primary",children:"Decrypt"})]}),e.jsx("hr",{className:"my-4"})]})}function ct(){return e.jsxs("div",{children:[e.jsx("p",{className:"mb-4",children:"The GPQA dataset instances are encrypted by default to comply with the following request:"}),e.jsx("blockquote",{className:"italic border-l-4 border-gray-300 pl-4 text-gray-700 mb-4",children:"“We ask that you do not reveal examples from this dataset in plain text or images online, to minimize the risk of these instances being included in foundation model training corpora.”"})]})}function dt(){return e.jsxs("div",{children:[e.jsx("p",{className:"mb-4",children:"The EWoK dataset instances are encrypted by default to comply with the following request:"}),e.jsx("blockquote",{className:"italic border-l-4 border-gray-300 pl-4 text-gray-700 mb-4",children:"“PLEASE DO NOT distribute any of the EWoK materials or derivatives publicly in plain-text! 
Any materials should appear in password-protected ZIP files or behind gated authentication mechanisms such as Huggingface datasets.”"})]})}function mt(){var m;const{runName:s}=be(),[t,a]=o.useState(0),[n,r]=o.useState(),[l,i]=o.useState(),[u,d]=o.useState(),[h,x]=o.useState(),[p,N]=o.useState(),[w,S]=o.useState({}),[j,A]=o.useState({}),[g,C]=o.useState(!1);if(o.useEffect(()=>{const v=new AbortController;async function k(){const y=v.signal;if(s===void 0)return()=>v.abort();const L=window.SUITE?window.SUITE:$s(await Hs(y),s);i(L);const[M,_,T]=await Promise.all([Ps(s,y,L),Ts(s,y,L),F(y)]);r(M),x(_),A(T.metrics.reduce((D,W)=>(D[W.name]=W,D),{})),S(T.adapter.reduce((D,W)=>(D[W.name]=W,D),{})),N(T.run_groups.find(D=>M==null?void 0:M.groups.includes(D.name))),d(T.models.find(D=>D.name===(M==null?void 0:M.adapter_spec.model)))}return k(),()=>v.abort()},[s]),n===void 0||h===void 0||s===void 0||l===void 0||j===void 0)return e.jsx($,{});const c=(m=p==null?void 0:p.taxonomy)==null?void 0:m.language,f=c?"scenario-language-"+c.toLowerCase():"";return e.jsxs("div",{className:f,children:[e.jsx("div",{className:"flex justify-between gap-8 mb-12",children:e.jsxs("div",{children:[e.jsxs("h1",{className:"text-3xl flex items-center",children:[(p==null?void 0:p.display_name)||h.name,p?e.jsx("a",{href:"/#/groups/"+p.name,children:e.jsx(cs,{className:"w-6 h-6 ml-2"})}):null]}),e.jsx("h3",{className:"text-xl",children:e.jsx(z,{value:(p==null?void 0:p.description)||h.description})}),e.jsx("h1",{className:"text-3xl mt-2",children:(u==null?void 0:u.display_name)||n.adapter_spec.model}),e.jsx("h3",{className:"text-xl",children:e.jsx(z,{value:(u==null?void 0:u.description)||""})}),e.jsx("div",{className:"mt-2 flex gap-2",children:h.tags.map(v=>e.jsx(B,{size:"xs",color:"gray",children:e.jsx("span",{className:"text text-md",children:v})}))})]})}),e.jsxs(q,{children:[e.jsxs("div",{className:"flex justify-between",children:[e.jsx("h3",{className:"text-lg mb-1",children:"Adapter Specification"}),e.jsxs("div",{className:"flex gap-2",children:[e.jsx(ls,{className:"w-6 h-6 mr-1 text text-primary"}),e.jsx("a",{className:"link link-primary link-hover",href:Ae(n.name,l),download:"true",target:"_blank",children:"Spec JSON"}),e.jsx("a",{className:"link link-primary link-hover",href:Is(n.name,l),download:"true",target:"_blank",children:"Full JSON"})]})]}),e.jsx("div",{children:e.jsx(ie,{className:"grid md:grid-cols-2 lg:grid-cols-3 gap-x-8",children:Object.entries(n.adapter_spec).map(([v,k],y)=>e.jsxs(le,{className:y<3?"!border-0":"",children:[e.jsx("strong",{className:"mr-1",title:w[v]?w[v].description:void 0,children:`${v}: `}),e.jsx("span",{className:"overflow-x-auto",children:k})]}))})})]}),e.jsx("div",{className:"mt-16 mb-8",children:e.jsxs(ce,{children:[e.jsx(K,{size:"lg",active:t===0,onClick:()=>a(0),children:"Instances + Predictions"}),e.jsx(K,{size:"lg",active:t===1,onClick:()=>a(1),children:"All metrics"})]})}),t===0&&Y(s)&&!g&&e.jsx(ot,{runName:s,onAgree:()=>C(!0)}),t===0?e.jsx(nt,{runName:s,suite:l,metricFieldMap:j,userAgreed:g},g?"instances-agreed":"instances-not-agreed"):e.jsx(lt,{runName:s,suite:l,metricFieldMap:j})]})}function ht({schema:s,groupTable:t,numRowsToDisplay:a,sortColumnIndex:n=1,sortable:r=!0,miniStyle:l=!1}){const[i,u]=o.useState(1),[d,h]=o.useState(Math.min(t.header.length-1,n)),x=c=>c.value,p=c=>{if(s){const f=s.models.find(m=>m.display_name===c);if(f){let m=f.description;return m.includes("/")&&(m=m.replace("/","_")),m}}return""},N=c=>{u(c===d?i*-1:c===0?-1:1),h(c)},w=c=>{if(s){const 
f=s.models.find(m=>m.display_name===c);if(f){let m=f.name;return m.includes("/")&&(m=m.replace("/","_")),m}}return""},S=()=>{const c=t.header[d].lower_is_better,f=i*(c?1:-1),m=t.rows.slice();return m.sort((v,k)=>{var M,_;const y=(M=v[d])==null?void 0:M.value,L=(_=k[d])==null?void 0:_.value;return y!==void 0&&L===void 0?-1:L!==void 0&&y===void 0?1:typeof y=="number"&&typeof L=="number"?(y-L)*f:typeof y=="string"&&typeof L=="string"?f===1?y.localeCompare(L):L.localeCompare(y):0}),a>0?m.slice(0,a):m};function j(c){const f=c.lastIndexOf(" - ");return f===-1?c:c.substring(0,f)+"*"+c.substring(f+1)}const A=c=>{const m=j(c).split("*")[0].trim();if(s){const v=s.run_groups.find(k=>k.display_name===m||k.short_display_name===m);if(v)return v.name}return""},g=[];g.push(t.header.map((c,f)=>e.jsx("td",{className:`${f===d?"bg-gray-100":f%2===0?"bg-gray-50":"bg-white"} ${f===0?"top-0 z-20":"z-10"} ${c.description?"underline decoration-dashed decoration-gray-300":""} sticky left-0 justify-between items-center w-48`,title:c.description?c.description:"",children:e.jsxs("div",{className:l?"flex gap-2":"z-10 flex w-full",children:[e.jsx("span",{className:"inline-block w-full break-words",children:x(c)}),r?e.jsx("button",{className:"link",onClick:()=>N(f),children:e.jsx(Se,{className:"w-6 h-6"})}):null]})},`scenario-${f}`))),S().forEach((c,f)=>g.push(c.map((m,v)=>e.jsx("td",{className:`${v%2===0?"bg-gray-50":"bg-white"} ${v===0?"px-4 sticky top-0":""} w-32`,children:v==1?e.jsx("div",{className:`${m&&m.style&&m.style["font-weight"]&&m.style["font-weight"]==="bold"?"font-bold":""}`,children:e.jsx(Z,{value:{...m,href:"/runs/?q="+w(String(c[0].value))},title:`Click value to see all predictions for: ${w(String(c[0].value))}`})}):e.jsx("div",{className:`${m&&m.style&&m.style["font-weight"]&&m.style["font-weight"]==="bold"?"font-bold":""} ${v===0?"underline decoration-dashed decoration-gray-300 z-20":"z-0"}`,children:e.jsx(Z,{value:{...m},title:String(c[0].value)===m.value?p(String(c[0].value)):`Click value to see predictions for ${String(c[0].value)} for ${A(x(t.header[v]))}: ${w(String(c[0].value))}`})})},`${f}-${v}`))));const C=g[0].map((c,f)=>g.map(m=>m[f]));return e.jsxs("table",{className:l?"table table-fixed w-full":"rounded-lg shadow-md table table-fixed",children:[e.jsx("thead",{children:e.jsx("tr",{children:C[0]})}),e.jsx("tbody",{children:C.slice(1).map((c,f)=>e.jsx("tr",{children:c},f))})]})}async function He(s,t){try{return await(await fetch(R(`${V()}/groups/${s}.json`),{signal:t})).json()}catch(a){return a instanceof Error&&a.name!=="AbortError"&&console.log(a),[]}}function ut({schema:s,runGroupName:t,numRowsToDisplay:a=-1}){const[n,r]=o.useState(),[l,i]=o.useState(0);if(o.useEffect(()=>{const d=new AbortController;async function h(){const x=await He(t,d.signal);r(x)}return h(),()=>d.abort()},[s,t]),n===void 0||n.length===0)return e.jsx($,{});if(n.length===0)return e.jsx("div",{children:"Group currently has no tables."});const u=s.run_groups[0].name=="medhelm_scenarios";return e.jsxs("div",{children:[n.length>1?e.jsx(ce,{children:n.map((d,h)=>e.jsx(K,{active:h===l,onClick:()=>i(h),children:d.title},h))}):null,u?e.jsx(ht,{schema:s,groupTable:n[l],numRowsToDisplay:a,sortColumnIndex:1},`${t}-${l}`):e.jsx(de,{schema:s,groupTable:n[l],numRowsToDisplay:a,sortColumnIndex:1},`${t}-${l}`)]})}function se(){const{groupName:s}=be(),t=Ue(),[a,n]=o.useState(void 0),[r,l]=o.useState(),[i,u]=o.useState();o.useEffect(()=>{const x=new AbortController;async function p(){const N=F(x.signal),w=Re(x.signal),S=await N;n(S);let 
j;const A=await w,g={};A.forEach(C=>{g[C.title]=[],C.rows.forEach(c=>{const f=c[0].href.replace("?group=","");j===void 0&&(j=f),g[C.title].push({title:String(c[0].value),name:f})})}),l(g),u(j)}return p(),()=>x.abort()},[]);const d=s||i;if(a===void 0||r===void 0||d===void 0)return e.jsx($,{});let h;for(const x of a.run_groups)x.name===d&&(h=x);return e.jsxs(e.Fragment,{children:[e.jsxs("div",{className:"flex flex-row",children:[e.jsx("div",{className:"w-3/4",children:h?e.jsx(O,{title:"Leaderboard: "+h.display_name,subtitle:h.description,markdown:!0}):e.jsx(O,{title:"Leaderboard",subtitle:"The HELM leaderboard shows how the various models perform across different scenarios and metrics.",markdown:!0})}),e.jsx("div",{className:"w-1/4 pt-8",children:e.jsxs("label",{htmlFor:"group",className:"block text-sm font-medium text-gray-700",children:["Select a group:",e.jsx("select",{id:"group",name:"group",onChange:x=>t("/leaderboard/"+x.target.value),className:"mt-1 block w-full pl-3 pr-10 py-2 text-base border-gray-300 focus:outline-none focus:ring focus:border-blue-300 rounded-md",value:d,children:Object.entries(r).map(([x,p])=>e.jsx("optgroup",{label:x,children:p.map(N=>e.jsx("option",{value:N.name,children:N.title},N.name))},x))})]})})]}),e.jsx(ut,{schema:a,runGroupName:d},d)]})}const xt=""+new URL("instruct-flowchart-CQIN4bVV.svg",import.meta.url).href,ft=""+new URL("instruct-graph-B4NER252.svg",import.meta.url).href;function pt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 font-bold text-center",children:"HELM Instruct: A Multidimensional Instruction Following Evaluation Framework with Absolute Ratings"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://crfm.stanford.edu/2024/02/18/helm-instruct.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{children:["We introduce ",e.jsx("em",{children:"HELM Instruct"}),", a multidimensional evaluation framework for instruction-following LLMs with absolute ratings. The framework takes an instruction, a model, an evaluator, and a criterion to generate a score. In our study, we use HELM Instruct to compare 4 instruction-following models on 7 scenarios based on 4 Human/LM evaluators and 5 criteria. 
Check out the blog post for more details."]}),e.jsxs("div",{className:"grid my-16 grid-cols-1 md:mx-32 md:grid-cols-2 md:gap-2",children:[e.jsx("img",{src:xt,alt:"Evaluation flowchart",className:"mx-auto block",sizes:"100vw"}),e.jsx("img",{src:ft,alt:"7 scenarios, 4 models, 4 evaluators and 5 criteria",className:"mx-auto block",sizes:"100vw"})]}),e.jsxs("table",{className:"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Average"}),e.jsx("th",{children:"Helpfulness"}),e.jsx("th",{children:"Understandability"}),e.jsx("th",{children:"Completeness"}),e.jsx("th",{children:"Conciseness"}),e.jsx("th",{children:"Harmlessness"})]})}),e.jsxs("tbody",{children:[e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-4-0314"}),e.jsx("td",{children:"4.63"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.85"}),e.jsx("td",{children:"4.50"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.95"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-3.5-turbo-0613"}),e.jsx("td",{children:"4.60"}),e.jsx("td",{children:"4.34"}),e.jsx("td",{children:"4.86"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.41"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"anthropic_claude-v1.3"}),e.jsx("td",{children:"4.56"}),e.jsx("td",{children:"4.25"}),e.jsx("td",{children:"4.87"}),e.jsx("td",{children:"4.32"}),e.jsx("td",{children:"4.40"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"cohere_command-xlarge-beta"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"3.90"}),e.jsx("td",{children:"4.73"}),e.jsx("td",{children:"3.88"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"4.72"})]})]})]})]})}function $e({models:s}){return e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[s.length," models"]}),e.jsx("ul",{children:s.map((t,a)=>t.todo?e.jsxs("li",{className:"text-slate-300 mt-1",children:[t.creator_organization," / ",t.display_name]},a):e.jsx(b,{to:"models",children:e.jsxs("li",{className:"text-black mt-1",children:[t.creator_organization," / ",t.display_name]},a)}))})]})}function De({runGroups:s}){const t=new Map(s.filter(r=>r.metric_groups!==void 0&&(r.subgroups===void 0||r.subgroups.length===0)).map(r=>[r.name,r])),a=new Set,n=[];return s.forEach(r=>{const l=r.subgroups?r.subgroups:[],i=[];l.forEach(u=>{const d=t.get(u);d&&(i.push(d),a.add(d.name))}),i.length>0&&n.push([r,i])}),e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[a.size," scenarios"]}),e.jsx("ul",{children:n.map(([r,l])=>e.jsxs("li",{className:"my-3",children:[e.jsx(b,{className:"text-black",to:"groups/"+r.name,children:e.jsx("h2",{children:r.display_name})}),e.jsx("ul",{className:"list-disc list-inside",children:l.map(i=>i.todo?e.jsx("li",{className:`${i.todo?"ml-4 text-slate-300":"ml-4"}`,children:i.display_name},i.name):e.jsx(b,{className:"text-black",to:"groups/"+i.name,children:e.jsx("li",{className:`${i.todo?"ml-4 text-slate-300":"ml-4"}`,children:i.display_name},i.name)}))})]},r.name))})]})}const Oe=""+new URL("helmhero-D9TvmJsp.png",import.meta.url).href;function E({runGroupName:s=void 0,tableIndexToDisplay:t=0,numRowsToDisplay:a=10,sortColumnIndex:n=1}){const[r,l]=o.useState(void 0),[i,u]=o.useState(void 0);return o.useEffect(()=>{const d=new AbortController;async function h(){const x=await F(d.signal);l(x);const p=x.run_groups;if(p.length===0)return;const N=s||p[0].name,w=await He(N,d.signal);u(w[t])}return 
h(),()=>d.abort()},[s,t]),r===void 0||i===void 0?e.jsx($,{}):e.jsx("div",{className:"rounded-2xl overflow-hidden border-2 bg-white p-1 mx-2 my-0 overflow-x-auto",style:{overflow:"auto",justifyContent:"space-between"},children:e.jsx(de,{schema:r,groupTable:i,numRowsToDisplay:a,sortColumnIndex:n,displayColumnIndexes:[0,1],sortable:!1,miniStyle:!0})})}function gt(){return e.jsxs("div",{className:"flex flex-col px-4 sm:px-6 py-100 sm:py-10 sm:mb-96 md:mb-96 lg:mb-0 xl:mb-0 2xl:mb-0",children:[e.jsx("div",{className:"flex flex-col text-center mb-10 justify-start",children:e.jsx("h1",{className:"text-3xl sm:text-4xl mb-3 sm:mb-4 mx-2 mt-2",children:e.jsx("strong",{children:"A holistic framework for evaluating foundation models."})})}),e.jsxs("div",{className:"flex flex-col md:flex-col lg:flex-row lg:justify-center",children:[e.jsx("div",{className:"w-full lg:w-1/2 flex justify-center mb-4 lg:mb-0 h-full py-10",children:e.jsx("img",{src:Oe,alt:"HELM Hero",className:"object-contain w-96"})}),e.jsxs("div",{className:"py-2 rounded-xl bg-gray-100 h-full",children:[e.jsx(E,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-2 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function je(){const[s,t]=o.useState(void 0);return o.useEffect(()=>{const a=new AbortController;async function n(){const r=await F(a.signal);t(r)}return n(),()=>a.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(gt,{}),e.jsx("div",{className:"mx-auto text-lg px-16",children:e.jsx("div",{className:"container mx-auto",children:e.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-8",children:[e.jsx($e,{models:s.models}),e.jsx(De,{runGroups:s.run_groups})]})})})]}):null}function jt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Massive Multitask Language Understanding (MMLU) on HELM"}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsxs("p",{children:[e.jsx("strong",{children:"Massive Multitask Language Understanding (MMLU)"})," ",e.jsx("a",{href:"https://arxiv.org/pdf/2009.03300.pdf",className:"link",children:"(Hendrycks et al, 2020)"})," ","is a multiple-choice question answering test that covers 57 tasks including elementary mathematics, US history, computer science, law, and more. We publish evaluation results from evaluating various models on MMLU using HELM. 
Our evaluation results include the following:"]}),e.jsxs("ul",{className:"my-2 list-disc list-inside",children:[e.jsx("li",{children:"Simple, standardized prompts"}),e.jsx("li",{children:"Accuracy breakdown for each of the 57 subjects"}),e.jsx("li",{children:"Full transparency of all raw prompts and predictions"})]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://crfm.stanford.edu/2024/05/01/helm-mmlu.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsx("div",{className:"flex-1",children:e.jsx(E,{})})]})]})}const bt=""+new URL("air-overview-DpBbyagA.png",import.meta.url).href;function vt(){const s={fontVariant:"small-caps",fontWeight:"bold"},t=e.jsx("span",{style:s,children:"AIR-Bench 2024"});return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:t}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("img",{src:bt,alt:"AIR 2024 Categories",className:"mx-auto my-4 block w-3/4",sizes:"100vw"}),e.jsxs("p",{children:["We introduce ",t,", the first AI safety benchmark aligned with emerging government regulations and company policies, following the regulation-based safety categories grounded in our AI Risks study, AIR 2024. AIR 2024 decomposes 8 government regulations and 16 company policies into a four-tiered safety taxonomy with 314 granular risk categories in the lowest tier. ",t," contains 5,694 diverse prompts spanning these categories, with manual curation and human auditing to ensure quality. We evaluate leading language models on ",t,", uncovering insights into their alignment with specified safety concerns. By bridging the gap between public benchmarks and practical AI risks, ",t," ","provides a foundation for assessing model safety across jurisdictions, fostering the development of safer and more responsible AI systems."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2407.17436",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(E,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function wt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"ThaiExam"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://www.scbx.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"SCBX"})," ","and"," ",e.jsx("a",{href:"https://www.scb10x.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"SCB 10X"}),", we introduce the ThaiExam HELM leaderboard. ThaiExam is a Thai language benchmark based on examinations for high school students and investment professionals in Thailand. 
The ThaiExam leaderboard is the first public leaderboard for large language models on Thai language scenarios, and features evaluations of leading language models. Like all other HELM leaderboards, the ThaiExam leaderboard provides full prompt-level transparency, and the results can be fully reproduced using the HELM framework. We hope that this leaderboard will encourage further work in multilingual language model evaluation."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(b,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(E,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function yt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Finance"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("p",{className:"my-4",children:["LLMs show great potential for applications in the financial domain, yet there is a lack of financial-domain evaluations for LLMs. To help address this, we introduce the"," ",e.jsx("strong",{className:"font-bold",children:"HELM Finance"})," leaderboard. The HELM Finance leaderboard evaluates leading LLMs on three financial benchmarks (i.e.,"," ",e.jsx("a",{className:"underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://arxiv.org/abs/2109.00122/",children:"FinQA"}),","," ",e.jsx("a",{className:"underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://arxiv.org/abs/2311.11944/",children:"FinanceBench"}),","," ",e.jsx("a",{className:"underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://aclanthology.org/2020.nlp4convai-1.5/",children:"BANKING77"}),") that utilize real financial documents. Like all other HELM leaderboards, the HELM Finance leaderboard provides full prompt-level transparency, and the results can be fully reproduced using the open-source HELM framework. 
We hope that this leaderboard provides valuable insights for financial practitioners."]}),e.jsxs("p",{className:"my-4",children:["This leaderboard was produced through a collaboration with"," ",e.jsx("a",{className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://www.wellsfargo.com/",children:"Wells Fargo"}),", and was funded by the"," ",e.jsx("a",{className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://hai.stanford.edu/corporate-affiliate-program",children:"HAI Corporate Affiliate Program"}),"."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(b,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(E,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Nt=""+new URL("heim-logo-BJtQlEbV.png",import.meta.url).href;function Lt({metricFieldMap:s,metricGroups:t}){const a=new Set,n=[];return t.forEach(r=>{const l=[];r.metrics.forEach(i=>{const u=s[i.name];u&&(l.push(u),a.add(u.name))}),l.length>0&&n.push([r,l])}),e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[a.size," metrics"]}),e.jsx("ul",{children:n.map(([r,l])=>e.jsxs("li",{className:"my-3",children:[e.jsx("h4",{children:r.display_name}),e.jsx("ul",{className:"list-disc list-inside",children:l.map(i=>e.jsx("li",{className:"ml-4",children:i.display_name},i.name))})]},r.name))})]})}function Mt(){const[s,t]=o.useState(void 0);o.useEffect(()=>{const n=new AbortController;async function r(){const l=await F(n.signal);t(l)}return r(),()=>n.abort()},[]);const a=s?s.metrics.reduce((n,r)=>(n[r.name]=r,n),{}):void 0;return e.jsxs("div",{className:"container mx-auto px-16 text-base",children:[e.jsx("div",{className:"container max-w-screen-lg mx-auto",children:e.jsx("img",{className:"mx-auto w-96",src:Nt,alt:"HEIM Logo"})}),e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Holistic Evaluation of Text-To-Image Models"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://arxiv.org/abs/2311.04287",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{className:"my-2",children:["Significant effort has recently been made in developing text-to-image generation models, which take textual prompts as input and generate images. As these models are widely used in real-world applications, there is an urgent need to comprehensively understand their capabilities and risks. However, existing evaluations primarily focus on image-text alignment and image quality. 
To address this limitation, we introduce a new benchmark,"," ",e.jsx("strong",{children:"Holistic Evaluation of Text-To-Image Models (HEIM)"}),"."]}),e.jsx("p",{className:"my-2",children:"We identify 12 different aspects that are important in real-world model deployment, including:"}),e.jsxs("ul",{className:"my-2 list-disc list-inside unreset",children:[e.jsx("li",{children:"image-text alignment"}),e.jsx("li",{children:"image quality"}),e.jsx("li",{children:"aesthetics"}),e.jsx("li",{children:"originality"}),e.jsx("li",{children:"reasoning"}),e.jsx("li",{children:"knowledge"}),e.jsx("li",{children:"bias"}),e.jsx("li",{children:"toxicity"}),e.jsx("li",{children:"fairness"}),e.jsx("li",{children:"robustness"}),e.jsx("li",{children:"multilinguality"}),e.jsx("li",{children:"efficiency"})]}),e.jsx("p",{className:"my-2",children:"By curating scenarios encompassing these aspects, we evaluate state-of-the-art text-to-image models using this benchmark. Unlike previous evaluations that focused on alignment and quality, HEIM significantly improves coverage by evaluating all models across all aspects. Our results reveal that no single model excels in all aspects, with different models demonstrating strengths in different aspects."}),e.jsx("p",{className:"my-2",children:"For full transparency, this website contains all the prompts, generated images and the results for the automated and human evaluation metrics."}),e.jsx("p",{className:"my-2",children:"Inspired by HELM, we decompose the model evaluation into four key components: aspect, scenario, adaptation, and metric:"}),e.jsx("div",{className:"container max-w-screen-lg mx-auto my-8",children:e.jsx("img",{src:"https://crfm.stanford.edu/heim/latest/images/heim-main.png",alt:"HEIM scenarios, prompts, images and metrics"})}),s&&a?e.jsxs("div",{className:"grid grid-cols-1 md:grid-cols-3 gap-8",children:[e.jsx($e,{models:s.models}),e.jsx(De,{runGroups:s.run_groups}),e.jsx(Lt,{metricFieldMap:a,metricGroups:s.metric_groups})]}):null]})}const Et=""+new URL("vhelm-framework-NxJE4fdA.png",import.meta.url).href,St=""+new URL("vhelm-model-ypCL5Yvq.png",import.meta.url).href,kt=""+new URL("vhelm-aspects-NiDQofvP.png",import.meta.url).href;function Rt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Holistic Evaluation of Vision-Language Models"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://arxiv.org/abs/2410.07112",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"#/leaderboard",children:"Leaderboard"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{className:"my-4",children:["Current benchmarks for assessing vision-language models (VLMs) often focus on their perception or problem-solving capabilities and neglect other critical aspects such as fairness, multilinguality, or toxicity. Furthermore, they differ in their evaluation procedures and the scope of the evaluation, making it difficult to compare models. To address these issues, we extend the HELM framework to VLMs to present the Holistic Evaluation of Vision Language Models (VHELM). To address these issues, we introduce VHELM, built on HELM for language models. 
VHELM aggregates various datasets to cover one or more of the 9 aspects:"," ",e.jsx("b",{children:"visual perception"}),", ",e.jsx("b",{children:"bias"}),", ",e.jsx("b",{children:"fairness"}),", ",e.jsx("b",{children:"knowledge"}),", ",e.jsx("b",{children:"multilinguality"}),", ",e.jsx("b",{children:"reasoning"}),", ",e.jsx("b",{children:"robustness"}),","," ",e.jsx("b",{children:"safety"}),", and ",e.jsx("b",{children:"toxicity"}),". In doing so, we produce a comprehensive, multi-dimensional view of the capabilities of the VLMs across these important factors. In addition, we standardize the standard inference parameters, methods of prompting, and evaluation metrics to enable fair comparisons across models. Our framework is designed to be lightweight and automatic so that evaluation runs are cheap and fast. For transparency, we release the raw model generations and complete results on this website."]}),e.jsx("p",{className:"my-4 font-bold",children:"VHELM is intended to be a living benchmark. We hope to continue adding new datasets, models and metrics over time, so please stay tuned!"}),e.jsxs("div",{className:"my-16 flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("img",{src:St,alt:"A vision-lanuage model (VLM) takes in an image and a text prompt and generates text.",className:""}),e.jsx("img",{src:Et,alt:"An example of an evaluation for an Aspect (Knowledge) - a Scenario (MMMU) undergoes Adaptation (multimodal multiple choice) for a Model (GPT-4 Omni), then Metrics (Exact match) are computed",className:""})]}),e.jsxs("div",{className:"flex-1",children:[e.jsx(E,{}),e.jsx(b,{to:"leaderboard",className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})]})]}),e.jsx("div",{className:"container max-w-screen-lg mx-auto my-8",children:e.jsx("img",{src:kt,alt:"An example of each aspect in VHELM: Visual Perception, Bias, Fairness, Knowledge, Multilinguality, Reasoning, Robustness, Toxicity Mitigation and Safety. ",className:""})})]})}function At(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Call Center Leaderboard"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("p",{className:"my-4",children:["LLMs show great potential for applications for the call center, yet there is a lack of domain-specific and ecologically-valid evaluations in this domain. To address this, we introduce the"," ",e.jsx("strong",{className:"font-bold",children:"HELM Call Center leaderboard"}),". The HELM Call Center leaderboard evaluates leading LLMs on a summarization task over a dataset of real helpdesk call transcripts provided by Accenture. The quality of the summaries is evaluated using LLM-as-judge with an ensemble of 3 models. 
We hope that this leaderboard provides some initial insights into the potential of LLMs in this domain."]}),e.jsxs("p",{className:"my-4",children:["This leaderboard was produced through research collaboration with"," ",e.jsx("a",{className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://www.accenture.com/",children:"Accenture"}),", and was funded by the"," ",e.jsx("a",{className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://hai.stanford.edu/corporate-affiliate-program",children:"HAI Corporate Affiliate Program"}),"."]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(E,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function Ct(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Call Transcript Summarization Leaderboard"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-[3] text-l",children:[e.jsx("img",{src:"https://storage.googleapis.com/crfm-helm-public/assets/images/call-center-banner-narrow.jpg",className:"shadow-xl",width:"1200",height:"400"}),e.jsxs("p",{className:"my-4",children:["Large language models (LLMs) show great potential for call center applications, yet there is a lack of domain-specific and ecologically valid evaluations in this domain. To address this, we introduce the"," ",e.jsx("strong",{className:"font-bold",children:"HELM Call Transcript Summarization"})," ","leaderboard, which evaluates leading LLMs on a summarization task over a dataset of real call transcripts provided by Accenture."]}),e.jsx("p",{className:"my-4",children:"This dataset consists of 162 transcribed calls to an internal corporate IT helpdesk. The calls were transcribed using an automatic speech recognition (ASR) model. Transcription errors were deliberately left uncorrected to reflect the nature of real-life transcripts. The transcripts were anonymized using a semi-automated process with human verification."}),e.jsx("p",{className:"my-4",children:"To evaluate the LLMs, summaries of the transcripts were generated using 17 LLMs. The quality of the generated summaries were then evaluated using LLM-as-judge with an ensemble of 3 models."}),e.jsx("p",{className:"my-4",children:"As with all HELM leaderboards, this leaderboard provides full transparency into all LLM requests and responses, and the results are reproducible using the HELM open source framework. 
We hope that this leaderboard offers initial insights into the potential of LLMs for this task."}),e.jsxs("p",{className:"my-4",children:["This leaderboard was produced through research collaboration with"," ",e.jsx("a",{className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://www.accenture.com/",children:"Accenture"}),", and was funded by the"," ",e.jsx("a",{className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://hai.stanford.edu/corporate-affiliate-program",children:"HAI Corporate Affiliate Program"}),"."]})]}),e.jsxs("div",{className:"flex-[2] py-2 rounded-3xl bg-gray-100 h-full",children:[e.jsx(E,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function _t(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Chinese Language Models EVAluation Platform (CLEVA)"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("p",{children:["In collaboration with the"," ",e.jsx("a",{href:"https://lwwangcse.github.io/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"LaVi Lab"})," ","team from"," ",e.jsx("a",{href:"https://www.cuhk.edu.hk/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"The Chinese University of Hong Kong (CUHK)"}),", we introduce the"," ",e.jsx("strong",{className:"font-bold",children:"Chinese Language Models EVAluation Platform (CLEVA)"})," ","leaderboard on HELM. CLEVA is a comprehensive Chinese-language benchmark for holistic evaluation of Chinese-language LLMs, and employs a standardized workflow to assess LLMs' performance across various dimensions."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2308.04813",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(E,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function Tt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"The Mighty ToRR: A Benchmark for Table Reasoning and Robustness"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("p",{className:"mb-4 italic",children:["This leaderboard is a collaboration with"," ",e.jsx("a",{href:"https://research.ibm.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"IBM Research"}),"."]}),e.jsxs("p",{className:"my-4",children:["Despite its real-world significance, model performance on tabular data remains underexplored, leaving uncertainty about which model to rely on and which prompt configuration to adopt. 
To address this gap, we create ",e.jsx("strong",{className:"font-bold",children:"ToRR"}),", a benchmark for Table Reasoning and Robustness, a varied benchmark that measures model performance and robustness on table-related tasks. The benchmark includes 10 datasets that cover different types of table reasoning capabilities across varied domains. ToRR goes beyond model performance rankings, and is designed to reflect whether models can handle tabular data consistently and robustly, across a variety of common table representation formats. We present a leaderboard as well as comprehensive analyses of the results of leading models over ToRR. Our results reveal a striking pattern of brittle model behavior, where even strong models are unable to perform robustly on tabular data tasks. Although no specific table format leads to consistently better performance, we show that testing over multiple formats is crucial for reliably estimating model capabilities. Moreover, we show that the reliability boost from testing multiple prompts can be equivalent to adding more test examples. Overall, our findings show that reasoning over table tasks remains a significant challenge."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4 hidden",href:"#",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(E,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Pt=({id:s,title:t,text:a})=>((t==="Classic"||t==="Lite"||t==="Instruct")&&(t="HELM "+t),e.jsx("div",{className:"max-w-sm rounded overflow-hidden bg-gray-100 hover:scale-105 transition-transform duration-300",children:e.jsx("a",{href:oe(void 0,s),children:e.jsxs("div",{className:"px-6 py-4",children:[e.jsxs("div",{className:"font-bold text-xl mb-2",children:[e.jsx("div",{className:"py-3",children:e.jsx("svg",{fill:"#000000",width:"20px",height:"20px",viewBox:"0 0 24 24",xmlns:"http://www.w3.org/2000/svg",children:e.jsx("path",{d:"M22,7H16.333V4a1,1,0,0,0-1-1H8.667a1,1,0,0,0-1,1v7H2a1,1,0,0,0-1,1v8a1,1,0,0,0,1,1H22a1,1,0,0,0,1-1V8A1,1,0,0,0,22,7ZM7.667,19H3V13H7.667Zm6.666,0H9.667V5h4.666ZM21,19H16.333V9H21Z"})})}),t+" →"]}),e.jsx("p",{className:"text-gray-700 text-base",children:a})]})})}));function It(){const[s,t]=o.useState();return o.useEffect(()=>{fetch("https://crfm.stanford.edu/helm/project_metadata.json").then(a=>a.json()).then(a=>{t(a)}).catch(a=>{console.error("Error fetching JSON:",a)})},[]),e.jsx("div",{className:"p-10 mb-20",children:e.jsx("div",{className:"grid grid-cols-2 lg:grid-cols-3 gap-4",children:s&&s.map((a,n)=>a.id==="home"?null:e.jsx(Pt,{id:a.id,title:a.title,text:a.description},n))})})}function Ht(){return e.jsxs("div",{className:"flex flex-col md:flex-row px-6 py-32",children:[e.jsxs("div",{className:"flex-1 p-4 flex flex-col justify-center",children:[e.jsx("div",{className:"flex justify-start",children:e.jsxs("div",{children:[e.jsx("h1",{className:"text-4xl mb-4 mx-4 mt-2",children:e.jsx("strong",{children:"A reproducible and transparent framework for evaluating foundation models."})}),e.jsx("h3",{className:`text-xl
+ mb-4 mx-4 mt-2`,children:"Find leaderboards with many scenarios, metrics, and models with support for multimodality and model-graded evaluation."})]})}),e.jsxs("div",{className:"flex flex-col md:flex-row justify-start mt-6 ml-4",children:[e.jsx("button",{className:"px-6 btn btn-grey rounded-md mb-4 md:mb-0",onClick:()=>window.scrollTo({top:760,behavior:"smooth"}),children:e.jsx("div",{children:"Leaderboards ↓"})}),e.jsx("button",{className:"px-6 btn btn-grey rounded-md md:ml-4",children:e.jsx("a",{href:"https://github.com/stanford-crfm/helm",children:"Github"})})]})]}),e.jsx("div",{className:"mx-4 mt-6 md:mt-0 md:w-1/3",children:e.jsx("img",{src:Oe,alt:"HELM Hero",className:"object-cover w-full h-full"})})]})}function $t(){const[s,t]=o.useState(void 0);return o.useEffect(()=>{const a=new AbortController;async function n(){const r=await F(a.signal);t(r)}return n(),()=>a.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(Ht,{}),e.jsx("div",{className:"container py-5 mx-auto text-lg",children:e.jsx("div",{className:"flex flex-col sm:flex-row justify-center mb-10 flex sm:gap-8 md:gap-32",children:e.jsx("h1",{className:"text-4xl mx-4 ",children:e.jsx("strong",{children:"HELM Leaderboards"})})})}),e.jsx(It,{})]}):null}const Dt=""+new URL("overview-BwypNWnk.png",import.meta.url).href,Ot=""+new URL("process-flow-DWDJC733.png",import.meta.url).href;function Ft(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Image2Struct: A Benchmark for Evaluating Vision-Language Models in Extracting Structured Information from Images"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-2 md:gap-8 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://arxiv.org/abs/2410.22456",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-latex",children:"Latex dataset"}),e.jsx("a",{className:"px-5 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-webpage",children:"Webpage dataset"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-musicsheet",children:"Music sheet dataset"})]}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsxs("p",{children:[e.jsx("strong",{children:"Image2Struct"})," is a benchmark for evaluating vision-language models in practical tasks of extracting structured information from images."]}),e.jsx("br",{}),e.jsx("p",{children:"In our tasks, VLMs are prompted to generate the underlying structured information (i.e., code) from an input image. The code can be compiled, and the output image is evaluated against the input image to produce a score. This round-trip evaluation allows us to quantitatively evaluate VLMs on complex tasks with multiple correct answers. 
We create a pipeline that downloads fresh, user-submitted data from active online communities upon execution, evaluates the VLMs shortly, and produces a leaderboard."}),e.jsx("br",{}),e.jsx("img",{src:Dt,alt:"Evaluation flowchart",className:"mx-auto block w-full",sizes:"100vw"}),e.jsx("br",{}),e.jsx("p",{children:"We introduce 3 tasks:"}),e.jsxs("ul",{className:"my-2 list-disc list-inside",children:[e.jsx("li",{children:"LaTex: equations, tables, plots and algorithms from ArXiV papers."}),e.jsx("li",{children:"Webpages: pages from GitHub written in HTML, CSS and Javascript. ..."}),e.jsx("li",{children:"Music sheets: crops of measures from IMSLP music sheets."})]}),e.jsx("div",{className:"flex flex-row justify-center mt-8",children:e.jsx("a",{className:"px-10 btn rounded-md",href:"#/leaderboard",children:"Full Leaderboard"})})]}),e.jsx("div",{className:"flex-1",children:e.jsx(E,{numRowsToDisplay:12})})]}),e.jsx("br",{}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("p",{children:"We provide an automated process for collecting new fresh data from online communities, evaluating the models and producing a leaderboard. The pipeline is designed to be executed on a regular basis to keep the leaderboard up-to-date."}),e.jsx("br",{}),e.jsxs("p",{children:["In addition to the automated data collection, we also provide a"," ",e.jsx("i",{children:"wild"})," subset for the LaTeX and webpage tasks that are collected from Wikipedia and various popular websites. These instances do not have a corresponding code, and the evaluation is done by our proposed metric: block EMD (Earth Mover Distance)."]})]}),e.jsx("div",{className:"flex-1",children:e.jsx("img",{src:Ot,alt:"7 scenarios, 4 models, 4 evaluators and 5 criteria",className:"mx-auto block w-full",sizes:"200vw"})})]})]})}function Bt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Elements of World Knowledge (EWoK)"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("p",{children:["We present the"," ",e.jsx("a",{className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://arxiv.org/abs/2405.09605",children:"Elements of World Knowledge (EWoK)"})," ","leaderboard in collaboration with the EWoK team. EWoK is a benchmark for evaluating world modeling in language models by testing their ability to use knowledge of a concept to match a target text with a plausible/implausible context. 
EWoK targets specific concepts from multiple knowledge domains known to be vital for world modeling in humans, including social interactions and spatial relations."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2405.09605",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(E,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const zt=""+new URL("medhelm-overview-CND0EIsy.png",import.meta.url).href;function Ut(){const s=e.jsx("strong",{className:"font-bold",children:"MedHELM"});return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-4 font-bold text-center",children:"MedHELM"}),e.jsx("p",{className:"text-xl my-4 italic text-center",children:"Holistic Evaluation of Large Language Models for Medical Tasks"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-[7] text-l",children:[e.jsxs("p",{className:"my-",children:["We introduce ",s,", an extensible evaluation framework for assessing LLM performance for medical tasks. Building on the HELM framework, ",s," comprises a structured taxonomy with 5 categories, 22 subcategories, and 121 distinct clinical tasks as well as 35 distinct benchmarks (14 private, 7 gated-access, and 14 public). The benchmarks represent a spectrum of healthcare scenarios, from diagnostic decision-making to patient communication, providing a more nuanced and medically relevant assessment of AI capabilities in healthcare settings."]}),e.jsx("img",{src:zt,alt:"MedHELM Task Categories",className:"mx-auto my-4 block w-7/8",sizes:"100vw"}),e.jsxs("p",{className:"my-4",children:[s," establishes a foundation for testing and evaluation of the real-world applicability of language models in healthcare. It is made possible by a unique collaboration between the Center for Research on Foundation Models, Technology and Digital Solutions at Stanford Healthcare, and Microsoft Healthcare and Life Sciences in partnership with faculty in the Departments of Medicine, Computer Science, Anesthesiology, Dermatology, Pediatrics and Biomedical Data Science as well as trainees from the MCiM program at the Clinical Excellence Research Center. 
The effort is coordinated by the Center for Biomedical Informatics Research."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2505.23802",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://crfm-helm.readthedocs.io/en/latest/medhelm/",children:"Documentation"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://hai.stanford.edu/news/holistic-evaluation-of-large-language-models-for-medical-applications",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Leaderboard"})]})]}),e.jsxs("div",{className:"flex-[3] py-2 rounded-3xl bg-gray-100 h-full",children:[e.jsx(E,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Wt=""+new URL("medhelm-v1-overview-Cu2tphBB.png",import.meta.url).href;function qt(){const s=e.jsx("strong",{className:"font-bold",children:"MedHELM"});return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-4 font-bold text-center",children:"MedHELM"}),e.jsx("p",{className:"text-xl my-4 italic text-center",children:"Medical and AI experts build a benchmark for evaluation of LLMs grounded in real-world healthcare needs."}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("p",{className:"my-4",children:["We introduce ",s,", a comprehensive healthcare benchmark to evaluate language models on real-world clinical tasks using real electronic health records. Building on the HELM framework,"," ",s," comprises a structured taxonomy with 5 categories, 22 subcategories, and 121 distinct clinical tasks as well as 31 diverse datasets (12 private, 6 gated-access, and 13 public). The datasets represent a spectrum of healthcare scenarios, from diagnostic decision-making to patient communication, providing a more nuanced and clinically relevant assessment of AI capabilities in healthcare settings."]}),e.jsx("img",{src:Wt,alt:"MedHELM Task Categories",className:"mx-auto my-4 block w-7/8",sizes:"100vw"}),e.jsxs("p",{className:"my-4",children:[s," establishes a foundation for testing and evaluation of the real-world applicability of language models in healthcare. It is made possible by a unique collaboration between the Center for Research on Foundation Models, Technology and Digital Solutions at Stanford Healthcare, and Microsoft Healthcare and Life Sciences in partnership with faculty in the Departments of Medicine, Computer Science, Anesthesiology, Dermatology, Pediatrics and Biomedical Data Science as well as trainees from the MCiM program at the Clinical Excellence Research Center. 
The effort is coordinated by the Center for Biomedical Informatics Research."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2505.23802",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://crfm-helm.readthedocs.io/en/latest/medhelm/",children:"Documentation"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://hai.stanford.edu/news/holistic-evaluation-of-large-language-models-for-medical-applications",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(E,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Vt=""+new URL("helm-safety-COfndXuS.png",import.meta.url).href;function Jt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"HELM Safety"}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("img",{src:Vt,alt:"Logo",className:"mx-auto p-0 block",style:{width:"300px"}}),e.jsx("p",{children:"Language models demonstrate powerful capabilities and pose significant risks. Given their widespread deployment, standardized public benchmarking of such models is vital. While language models are routinely evaluated on standard capability benchmarks, comparable standardization for benchmarking safety risks lags behind. To address this gap, we introduce HELM-Safety as a collection of 5 safety benchmarks that span 6 risk categories (e.g. violence, fraud, discrimination, sexual, harassment, deception). We present evaluation results for recent leading open weights and closed models."}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://crfm.stanford.edu/2024/11/08/helm-safety.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsx("div",{className:"flex-1",children:e.jsx(E,{})})]})]})}function Gt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Capabilities"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("p",{children:["Building on the HELM framework, we introduce"," ",e.jsx("strong",{className:"font-bold",children:"HELM Capabilities"})," to capture our latest thinking on the evaluation of general capabilities. HELM Capabilities is a new benchmark and leaderboard that consists of a curated set of scenarios for measuring various capabilities of language models. 
Like all other HELM leaderboards, the HELM Capabilities leaderboard provides full prompt-level transparency, and the results can be fully reproduced using the HELM framework."]}),e.jsxs("div",{className:"flex flex-row justify-center my-4",children:[e.jsx("a",{href:"https://crfm.stanford.edu/2025/03/20/helm-capabilities.html",className:"px-10 btn rounded-md mx-4",children:"Blog Post"}),e.jsx(b,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(E,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function Qt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"MMLU-Winogrande-Afr: Clinical MMLU and Winogrande in 11 low-resource African languages"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("p",{className:"mb-4 italic",children:["This leaderboard is a collaboration with"," ",e.jsx("a",{href:"https://ghamut.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"Ghamut Corporation"})," ","and the"," ",e.jsx("a",{href:"https://www.gatesfoundation.org/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"Gates Foundation"}),"."]}),e.jsx("p",{className:"my-4",children:"Large Language Models (LLMs) have shown remarkable performance across various tasks, yet significant disparities remain for non-English languages, and especially native African languages. This paper addresses these disparities by creating approximately 1 million human-translated words of new benchmark data in 8 low-resource African languages, covering a population of over 160 million speakers of: Amharic, Bambara, Igbo, Sepedi (Northern Sotho), Shona, Sesotho (Southern Sotho), Setswana, and Tsonga. Our benchmarks are translations of Winogrande and three sections of MMLU: college medicine, clinical knowledge, and virology. Using the translated benchmarks, we report previously unknown performance gaps between state-of-the-art (SOTA) LLMs in English and African languages. 
The publicly available benchmarks, translations, and code from this study support further research and development aimed at creating more inclusive and effective language technologies."}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2412.12417",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(E,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function Kt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"SEA-HELM: Southeast Asian Holistic Evaluation of Language Models"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("p",{className:"mb-4 italic",children:["This leaderboard is a collaboration with"," ",e.jsx("a",{href:"https://aisingapore.org/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"AI Singapore"}),"."]}),e.jsxs("p",{className:"my-4",children:["With the rapid emergence of novel capabilities in Large Language Models (LLMs), the need for rigorous multilingual and multicultural benchmarks that are integrated has become more pronounced. Though existing LLM benchmarks are capable of evaluating specific capabilities of LLMs in English as well as in various mid- to low-resource languages, including those in the Southeast Asian (SEA) region, a comprehensive and authentic evaluation suite for the SEA languages has not been developed thus far. Here, we present"," ",e.jsx("strong",{className:"font-bold",children:"SEA-HELM"}),", a holistic linguistic and cultural LLM evaluation suite that emphasizes SEA languages, comprising five core pillars: (1) NLP Classics, (2) LLM-specifics, (3) SEA Linguistics, (4) SEA Culture, (5) Safety. SEA-HELM currently supports Filipino, Indonesian, Tamil, Thai, and Vietnamese. 
We also introduce the SEA-HELM leaderboard, which allows users to understand models' multilingual and multicultural performance in a systematic and user-friendly manner."]}),e.jsxs("p",{className:"mb-4 italic",children:["Additional evaluation results are available on the external"," ",e.jsx("a",{href:"https://leaderboard.sea-lion.ai/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"AI Singapore (AISG) SEA-HELM leaderboard"}),"."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2502.14301",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"HELM Leaderboard"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://leaderboard.sea-lion.ai/",children:"AISG Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(E,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function Zt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Speech"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("p",{className:"my-4",children:"We present a HELM leaderboard for speech tasks."}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(E,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function Xt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-4 font-bold text-center",children:"HELM Long Context"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-[1] text-l",children:[e.jsx("p",{className:"my-4",children:"Recent Large Language Models (LLMs) support processing long inputs with hundreds of thousands or millions of tokens. Long context capabilities are important for many real-world applications, such as processing long text documents, conducting long conversations or following complex instructions. However, support for long inputs does not equate to strong long context capabilities. As such, there is a need for rigorous and comprehensive evaluations of long context capabilities."}),e.jsxs("p",{className:"my-4",children:["To address this, we introduce the"," ",e.jsx("strong",{className:"font-bold",children:"HELM Long Context"})," ","leaderboard, which provides transparent, comparable and reproducible evaluations of long context capabilities of recent models. 
The benchmark consists of 5 tasks:"]}),e.jsxs("ul",{className:"list-disc pl-6",children:[e.jsxs("li",{children:[e.jsx("a",{className:"underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"#/leaderboard/ruler_squad",children:e.jsx("strong",{className:"font-bold",children:"RULER SQuAD"})})," ","— open ended single-hop question answering on passages"]}),e.jsxs("li",{children:[e.jsx("a",{className:"underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"#/leaderboard/ruler_hotpotqa",children:e.jsx("strong",{className:"font-bold",children:"RULER HotPotQA"})})," ","— open ended multi-hop question answering on passages"]}),e.jsxs("li",{children:[e.jsx("a",{className:"underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"#/leaderboard/infinite_bench_en_qa",children:e.jsx("strong",{className:"font-bold",children:"∞Bench En.MC"})})," ","— multiple choice question answering based on the plot of a novel"]}),e.jsxs("li",{children:[e.jsx("a",{className:"underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"#/leaderboard/infinite_bench_en_sum",children:e.jsx("strong",{className:"font-bold",children:"∞Bench En.Sum"})})," ","— summarization of the plot of a novel"]}),e.jsxs("li",{children:[e.jsx("a",{className:"underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"#/leaderboard/openai_mrcr",children:e.jsx("strong",{className:"font-bold",children:"OpenAI MRCR"})})," ","— multi-round co-reference resolution on a long, multi-turn, synthetic conversation"]})]}),e.jsx("p",{className:"my-4",children:"The results demonstrate that even though significant progress has been made on long context capabilities, there is still considerable room for improvement."}),e.jsx("p",{className:"my-4",children:"As with all HELM leaderboards, this leaderboard provides full transparency into all LLM requests and responses, and the results are reproducible using the HELM open source framework."}),e.jsxs("p",{className:"my-4",children:["This leaderboard was produced through research collaboration with"," ",e.jsx("a",{className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://www.lvmh.com/",children:"LVMH"}),", and was funded by the"," ",e.jsx("a",{className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://hai.stanford.edu/corporate-affiliate-program",children:"HAI Corporate Affiliate Program"}),"."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://crfm.stanford.edu/2025/09/29/helm-long-context.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"flex-[1] py-2 rounded-3xl bg-gray-100 h-full",children:[e.jsx(E,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function Yt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM SQL"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("p",{className:"mb-4",children:["Text-to-SQL is the task of converting natural language instructions into SQL code. 
There has been increasing interest in text-to-SQL for applications by data scientists in various domains. Thus, we introduce the ",e.jsx("strong",{className:"font-bold",children:"HELM SQL"})," ","leaderboard for text-to-SQL evaluations. The HELM SQL leaderboard evaluates leading LLMs on two existing text-to-SQL benchmarks (Spider, BIRD-SQL) that cover a range of professional domains. In addition, we introduce a new benchmark,"," ",e.jsx("strong",{className:"font-bold",children:"CzechBankQA"}),", a text-to-SQL benchmark based on a real public bank customer relational database, to address the lack of coverage of text-to-SQL in the financial domain. CzechBankQA consists of text-to-SQL queries and gold labels provided by professionals at Wells Fargo. We hope that this leaderboard provides useful insights for data science practitioners."]}),e.jsxs("p",{className:"my-4",children:["This leaderboard was produced through research collaboration with"," ",e.jsx("a",{className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://www.wellsfargo.com/",children:"Wells Fargo"}),", and was funded by the"," ",e.jsx("a",{className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://hai.stanford.edu/corporate-affiliate-program",children:"HAI Corporate Affiliate Program"}),"."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4 hidden",href:"#",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(E,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function ea(){const s=e.jsx("strong",{className:"font-bold",children:"ViLLM"});return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"ViLLM: Crossing Linguistic Horizon"}),e.jsxs("p",{className:"text-xl my-4 italic text-center",children:[s," is a comprehensive benchmark suite for evaluating the performance of language models in ",e.jsx("strong",{children:"Vietnamese"}),"."]}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("p",{className:"my-4",children:["As multilingual large language models (LLMs) continue to advance natural language processing, bridging communication across diverse cultures and languages, their effectiveness in lower-resourced languages like Vietnamese remains limited. Despite being trained on large multilingual corpora, most open-source LLMs struggle with Vietnamese understanding and generation.",e.jsx("strong",{children:" ViLLM"})," addresses this gap by providing a robust evaluation framework tailored specifically for Vietnamese. 
It includes ",e.jsx("strong",{children:"11 essential scenarios"}),", each targeting a core capability of Vietnamese LLMs:"]}),e.jsxs("p",{className:"my-4",children:[e.jsx("strong",{children:"ViLLM"})," includes 11 carefully designed evaluation scenarios, each addressing a core language modeling capability:",e.jsxs("ul",{className:"list-disc list-inside mt-2",children:[e.jsxs("li",{children:[e.jsx("strong",{children:"Question Answering:"})," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/juletxara/xquad_xtreme",children:"XQuAD"}),","," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/facebook/mlqa",children:"MLQA"})]}),e.jsxs("li",{children:[e.jsx("strong",{children:"Summarization:"})," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/Yuhthe/vietnews",children:"VietNews"}),","," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/GEM/wiki_lingua",children:"WikiLingua"})]}),e.jsxs("li",{children:[e.jsx("strong",{children:"Sentiment Analysis:"})," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/ura-hcmut/vlsp2016",children:"VLSP2016"}),","," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/ura-hcmut/UIT-VSFC",children:"UiT-VSFC"})]}),e.jsxs("li",{children:[e.jsx("strong",{children:"Text Classification:"})," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/ura-hcmut/PhoATIS",children:"PhoATIS"}),","," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/ura-hcmut/UIT-VSMEC",children:"UiT-VSMEC"})]}),e.jsxs("li",{children:[e.jsx("strong",{children:"Knowledge:"})," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/ura-hcmut/zalo_e2eqa",children:"ZaloE2E"}),","," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/ura-hcmut/ViMMRC",children:"UiT-ViMMRC"})]}),e.jsxs("li",{children:[e.jsx("strong",{children:"Toxic Detection:"})," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/ura-hcmut/UIT-ViHSD",children:"UiT-VIHSD"}),","," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/tarudesu/ViCTSD",children:"UiT-ViCTSD"})]}),e.jsxs("li",{children:[e.jsx("strong",{children:"Information Retrieval:"})," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/unicamp-dl/mmarco",children:"mMARCO"}),","," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/unicamp-dl/mrobust",children:"mRobust04"})]}),e.jsxs("li",{children:[e.jsx("strong",{children:"Language Modeling:"})," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/ura-hcmut/MLQA",children:"MLQA"}),","," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/ura-hcmut/VSEC",children:"VSEC"})]}),e.jsxs("li",{children:[e.jsx("strong",{children:"Reasoning:"})," ",e.jsx("a",{className:"link-primary",href:"",children:"Synthetic reasoning"}),","," ",e.jsx("a",{className:"link-primary",href:"",children:"Natural synthetic reasoning"})]}),e.jsxs("li",{children:[e.jsx("strong",{children:"Mathematic:"})," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/ura-hcmut/Vietnamese-MATH",children:"MATH"})]}),e.jsxs("li",{children:[e.jsx("strong",{children:"Translation:"})," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/vietgpt/opus100_envi",children:"OPUS100"}),","," 
",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/ura-hcmut/PhoMT",children:"PhoMT"})]})]})]}),e.jsxs("p",{className:"my-4",children:[e.jsx("strong",{children:"ViLLM"})," also includes tools to promote the ethical and responsible use of language models:",e.jsxs("ul",{className:"list-disc list-inside mt-2",children:[e.jsxs("li",{children:[e.jsx("strong",{children:"Bias Assessment:"})," Detects and mitigates biased patterns in model outputs."]}),e.jsxs("li",{children:[e.jsx("strong",{children:"Toxicity Assessment:"})," Monitors and controls the generation of harmful or offensive content."]}),e.jsxs("li",{children:[e.jsx("strong",{children:"Fairness Evaluation:"})," Ensures equitable performance across demographic groups and languages."]}),e.jsxs("li",{children:[e.jsx("strong",{children:"Robustness Analysis:"})," Evaluates model stability against noisy or adversarial inputs in real-world scenarios."]})]})]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://aclanthology.org/2024.findings-naacl.182",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(E,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function sa(){const s=e.jsx("strong",{className:"font-bold",children:"SLPHelm"});return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"SLPHelm: Advancing Speech Language Processing"}),e.jsxs("p",{className:"text-xl my-4 italic text-center",children:[s," is a comprehensive benchmark suite for evaluating the performance of speech language models across multiple languages and tasks."]}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("p",{className:"my-4",children:["As speech language models continue to evolve and transform how we interact with technology, there is a growing need for standardized evaluation frameworks that can assess their capabilities across diverse languages and tasks. 
Despite significant advances in speech processing, many models struggle with multilingual understanding, accent recognition, and domain-specific speech tasks."," ",e.jsx("strong",{children:"SLPHelm"})," addresses these challenges by providing a robust evaluation framework that includes",e.jsx("strong",{children:" 5 key scenarios"})," across ",e.jsx("strong",{children:"15 models"}),", each targeting essential speech processing capabilities:"]}),e.jsxs("p",{className:"my-4",children:[e.jsx("strong",{children:"SLPHelm"})," includes 5 carefully designed evaluation scenarios, each addressing a core speech processing capability:",e.jsxs("ul",{className:"list-disc list-inside mt-2",children:[e.jsx("li",{children:e.jsx("strong",{children:"Disorder Diagnosis"})}),e.jsx("li",{children:e.jsx("strong",{children:"Transcription Accuracy"})}),e.jsx("li",{children:e.jsx("strong",{children:"Disorder Type Diagnosis"})}),e.jsx("li",{children:e.jsx("strong",{children:"Disorder Symptom Diagnosis"})}),e.jsx("li",{children:e.jsx("strong",{children:"Disorder Diagnosis via Transcription"})})]})]}),e.jsxs("p",{className:"my-4",children:[e.jsx("strong",{children:"SLPHelm"})," evaluates models using comprehensive datasets:",e.jsxs("ul",{className:"list-disc list-inside mt-2",children:[e.jsxs("li",{children:[e.jsx("strong",{children:"UltraSuite:"})," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/SAA-Lab/SLPHelmUltraSuite",children:"UltraSuite"})]}),e.jsxs("li",{children:[e.jsx("strong",{children:"ENNI:"})," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/SAA-Lab/SLPHelmDataset/tree/main/ENNI",children:"ENNI"})]}),e.jsxs("li",{children:[e.jsx("strong",{children:"LeNormand:"})," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/SAA-Lab/SLPHelmDataset/tree/main/LeNormand",children:"LeNormand"})]}),e.jsxs("li",{children:[e.jsx("strong",{children:"PERCEPT-GFTA:"})," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/SAA-Lab/SLPHelmDataset/tree/main/PERCEPT-GFTA",children:"PERCEPT-GFTA"})]}),e.jsxs("li",{children:[e.jsx("strong",{children:"UltraSuite w/ Manual Labels:"})," ",e.jsx("a",{className:"link-primary",href:"https://huggingface.co/datasets/SAA-Lab/SLPHelmManualLabels",children:"SLPHelmManualLabels"})]})]})]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(E,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const ta=""+new URL("audio-table-Dn5NMMeJ.png",import.meta.url).href;function aa(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Holistic Evaluation of Audio-Language Models"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://arxiv.org/abs/2508.21376",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"#/leaderboard",children:"Leaderboard"}),e.jsx("a",{className:"px-10 btn 
rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{className:"my-4",children:["Evaluations of"," ",e.jsx("strong",{className:"font-bold",children:"audio-language models (ALMs)"})," ","— multimodal models that take interleaved audio and text as input and output text — are hindered by the lack of standardized benchmarks; most benchmarks measure only one or two capabilities and omit evaluative aspects such as fairness or safety. Furthermore, comparison across models is difficult as separate evaluations test a limited number of models and use different prompting methods and inference parameters."]}),e.jsxs("p",{className:"my-4",children:["To address these shortfalls, we introduce"," ",e.jsx("strong",{className:"font-bold",children:"AHELM"}),", a benchmark that aggregates various datasets — including"," ",e.jsx("strong",{className:"font-bold",children:"2 new synthetic audio-text datasets"})," ","called ",e.jsx("strong",{className:"font-bold",children:"PARADE"}),", which evaluates the ALMs on avoiding stereotypes, and"," ",e.jsx("strong",{className:"font-bold",children:"CoRe-Bench"}),", which measures reasoning over conversational audio through inferential multi-turn question answering — to holistically measure the performance of ALMs across 10 aspects we have identified as important to the development and usage of ALMs:"," ",e.jsx("em",{className:"italic",children:"audio perception"}),","," ",e.jsx("em",{className:"italic",children:"knowledge"}),","," ",e.jsx("em",{className:"italic",children:"reasoning"}),","," ",e.jsx("em",{className:"italic",children:"emotion detection"}),","," ",e.jsx("em",{className:"italic",children:"bias"}),", ",e.jsx("em",{className:"italic",children:"fairness"}),","," ",e.jsx("em",{className:"italic",children:"multilinguality"}),","," ",e.jsx("em",{className:"italic",children:"robustness"}),","," ",e.jsx("em",{className:"italic",children:"toxicity"}),", and"," ",e.jsx("em",{className:"italic",children:"safety"}),". We standardize the prompts, inference parameters, and evaluation metrics to ensure equitable comparisons across models."]}),e.jsxs("div",{className:"my-16 flex flex-col lg:flex-row gap-8",children:[e.jsx("div",{className:"flex-1 text-xl",children:e.jsx("img",{src:ta,alt:"An example of each aspect in AHELM: Auditory Perception, Knowledge, Reasoning, Emotion Detection, Bias, Fairness, Multilinguality, Robustness, Toxicity and Safety. 
",className:""})}),e.jsxs("div",{className:"flex-1",children:[e.jsx(E,{}),e.jsx(b,{to:"leaderboard",className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})]})]})]})}function na(){return window.PROJECT_ID==="lite"?e.jsx(je,{}):window.PROJECT_ID==="instruct"?e.jsx(pt,{}):window.PROJECT_ID==="image2struct"?e.jsx(Ft,{}):window.PROJECT_ID==="heim"?e.jsx(Mt,{}):window.PROJECT_ID==="mmlu"?e.jsx(jt,{}):window.PROJECT_ID==="vhelm"?e.jsx(Rt,{}):window.PROJECT_ID==="air-bench"?e.jsx(vt,{}):window.PROJECT_ID==="thaiexam"?e.jsx(wt,{}):window.PROJECT_ID==="finance"?e.jsx(yt,{}):window.PROJECT_ID==="call-center"?e.jsx(At,{}):window.PROJECT_ID==="call-transcript-summarization"?e.jsx(Ct,{}):window.PROJECT_ID==="cleva"?e.jsx(_t,{}):window.PROJECT_ID==="torr"?e.jsx(Tt,{}):window.PROJECT_ID==="ewok"?e.jsx(Bt,{}):window.PROJECT_ID==="medhelm"&&window.RELEASE=="v1.0.0"?e.jsx(qt,{}):window.PROJECT_ID==="medhelm"?e.jsx(Ut,{}):window.PROJECT_ID==="safety"?e.jsx(Jt,{}):window.PROJECT_ID==="capabilities"?e.jsx(Gt,{}):window.PROJECT_ID==="mmlu-winogrande-afr"?e.jsx(Qt,{}):window.PROJECT_ID==="seahelm"?e.jsx(Kt,{}):window.PROJECT_ID==="speech"?e.jsx(Zt,{}):window.PROJECT_ID==="sql"?e.jsx(Yt,{}):window.PROJECT_ID==="long-context"?e.jsx(Xt,{}):window.PROJECT_ID==="villm"?e.jsx(ea,{}):window.PROJECT_ID==="slphelm"?e.jsx(sa,{}):window.PROJECT_ID==="audio"?e.jsx(aa,{}):window.PROJECT_ID==="home"?e.jsx($t,{}):e.jsx(je,{})}function ra(){return e.jsx(We,{children:e.jsx(qe,{children:e.jsxs(P,{path:"/",element:e.jsx(vs,{}),children:[e.jsx(P,{index:!0,element:e.jsx(na,{})}),e.jsx(P,{path:"leaderboard",element:e.jsx(se,{})}),e.jsx(P,{path:"leaderboard/:groupName",element:e.jsx(se,{})}),e.jsx(P,{path:"models",element:e.jsx(Ms,{})}),e.jsx(P,{path:"scenarios",element:e.jsx(Es,{})}),e.jsx(P,{path:"groups",element:e.jsx(As,{})}),e.jsx(P,{path:"groups/:groupName",element:e.jsx(se,{})}),e.jsx(P,{path:"runs",element:e.jsx(_s,{})}),e.jsx(P,{path:"runs/:runName",element:e.jsx(mt,{})})]})})})}ae.createRoot(document.getElementById("root")).render(e.jsx(Ve.StrictMode,{children:e.jsx(ra,{})}));