crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of crfm-helm might be problematic.

Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0

helm/config/tokenizer_configs.yaml CHANGED
@@ -251,6 +251,13 @@ tokenizer_configs:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
     end_of_text_token: "<eos>"
     prefix_token: "<bos>"
+  - name: google/medgemma-4b-it
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        trust_remote_code: true
+    end_of_text_token: "<eos>"
+    prefix_token: "<bos>"
 
   # Grok
   - name: xai/grok-3-beta
@@ -265,6 +272,12 @@ tokenizer_configs:
     end_of_text_token: ""
     prefix_token: ""
 
+  - name: xai/grok-4-0709
+    tokenizer_spec:
+      class_name: "helm.tokenizers.grok_tokenizer.GrokAPITokenizer"
+    end_of_text_token: ""
+    prefix_token: ""
+
   # Hf-internal-testing
 
   # Tokenizer name hf-internal-testing/llama-tokenizer is taken from:
@@ -454,7 +467,7 @@ tokenizer_configs:
 
   # Allen Institute for AI
   # The allenai/olmo-7b requires Python 3.9 or newer.
-  # To use the allenai/olmo-7b tokenizer, run `pip install crfm-helm[allenai]` first.
+  # To use the allenai/olmo-7b tokenizer, run `pip install "crfm-helm[allenai]"` first.
   - name: allenai/olmo-7b
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -582,6 +595,17 @@ tokenizer_configs:
     end_of_text_token: "</s>"
     prefix_token: "<s>"
 
+  # Moonshot AI
+  - name: moonshotai/kimi-k2-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: moonshotai/Kimi-K2-Instruct
+        trust_remote_code: true
+        revision: 4f239503ad9d1a042f0a4bacac457931ab972cfc
+    end_of_text_token: "[EOS]"
+    prefix_token: "[BOS]"
+
   # Nectec
   - name: nectec/OpenThaiLLM-Prebuilt-7B
     tokenizer_spec:
@@ -633,6 +657,12 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: "<|endoftext|>"
 
+  - name: openai/o200k_harmony
+    tokenizer_spec:
+      class_name: "helm.tokenizers.tiktoken_tokenizer.TiktokenTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|startoftext|>"
+
   - name: openai/clip-vit-large-patch14
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -688,6 +718,18 @@ tokenizer_configs:
     end_of_text_token: "<|im_end|>"
     prefix_token: "<|im_start|>"
 
+  - name: qwen/qwen3-235b-a22b-instruct-2507-fp8
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|im_end|>"
+    prefix_token: ""
+
+  - name: qwen/qwen3-next-80b-a3b-thinking
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|im_end|>"
+    prefix_token: ""
+
   - name: qwen/qwq-32b-preview
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -768,6 +810,12 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: ""
 
+  - name: tiiuae/falcon3-1b-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: ""
+
   # TsinghuaKEG
   - name: TsinghuaKEG/ice
     tokenizer_spec:
@@ -892,6 +940,23 @@ tokenizer_configs:
     end_of_text_token: ""
     prefix_token: ""
 
+  - name: ibm/granite-4.0-micro
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: ibm-granite/granite-4.0-micro
+    end_of_text_token: "<|end_of_text|>"
+    prefix_token: "<|end_of_text|>"
+
+  - name: ibm/granite-4.0-h-small
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: ibm-granite/granite-4.0-h-small
+    end_of_text_token: "<|end_of_text|>"
+    prefix_token: "<|end_of_text|>"
+
+  # Maritaca AI
   - name: maritaca-ai/sabia-7b
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -900,6 +965,14 @@ tokenizer_configs:
     end_of_text_token: "</s>"
     prefix_token: "<s>"
 
+  - name: maritaca-ai/sabia-2-tokenizer-medium
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: maritaca-ai/sabia-2-tokenizer-medium
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
   # Granite-3.1-8b-base
   - name: ibm-granite/granite-3.1-8b-base
     tokenizer_spec:
@@ -1022,7 +1095,6 @@ tokenizer_configs:
     end_of_text_token: ""
 
   # IBM Granite 3.3
-
   - name: ibm/granite-3.3-8b-instruct
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -1031,7 +1103,12 @@ tokenizer_configs:
     end_of_text_token: "<|end_of_text|>"
     prefix_token: "<|end_of_text|>"
 
-
+  # Z.ai GLM-4.5-AIR-FP8
+  - name: zai-org/glm-4.5-air-fp8
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: ""
 
   # DeepSeek-R1-Distill-Llama-3.1-8b
   - name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
@@ -1042,6 +1119,20 @@ tokenizer_configs:
     end_of_text_token: "<|end▁of▁sentence|>"
     prefix_token: "<|begin▁of▁sentence|>"
 
+  # DeepSeek-R1-Distill-Llama-3.1-8b
+  - name: deepseek-ai/deepseek-r1-distill-llama-70b
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|end▁of▁sentence|>"
+    prefix_token: "<|begin▁of▁sentence|>"
+
+  # DeepSeek-R1-Distill-Qwen-14B
+  - name: deepseek-ai/deepseek-r1-distill-qwen-14b
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|end▁of▁sentence|>"
+    prefix_token: "<|begin▁of▁sentence|>"
+
   # deepseek-ai/deepseek-coder-6.7b-instruct
   - name: deepseek-ai/deepseek-coder-6.7b-instruct
     tokenizer_spec:
@@ -1051,7 +1142,6 @@ tokenizer_configs:
     end_of_text_token: "<|end▁of▁sentence|>"
     prefix_token: "<|begin▁of▁sentence|>"
 
-
   # vilm/vinallama-2.7b-chat
   - name: vilm/vinallama-2.7b-chat
     tokenizer_spec:
@@ -1104,4 +1194,111 @@ tokenizer_configs:
     args:
       pretrained_model_name_or_path: vinai/PhoGPT-4B-Chat
     end_of_text_token: "</s>"
-    prefix_token: "<s>"
+    prefix_token: "<s>"
+
+  # Gemma-3-Gaia-PT-BR-4b-it
+  - name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
+    end_of_text_token: "<eos>"
+    prefix_token: "<bos>"
+
+  # Bode 13B Alpaca PT-BR
+  - name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  # Cabrita 7B PT-BR tokenizer
+  - name: 22h/cabrita_7b_pt_850000
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: 22h/cabrita_7b_pt_850000
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  # Gervásio 7B PT‑BR/PT‑PT tokenizer
+  - name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  # Tucano 2b4 PT-BR tokenizer
+  - name: TucanoBR/Tucano-2b4
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: TucanoBR/Tucano-2b4
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  # TeenyTinyLlama 460M PT-BR tokenizer
+  - name: nicholasKluge/TeenyTinyLlama-460m
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: nicholasKluge/TeenyTinyLlama-460m
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  # AceGPT-v2
+  - name: freedomintelligence/acegpt-v2-8b-chat
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|end_of_text|>"
+    prefix_token: "<|begin_of_text|>"
+
+  - name: freedomintelligence/acegpt-v2-32b-chat
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: ""
+
+  - name: freedomintelligence/acegpt-v2-70b-chat
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|end_of_text|>"
+    prefix_token: "<|begin_of_text|>"
+
+  # ALLaM
+  - name: allam-ai/allam-7b-instruct-preview
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  # SILMA
+  - name: silma-ai/silma-9b-instruct-v1.0
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<eos>"
+    prefix_token: "<bos>"
+
+  # Jais Family
+  - name: inceptionai/jais-family-590m-chat
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|endoftext|>"
+
+  # Jais Adapted
+  - name: inceptionai/jais-adapted-7b-chat
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  - name: inceptionai/jais-adapted-13b-chat
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
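
The entries above all share the same shape: a name, a tokenizer_spec (an object spec with a class_name and optional args), plus end_of_text_token and prefix_token. A minimal sketch for inspecting one of the new entries, assuming a source checkout and PyYAML installed (this is not the loader HELM itself uses):

import yaml  # PyYAML

# Load the shipped config file and index the entries by name.
with open("helm/config/tokenizer_configs.yaml") as f:
    configs = yaml.safe_load(f)["tokenizer_configs"]
by_name = {entry["name"]: entry for entry in configs}

# One of the entries added in 0.5.10 (see the hunk at +251 above).
medgemma = by_name["google/medgemma-4b-it"]
print(medgemma["tokenizer_spec"]["class_name"])  # HuggingFaceTokenizer class path
print(medgemma["end_of_text_token"], medgemma["prefix_token"])  # <eos> <bos>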
helm/proxy/cli.py CHANGED
@@ -123,7 +123,7 @@ def do_create_update_command(service: RemoteService, auth: Authentication, args)
 
     # Update quotas
     for quota_str in args.quotas:
-        m = re.match(f"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})", quota_str)
+        m = re.match(rf"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})", quota_str)
         if not m:
             raise Exception(
                 f"Invalid format: {quota_str}, expect <model_group>.<granularity>=<quota> "
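
The change above only adds the r prefix: in a plain f-string, sequences like \w and \d are invalid string escapes (currently a warning, eventually an error), while a raw f-string passes them through to the regex engine untouched. A small illustrative sketch, with UNLIMITED_QUOTA as a hypothetical stand-in value since its definition is not part of this diff:

import re

UNLIMITED_QUOTA = "unlimited"  # assumption: placeholder for the constant defined in cli.py

# Raw f-string: "\w" and "\d" reach re.match as regex escapes, not string escapes.
pattern = rf"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})"
assert re.match(pattern, "gpt4.daily=10000")
assert re.match(pattern, "gpt4.monthly=unlimited")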
helm/proxy/example_queries.py CHANGED
@@ -21,7 +21,7 @@ example_queries = [
             """
             temperature: 0.5 # Medium amount of randomness
             stop_sequences: [.] # Stop when you hit a period
-            model: openai/gpt-3.5-turbo-0613
+            model: openai/gpt-4.1-nano-2025-04-14
             """
         ),
         environments="",
@@ -33,7 +33,7 @@ example_queries = [
             temperature: 0.5 # Medium amount of randomness
             stop_sequences: [\\n] # Stop when you hit a newline
             num_completions: 5 # Generate many samples
-            model: openai/gpt-3.5-turbo-0613
+            model: openai/gpt-4.1-nano-2025-04-14
             """
         ),
         environments="",
@@ -58,7 +58,7 @@ example_queries = [
             """
             temperature: 0 # Deterministic
             max_tokens: 50
-            model: openai/gpt-3.5-turbo-0613
+            model: openai/gpt-4.1-nano-2025-04-14
             """
         ),
         environments="",
@@ -76,7 +76,7 @@ example_queries = [
         environments=dedent(
             """
             occupation: [mathematician, lawyer, doctor]
-            model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
+            model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
             """
         ),
     ),
@@ -101,7 +101,7 @@ example_queries = [
         ),
         environments=dedent(
             """
-            model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
+            model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
             """
         ),
     ),
@@ -136,7 +136,7 @@ example_queries = [
         ),
         environments=dedent(
             """
-            model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
+            model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
             """
         ),
     ),
@@ -144,7 +144,7 @@ example_queries = [
         prompt="Write a Python function that takes two vectors a and b and returns their Euclidean distance.",
         settings=dedent(
             """
-            model: openai/gpt-3.5-turbo-0613
+            model: openai/gpt-4.1-nano-2025-04-14
             """
         ),
         environments="",
@@ -161,7 +161,7 @@ example_queries = [
         ),
         environments=dedent(
             """
-            model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
+            model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
             """
         ),
     ),
helm/proxy/retry.py CHANGED
@@ -5,6 +5,7 @@ from retrying import Retrying
 from helm.common.request import RequestResult
 from helm.common.tokenization_request import TokenizationRequestResult
 from helm.common.hierarchical_logger import hlog
+import os
 import traceback
 import threading
 
@@ -19,6 +20,10 @@ Example usage:
         ...
 """
 
+# TODO: make these configurable at a config / cli level
+HELM_RETRIES = int(os.environ.get("HELM_RETRIES", "5"))
+HELM_TOKENIZER_RETRIES = int(os.environ.get("HELM_TOKENIZER_RETRIES", HELM_RETRIES))
+
 # The lock is used to prevent multiple threads from printing at the same time.
 # This can cause issues when printing the stack trace.
 # (The stack traces can get mixed up and become unreadable.)
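
With this change the retry counts are read from the environment at module load: HELM_RETRIES (default 5) and HELM_TOKENIZER_RETRIES (defaulting to whatever HELM_RETRIES resolved to). A minimal sketch of overriding them, assuming the variables are set before helm.proxy.retry is imported, since the lookups happen at import time:

import os

# Assumed usage: set these before anything imports helm.proxy.retry so the
# module-level os.environ.get() calls shown in the hunk above pick them up.
os.environ["HELM_RETRIES"] = "3"            # fewer request retries than the default of 5
os.environ["HELM_TOKENIZER_RETRIES"] = "1"  # tokenizer requests retried once

import helm.proxy.retry as retry  # noqa: E402  (import after setting the environment)
print(retry.HELM_RETRIES, retry.HELM_TOKENIZER_RETRIES)  # 3 1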
helm/proxy/server.py CHANGED
@@ -23,7 +23,7 @@ from helm.benchmark.model_deployment_registry import get_default_model_deploymen
 from helm.common.authentication import Authentication
 from helm.common.cache_backend_config import CacheBackendConfig, MongoCacheBackendConfig, SqliteCacheBackendConfig
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, setup_default_logging
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request
 from helm.common.perspective_api_request import PerspectiveAPIRequest
@@ -273,6 +273,7 @@ def main():
         default="",
     )
     args = parser.parse_args()
+    setup_default_logging()
 
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.base_path)
helm/proxy/static/index.css CHANGED
@@ -35,6 +35,10 @@
   font-style: italic;
 }
 
+.thinking {
+  font-style: italic;
+}
+
 .token:hover {
   background-color: lightgreen;
 }
helm/proxy/static/index.js CHANGED
@@ -282,7 +282,13 @@ $(function () {
     requestResult.completions.forEach((completion) => {
       const $contents = $("<span>", {
         title: `logprob: ${completion.logprob}`,
-      }).append(renderTokens(completion.tokens));
+      });
+      if (completion.thinking) {
+        const $thinking = $("<span>", { class: "thinking" }).append(completion.thinking.text);
+        $contents.append($thinking);
+      }
+      const $resultText = completion.tokens.length > 0 ?renderTokens(completion.tokens) : $("<div>").append(completion.text);
+      $contents.append($resultText);
       const $metadata = $("<span>", { class: "metadata" });
       $metadata.append(
         $("<span>", { title: "Log probability" }).append(
helm/tokenizers/auto_tokenizer.py CHANGED
@@ -8,7 +8,7 @@ from helm.common.credentials_utils import provide_api_key
 from helm.common.cache_backend_config import CacheBackendConfig, CacheConfig
 from helm.common.hierarchical_logger import hlog
 from helm.common.object_spec import create_object, inject_object_spec_args
-from helm.proxy.retry import retry_tokenizer_request
+from helm.proxy.retry import NonRetriableException, retry_tokenizer_request
 from helm.common.tokenization_request import (
     DecodeRequest,
     DecodeRequestResult,
@@ -50,7 +50,7 @@ class AutoTokenizer(Tokenizer):
             )
             tokenizer = create_object(tokenizer_spec)
         else:
-            hlog(f"No tokenizer config for {tokenizer_name}")
+            raise NonRetriableException(f"Could not find tokenizer config for {tokenizer_name}")
 
         # Cache the tokenizer
         assert isinstance(tokenizer, Tokenizer) # To make mypy happy
helm/tokenizers/grok_tokenizer.py CHANGED
@@ -34,6 +34,8 @@ class GrokAPITokenizer(CachingTokenizer):
                 "Set grokApiKey in credentials.conf or set the GROK_API_KEY environment variable"
             )
         text = request["text"]
+        if not text:
+            return {"token_ids": []}
         model = request["tokenizer"].split("/")[-1]
         response = requests.post(
             url="https://api.x.ai/v1/tokenize-text",
helm/benchmark/metrics/aci_bench_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.aci_bench_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class ACIBenchMetric(LLMJuryMetric):
-    """Score metrics for ACIBench."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="aci_bench_accuracy",
-            scenario_name="aci_bench",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/chw_care_plan_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.chw_care_plan_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class CHWCarePlanMetric(LLMJuryMetric):
-    """Score metrics for CHWCarePlan."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="chw_care_plan_accuracy",
-            scenario_name="chw_care_plan",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/dischargeme_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.dischargeme_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class DischargeMeMetric(LLMJuryMetric):
-    """Score metrics for DischargeMe."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="dischargeme_accuracy",
-            scenario_name="dischargeme",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/med_dialog_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.med_dialog_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MedDialogMetric(LLMJuryMetric):
-    """Score metrics for MedDialog."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="med_dialog_accuracy",
-            scenario_name="med_dialog",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/medalign_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.medalign_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MedalignMetric(LLMJuryMetric):
-    """Score metrics for Medalign."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="medalign_accuracy",
-            scenario_name="medalign",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/medi_qa_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.medi_qa_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MediQAMetric(LLMJuryMetric):
-    """Score metrics for MediQA."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="medi_qa_accuracy",
-            scenario_name="medi_qa",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/medication_qa_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.medication_qa_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MedicationQAMetric(LLMJuryMetric):
-    """Score metrics for MedicationQA."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="medication_qa_accuracy",
-            scenario_name="medication_qa",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/mental_health_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.mental_health_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MentalHealthMetric(LLMJuryMetric):
-    """Score metrics for MentalHealth."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="mental_health_accuracy",
-            scenario_name="mental_health",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/mimic_bhc_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.mimic_bhc_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MIMICBHCMetric(LLMJuryMetric):
-    """Score metrics for MIMICBHC."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="mimic_bhc_accuracy",
-            scenario_name="mimic_bhc",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/mimic_rrs_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.mimic_rrs_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MIMICRRSMetric(LLMJuryMetric):
-    """Score metrics for MIMICRRS."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="mimic_rrs_accuracy",
-            scenario_name="mimic_rrs",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/mtsamples_procedures_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.mtsamples_procedures_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MTSamplesProceduresMetric(LLMJuryMetric):
-    """Score metrics for MTSamplesProcedures."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="mtsamples_procedures_accuracy",
-            scenario_name="mtsamples_procedures",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/mtsamples_replicate_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.mtsamples_replicate_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MTSamplesReplicateMetric(LLMJuryMetric):
-    """Score metrics for MTSamplesReplicate."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="mtsamples_replicate_accuracy",
-            scenario_name="mtsamples_replicate",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )