crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (333)
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/bbq_metrics.py +12 -0
  25. helm/benchmark/metrics/classification_metrics.py +19 -1
  26. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  27. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  28. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  29. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  30. helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
  31. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  32. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  33. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  35. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  36. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  37. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  38. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  39. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  40. helm/benchmark/metrics/medec_metrics.py +25 -2
  41. helm/benchmark/metrics/metric.py +25 -0
  42. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  43. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  44. helm/benchmark/metrics/safety_metrics.py +13 -1
  45. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  46. helm/benchmark/metrics/summac/model_summac.py +2 -2
  47. helm/benchmark/metrics/summarization_metrics.py +129 -1
  48. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  49. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  50. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  51. helm/benchmark/presentation/run_display.py +13 -3
  52. helm/benchmark/presentation/run_entry.py +2 -2
  53. helm/benchmark/presentation/schema.py +5 -22
  54. helm/benchmark/presentation/summarize.py +180 -11
  55. helm/benchmark/presentation/taxonomy_info.py +20 -0
  56. helm/benchmark/run.py +1 -1
  57. helm/benchmark/run_expander.py +4 -0
  58. helm/benchmark/run_specs/arabic_run_specs.py +140 -16
  59. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  60. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  61. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  62. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  63. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  64. helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
  65. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
  66. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  67. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  68. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  69. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  70. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  71. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  72. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  73. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  74. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
  75. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
  76. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
  77. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
  78. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
  79. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  80. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  81. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  82. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  83. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  84. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  85. helm/benchmark/scenarios/bold_scenario.py +15 -0
  86. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  87. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  88. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  89. helm/benchmark/scenarios/clear_scenario.py +23 -0
  90. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  91. helm/benchmark/scenarios/code_scenario.py +28 -0
  92. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  93. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  94. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  95. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  96. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  97. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  98. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  99. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  100. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  101. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  102. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  103. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  104. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  105. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  106. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  107. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  108. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  109. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  110. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  111. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  112. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  113. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  114. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  115. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  116. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  117. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  118. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  119. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  120. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  121. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  122. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  123. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  124. helm/benchmark/scenarios/ice_scenario.py +21 -1
  125. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  126. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  127. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
  128. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  129. helm/benchmark/scenarios/koala_scenario.py +21 -1
  130. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  131. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  132. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  133. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  134. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  135. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  136. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  137. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  138. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  139. helm/benchmark/scenarios/math_scenario.py +33 -0
  140. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  141. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  142. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  143. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  144. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  145. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  146. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  147. helm/benchmark/scenarios/medec_scenario.py +23 -0
  148. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  149. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  150. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  151. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  152. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  153. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  154. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  155. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  156. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  157. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  158. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  159. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  160. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  161. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  162. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  163. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  164. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  165. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  166. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  167. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  168. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  169. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  170. helm/benchmark/scenarios/quac_scenario.py +14 -0
  171. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  172. helm/benchmark/scenarios/raft_scenario.py +15 -0
  173. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  174. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  175. helm/benchmark/scenarios/scenario.py +31 -0
  176. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  177. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  178. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  179. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  180. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  181. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  182. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  183. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  184. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  185. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  186. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  187. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  188. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  189. helm/benchmark/scenarios/spider_scenario.py +18 -0
  190. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  191. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  192. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  193. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  194. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  195. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  196. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  197. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  198. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  199. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  200. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  201. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  202. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  203. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  204. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  205. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  206. helm/benchmark/static/schema_arabic.yaml +55 -12
  207. helm/benchmark/static/schema_long_context.yaml +11 -30
  208. helm/benchmark/static/schema_medhelm.yaml +36 -0
  209. helm/benchmark/static/schema_slp.yaml +219 -0
  210. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  211. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  212. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  213. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  214. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  215. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  216. helm/benchmark/static_build/index.html +5 -6
  217. helm/clients/ai21_client.py +2 -0
  218. helm/clients/aleph_alpha_client.py +2 -0
  219. helm/clients/anthropic_client.py +7 -1
  220. helm/clients/audio_language/diva_llama_client.py +2 -0
  221. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  222. helm/clients/audio_language/llama_omni/constants.py +9 -0
  223. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  224. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  225. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  226. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  227. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  228. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  229. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  230. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  231. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  232. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  233. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  234. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  235. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  236. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  237. helm/clients/audio_language/llama_omni/utils.py +202 -0
  238. helm/clients/audio_language/llama_omni_client.py +2 -1
  239. helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
  240. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  241. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  242. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  243. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  244. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  245. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  246. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  247. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  248. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  249. helm/clients/bedrock_client.py +2 -0
  250. helm/clients/cohere_client.py +3 -0
  251. helm/clients/google_client.py +2 -0
  252. helm/clients/http_model_client.py +2 -0
  253. helm/clients/huggingface_client.py +2 -1
  254. helm/clients/ibm_client.py +3 -1
  255. helm/clients/image_generation/adobe_vision_client.py +2 -0
  256. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  257. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  258. helm/clients/image_generation/cogview2_client.py +2 -1
  259. helm/clients/image_generation/dalle2_client.py +2 -0
  260. helm/clients/image_generation/dalle_mini_client.py +2 -1
  261. helm/clients/image_generation/deep_floyd_client.py +2 -0
  262. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  263. helm/clients/image_generation/lexica_client.py +2 -0
  264. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  265. helm/clients/image_generation/mindalle_client.py +2 -1
  266. helm/clients/image_generation/together_image_generation_client.py +2 -0
  267. helm/clients/megatron_client.py +2 -0
  268. helm/clients/mistral_client.py +2 -0
  269. helm/clients/moderation_api_client.py +2 -0
  270. helm/clients/openai_client.py +36 -20
  271. helm/clients/openai_responses_client.py +27 -3
  272. helm/clients/openrouter_client.py +31 -0
  273. helm/clients/palmyra_client.py +2 -1
  274. helm/clients/reka_client.py +2 -1
  275. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  276. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  277. helm/clients/test_openrouter_client.py +69 -0
  278. helm/clients/together_client.py +52 -11
  279. helm/clients/vertexai_client.py +12 -2
  280. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  281. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  282. helm/clients/vision_language/idefics_client.py +2 -1
  283. helm/clients/vision_language/open_flamingo_client.py +2 -1
  284. helm/clients/vision_language/paligemma_client.py +2 -1
  285. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  286. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  287. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  288. helm/clients/writer_client.py +2 -0
  289. helm/common/hierarchical_logger.py +20 -0
  290. helm/common/optional_dependencies.py +1 -1
  291. helm/common/test_general.py +4 -0
  292. helm/config/model_deployments.yaml +300 -1
  293. helm/config/model_metadata.yaml +302 -9
  294. helm/config/tokenizer_configs.yaml +92 -4
  295. helm/proxy/example_queries.py +8 -8
  296. helm/proxy/server.py +2 -1
  297. helm/proxy/static/index.css +4 -0
  298. helm/proxy/static/index.js +7 -1
  299. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  300. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  301. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  302. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  303. helm/benchmark/metrics/medalign_metrics.py +0 -14
  304. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  305. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  306. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  307. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  308. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  309. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  310. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  311. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  312. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  313. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  314. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  315. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  316. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  317. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
  318. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
  319. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
  320. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
  321. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  322. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  323. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  324. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  325. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  326. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  327. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  328. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  329. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  330. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  331. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  332. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  333. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0

helm/clients/image_generation/together_image_generation_client.py
@@ -4,6 +4,7 @@ import requests
 
 from helm.common.cache import CacheConfig, Cache
 from helm.common.file_caches.file_cache import FileCache
+from helm.common.hierarchical_logger import hexception
 from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
 from helm.common.tokenization_request import (
     TokenizationRequest,
@@ -84,6 +85,7 @@ class TogetherImageGenerationClient(Client):
 
             response, cached = self._cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as e:
+            hexception(e)
             error: str = f"TogetherVisionClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
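
The hexception calls added throughout this release share one pattern: log the caught exception through HELM's hierarchical logger (hexception is new in helm/common/hierarchical_logger.py, per the file list above) before converting it into a failed RequestResult. A minimal sketch of the pattern, with a hypothetical _call_api method and an illustrative cache key standing in for the client-specific details:

    from helm.common.hierarchical_logger import hexception
    from helm.common.request import RequestResult, wrap_request_time

    def make_request(self, request):
        cache_key = {"prompt": request.prompt}  # illustrative cache key
        try:
            def do_it():
                return self._call_api(request)  # hypothetical API call

            response, cached = self._cache.get(cache_key, wrap_request_time(do_it))
        except RuntimeError as e:
            hexception(e)  # new in 0.5.9: log the error before returning a failure result
            return RequestResult(success=False, cached=False, error=str(e), completions=[], embedding=[])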

helm/clients/megatron_client.py
@@ -4,6 +4,7 @@ from typing import Any, Dict, List
 import traceback
 from helm.common.cache import CacheConfig
 
+from helm.common.hierarchical_logger import hexception
 from helm.common.request import (
     wrap_request_time,
     EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
@@ -103,6 +104,7 @@ class MegatronClient(CachingClient):
         try:
             return self._make_request(request)
         except Exception as e:
+            hexception(e)
             return RequestResult(
                 success=False,
                 cached=False,

helm/clients/mistral_client.py
@@ -1,6 +1,7 @@
 import requests
 from typing import Any, Dict, List, Optional, TypedDict, Union
 
+from helm.common.hierarchical_logger import hexception
 from helm.proxy.retry import NonRetriableException
 from helm.common.cache import CacheConfig
 from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
@@ -156,6 +157,7 @@ class MistralAIClient(CachingClient):
 
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except (requests.exceptions.RequestException, AssertionError) as e:
+            hexception(e)
             error: str = f"MistralClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 

helm/clients/moderation_api_client.py
@@ -1,5 +1,6 @@
 from typing import Any, Dict
 
+from helm.common.hierarchical_logger import hexception
 from helm.common.request import wrap_request_time
 from helm.common.cache import Cache, CacheConfig
 from helm.common.moderations_api_request import (
@@ -64,6 +65,7 @@ class ModerationAPIClient:
 
             response, cached = self.cache.get(raw_request, wrap_request_time(do_it))
         except openai.OpenAIError as e:
+            hexception(e)
             error: str = f"Moderation API error: {e}"
             return ModerationAPIRequestResult(
                 success=False, cached=False, error=error, flagged=None, flagged_results=None, scores=None

helm/clients/openai_client.py
@@ -10,7 +10,7 @@ from helm.common import multimodal_request_utils
 from helm.common.cache import CacheConfig
 from helm.common.media_object import TEXT_TYPE, MultimediaObject, MediaObject
 from helm.common.request import ErrorFlags, Thinking, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
-from helm.common.hierarchical_logger import hlog, hwarn
+from helm.common.hierarchical_logger import hlog, hwarn, hexception
 from helm.common.object_spec import get_class_by_name
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.tokenization_request import (
@@ -33,9 +33,12 @@ class OpenAIClientUtils:
     @classmethod
     def is_reasoning_model(cls, model_engine: str) -> bool:
         # All OpenAI reasoning models start "o[somenumber]", so we regexp for that to future proof things
-        return bool(re.match(r"^o\d+", model_engine))
+        return bool(re.match(r"^o\d+", model_engine)) or bool(re.match(r"^gpt-5", model_engine))
 
     # Error OpenAI throws when the image in the prompt violates their content policy
+    HARMFUL_INFORMATION_ERROR: str = (
+        "Invalid prompt: we've limited access to this content for safety reasons. This type of information may be used to benefit or to harm people."  # noqa: E501
+    )
     INAPPROPRIATE_IMAGE_ERROR: str = "Your input image may contain content that is not allowed by our safety system"
     INAPPROPRIATE_PROMPT_ERROR: str = "Invalid prompt: your prompt was flagged"
     INAPPROPRIATE_PROMPT_AZURE_ERROR: str = (
@@ -44,12 +47,10 @@ class OpenAIClientUtils:
     INAPPROPRIATE_PROMPT_MICROSOFT_ERROR: str = (
         "The response was filtered due to the prompt triggering Microsoft's content management policy."
     )
-
-    # OpenAI server error
-    OPENAI_SERVER_ERROR: str = (
-        "The server had an error processing your request. Sorry about that! You can retry your request, "
-        "or contact us through our help center at help.openai.com if you keep seeing this error."
-    )
+    # Grok content safety guidelines error message
+    # TODO: Refactor so that this is owned by the Grok client instead.
+    SAFETY_GUIDELINES_GROK_ERROR: str = "Content violates safety guidelines."
+    USAGE_GUIDELINES_GROK_ERROR: str = "Content violates usage guidelines."
 
     # Set the finish reason to this if the prompt violates OpenAI's content policy
     CONTENT_POLICY_VIOLATED_FINISH_REASON: str = (
@@ -74,21 +75,14 @@ class OpenAIClientUtils:
                 completions=[empty_completion] * request.num_completions,
                 embedding=[],
             )
-        elif cls.OPENAI_SERVER_ERROR in str(e):
-            # Handle these errors by returning an empty completion to unblock
-            hwarn(f"OpenAI server error for request: {str(request)}")
-            empty_completion = GeneratedOutput(
-                text="",
-                logprob=0,
-                tokens=[],
-                finish_reason={"reason": cls.OPENAI_SERVER_ERROR},
-            )
+        elif cls.HARMFUL_INFORMATION_ERROR in str(e):
             return RequestResult(
-                success=True,
+                success=False,
                 cached=False,
-                request_time=0,
-                completions=[empty_completion] * request.num_completions,
+                error="Prompt blocked by OpenAI's safety filter",
+                completions=[],
                 embedding=[],
+                error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
             )
         elif cls.INAPPROPRIATE_PROMPT_AZURE_ERROR in str(e) or cls.INAPPROPRIATE_PROMPT_MICROSOFT_ERROR in str(e):
             return RequestResult(
@@ -99,7 +93,26 @@ class OpenAIClientUtils:
                 embedding=[],
                 error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
             )
+        elif cls.SAFETY_GUIDELINES_GROK_ERROR in str(e):
+            return RequestResult(
+                success=False,
+                cached=False,
+                error="Grok API error: Content violates safety guidelines",
+                completions=[],
+                embedding=[],
+                error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
+            )
+        elif cls.USAGE_GUIDELINES_GROK_ERROR in str(e):
+            return RequestResult(
+                success=False,
+                cached=False,
+                error="Grok API error: Content violates usage guidelines",
+                completions=[],
+                embedding=[],
+                error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
+            )
 
+        hexception(e)
         error: str = f"OpenAI error: {e}"
         return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
@@ -158,6 +171,7 @@ class OpenAIClient(CachingClient):
             cache_key = self._get_cache_key(raw_request, request)
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except openai.OpenAIError as e:
+            hexception(e)
             error: str = f"OpenAI error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
@@ -424,6 +438,7 @@ class OpenAIClient(CachingClient):
             cache_key = self._get_cache_key(raw_request, request)
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except openai.OpenAIError as e:
+            hexception(e)
             error: str = f"OpenAI error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
@@ -479,6 +494,7 @@ class OpenAIClient(CachingClient):
             cache_key = self._get_cache_key({"audio": audio_path, "model": model}, request)
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except openai.OpenAIError as e:
+            hexception(e)
             error: str = f"OpenAI error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
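
The widened is_reasoning_model check now treats the gpt-5 family as reasoning models alongside the o-series. A quick illustration of the regex above (model names are examples):

    import re

    def is_reasoning_model(model_engine: str) -> bool:
        return bool(re.match(r"^o\d+", model_engine)) or bool(re.match(r"^gpt-5", model_engine))

    assert is_reasoning_model("o3-mini")
    assert is_reasoning_model("gpt-5-mini")
    assert not is_reasoning_model("gpt-4o")  # starts with neither "o<digit>" nor "gpt-5"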

helm/clients/openai_responses_client.py
@@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional, Union
 
 from helm.clients.openai_client import OpenAIClientUtils
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hwarn
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import (
     Thinking,
@@ -60,7 +61,28 @@ class OpenAIResponseClient(CachingClient):
 
     def _make_raw_request(self, request: Request) -> dict[str, Any]:
         input: Union[str, List[Dict[str, Any]]]
-        if request.multimodal_prompt is not None:
+
+        if (
+            (request.prompt and request.messages)
+            or (request.prompt and request.multimodal_prompt)
+            or (request.messages and request.multimodal_prompt)
+        ):
+            raise ValueError(
+                f"More than one of `prompt`, `messages` and `multimodal_prompt` was set in request: {request}"
+            )
+
+        if request.messages is not None:
+            # Checks that all messages have a role and some content
+            for message in request.messages:
+                if not message.get("role") or not message.get("content"):
+                    raise ValueError("All messages must have a role and content")
+            # Checks that the last role is "user"
+            if request.messages[-1]["role"] != "user":
+                raise ValueError("Last message must have role 'user'")
+            if request.prompt != "":
+                hwarn("Since message is set, prompt will be ignored")
+            input = request.messages
+        elif request.multimodal_prompt is not None:
             content = []
             request.validate()
             for media_object in request.multimodal_prompt.media_objects:
@@ -101,6 +123,8 @@ class OpenAIResponseClient(CachingClient):
         # Plus other changes
         model_engine: str = request.model_engine
         if OpenAIClientUtils.is_reasoning_model(model_engine):
+            if "reasoning" not in raw_request:
+                raw_request["reasoning"] = {}
             raw_request["reasoning"]["summary"] = "detailed"
             # Avoid error:
             # "Error code: 400 - {'error': {'message': "Unsupported parameter: 'temperature' is
@@ -150,9 +174,9 @@ class OpenAIResponseClient(CachingClient):
             ]  # one of "message" or "reasoning" from API observation, but can also include tool calls
 
             if output_type == "reasoning":
-                reasoning_output += "\n".join([raw_output["text"] for raw_output in output["summary"]])
+                reasoning_output += "\n\n".join([raw_output["text"] for raw_output in output["summary"]])
             elif output_type == "message":
-                text_output += "\n".join([raw_output["text"] for raw_output in output["content"]])
+                text_output += "\n\n".join([raw_output["text"] for raw_output in output["content"]])
             # (Other output types are ignored)
 
         completion = truncate_and_tokenize_response_text(
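
The new validation makes prompt, messages, and multimodal_prompt mutually exclusive on a Request and requires chat-style messages to end with a user turn. A hedged sketch of what _make_raw_request now accepts and rejects (the model name is illustrative):

    from helm.common.request import Request

    # Accepted: every message has a role and content, and the last role is "user".
    ok = Request(
        model="openai/gpt-5",  # illustrative
        messages=[
            {"role": "system", "content": "You are terse."},
            {"role": "user", "content": "Name one prime number."},
        ],
    )

    # Rejected: the conversation does not end with a "user" turn.
    bad = Request(
        model="openai/gpt-5",  # illustrative
        messages=[{"role": "assistant", "content": "Hello!"}],
    )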

helm/clients/openrouter_client.py (new file)
@@ -0,0 +1,31 @@
+import os
+from typing import Optional
+from helm.clients.openai_client import OpenAIClient
+from helm.common.cache import CacheConfig
+from helm.tokenizers.tokenizer import Tokenizer
+
+
+class OpenRouterClient(OpenAIClient):
+    def __init__(
+        self,
+        tokenizer_name: str,
+        tokenizer: Tokenizer,
+        cache_config: CacheConfig,
+        api_key: Optional[str] = None,
+        model_name: Optional[str] = None,
+        output_processor: Optional[str] = None,
+    ):
+        self.api_key = api_key or os.getenv("OPENROUTER_API_KEY")
+        self.base_url = "https://openrouter.ai/api/v1/"
+        super().__init__(
+            tokenizer,
+            tokenizer_name,
+            cache_config=cache_config,
+            output_processor=output_processor,
+            base_url=self.base_url,
+            api_key=self.api_key,
+        )
+        self.model_name = model_name
+
+    def _get_model_for_request(self, request):
+        return self.model_name or request.model
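
A sketch of constructing the new client directly, with values borrowed from the test file below; in normal use the client would instead be instantiated from the deployment configs in helm/config/model_deployments.yaml:

    from helm.common.cache import BlackHoleCacheConfig
    from helm.clients.openrouter_client import OpenRouterClient
    from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

    tokenizer_name = "mistralai/Mistral-7B-v0.1"  # illustrative tokenizer
    client = OpenRouterClient(
        tokenizer_name=tokenizer_name,
        tokenizer=HuggingFaceTokenizer(cache_config=BlackHoleCacheConfig(), tokenizer_name=tokenizer_name),
        cache_config=BlackHoleCacheConfig(),
        api_key=None,  # falls back to the OPENROUTER_API_KEY environment variable
        model_name="mistralai/mistral-medium-3.1",  # optional pin; otherwise request.model is used
    )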

helm/clients/palmyra_client.py
@@ -5,7 +5,7 @@ from typing import Any, Dict, List
 
 from helm.clients.openai_client import OpenAIClient
 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import hwarn
+from helm.common.hierarchical_logger import hexception, hwarn
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token, ErrorFlags
 from helm.common.tokenization_request import (
     TokenizationRequest,
@@ -99,6 +99,7 @@ class PalmyraClient(CachingClient):
 
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except (requests.exceptions.RequestException, AssertionError) as e:
+            hexception(e)
             error: str = f"PalmyraClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 

helm/clients/reka_client.py
@@ -6,7 +6,7 @@ from helm.proxy.retry import NonRetriableException
 from helm.common.cache import CacheConfig
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput
-from helm.common.hierarchical_logger import hwarn
+from helm.common.hierarchical_logger import hexception, hwarn
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.tokenizers.tokenizer import Tokenizer
 from helm.clients.client import CachingClient, truncate_and_tokenize_response_text
@@ -167,6 +167,7 @@ class RekaClient(CachingClient):
 
             response, cached = self.cache.get(raw_request, wrap_request_time(do_it))
         except (requests.exceptions.RequestException, AssertionError) as e:
+            hexception(e)
             error: str = f"RekaClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 

helm/clients/stanfordhealthcare_azure_openai_client.py
@@ -39,7 +39,7 @@ class StanfordHealthCareAzureOpenAIClient(AzureOpenAIClient):
                 tokenizer=tokenizer,
                 tokenizer_name=tokenizer_name,
                 cache_config=cache_config,
-                api_key="unused",
+                api_key=api_key,
                 base_url=base_url,
                 azure_openai_deployment_name=openai_model_name,
                 api_version=api_version,
@@ -50,7 +50,7 @@
                 tokenizer=tokenizer,
                 tokenizer_name=tokenizer_name,
                 cache_config=cache_config,
-                api_key="unused",
+                api_key=api_key,
                 endpoint=endpoint,
                 azure_openai_deployment_name=openai_model_name,
                 api_version=api_version,

helm/clients/stanfordhealthcare_http_model_client.py
@@ -5,6 +5,7 @@ from dataclasses import asdict
 from typing import Any, Dict, List, Optional
 
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.request import (
     wrap_request_time,
     Request,
@@ -82,6 +83,7 @@ class StanfordHealthCareHTTPModelClient(CachingClient, ABC):
                 request_time=response["request_time"],
             )
         except requests.exceptions.RequestException as e:
+            hexception(e)
             return RequestResult(success=False, cached=False, error=f"Request error: {e}", completions=[], embedding=[])
 
     @abstractmethod

helm/clients/test_openrouter_client.py (new file)
@@ -0,0 +1,69 @@
+import os
+import pytest
+import tempfile
+
+from helm.common.cache import BlackHoleCacheConfig, SqliteCacheConfig
+from helm.common.request import Request
+from helm.clients.openrouter_client import OpenRouterClient
+
+from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
+
+
+class TestOpenRouterClient:
+    def setup_method(self, method):
+        cache_file = tempfile.NamedTemporaryFile(delete=False)
+        self.cache_path: str = cache_file.name
+        self.tokenizer_name = "mistralai/Mistral-7B-v0.1"
+        self.tokenizer = HuggingFaceTokenizer(
+            cache_config=BlackHoleCacheConfig(),
+            tokenizer_name=self.tokenizer_name,
+        )
+
+    def teardown_method(self, method):
+        os.remove(self.cache_path)
+
+    @pytest.mark.parametrize(
+        "model_name,test_input,expected_model",
+        [
+            (
+                "mistralai/mistral-medium-3.1",
+                Request(
+                    model="mistralai/mistral-medium-3.1",
+                    model_deployment="openrouter/mistral-medium-3.1",
+                ),
+                "mistralai/mistral-medium-3.1",
+            ),
+            (
+                None,
+                Request(model="openai/gpt-oss-20b:free", model_deployment="openrouter/gpt-oss-20b:free"),
+                "openai/gpt-oss-20b:free",
+            ),
+        ],
+    )
+    def test_get_model_for_request(self, model_name, test_input, expected_model):
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=SqliteCacheConfig(self.cache_path),
+            model_name=model_name,
+            api_key="test_key",
+        )
+        assert client._get_model_for_request(test_input) == expected_model
+
+    def test_api_key_env_var(self, monkeypatch):
+        monkeypatch.setenv("OPENROUTER_API_KEY", "test_key")
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=SqliteCacheConfig(self.cache_path),
+        )
+        assert client.api_key == "test_key"
+
+    def test_api_key_argument(self):
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=BlackHoleCacheConfig(),
+            api_key="explicit_key",
+        )
+        assert client.api_key == "explicit_key"
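
The parametrized test confirms that an explicit model_name pin takes precedence over request.model and that the client falls back to request.model when no pin is given; the other two tests cover API-key resolution. They can be run in isolation with: pytest helm/clients/test_openrouter_client.py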

helm/clients/together_client.py
@@ -9,6 +9,7 @@ import requests
 from retrying import retry
 
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
 from helm.common.object_spec import get_class_by_name
 from helm.common.optional_dependencies import handle_module_not_found_error
@@ -99,7 +100,7 @@ class JobNotFinishedError(TogetherClientError):
     pass
 
 
-def _parse_thinking(input: str) -> Tuple[str, str]:
+def _parse_thinking_deepseek_r1(input: str) -> Tuple[str, str]:
     """Return a tuple of thinking text and output text."""
     match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
     if match:
@@ -112,6 +113,44 @@ def _parse_thinking(input: str) -> Tuple[str, str]:
     return (input, "")
 
 
+def _parse_thinking_qwen3(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking_glm_4_5(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"\n<think>(.*)</think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"\n<think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking(input: str, model_name: str) -> Tuple[str, str]:
+    # TODO: Come up with a more sustainable extensible way of doing this.
+    if "deepseek-r1" in model_name:
+        return _parse_thinking_deepseek_r1(input)
+    elif "qwen3" in model_name:
+        return _parse_thinking_qwen3(input)
+    elif "glm-4.5" in model_name:
+        return _parse_thinking_glm_4_5(input)
+    else:
+        raise Exception(f"No thinking parser available for model {model_name}")
+
+
 class TogetherClient(CachingClient):
     """
     Client for the models where we evaluate offline. Since the queries are handled offline, the `TogetherClient` just
@@ -235,6 +274,7 @@ class TogetherClient(CachingClient):
         try:
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it_sync))
         except Exception as error:
+            hexception(error)
             return RequestResult(
                 success=False,
                 cached=False,
@@ -346,9 +386,8 @@ class TogetherChatClient(CachingClient):
         self._client = Together(api_key=api_key)
         self._together_model = together_model
         self._disable_logprobs = bool(disable_logprobs)
-        # self.output_processor is actually a function, not a class
         self._parse_thinking = bool(parse_thinking)
-
+        # self.output_processor is actually a function, not a class
         self.output_processor: Optional[Callable[[str], str]] = (
             get_class_by_name(output_processor) if output_processor else None
         )
@@ -418,6 +457,7 @@ class TogetherChatClient(CachingClient):
             raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
             response = ChatCompletionResponse.model_validate(raw_response)
         except Exception as error:
+            hexception(error)
             return RequestResult(
                 success=False,
                 cached=False,
@@ -444,15 +484,15 @@ class TogetherChatClient(CachingClient):
             if self.output_processor:
                 output_text = self.output_processor(output_text)
 
+            thinking: Optional[Thinking] = None
             if self._parse_thinking:
-                thinking_text, output_text = _parse_thinking(output_text)
-                generated_outputs.append(
-                    GeneratedOutput(
-                        text=output_text, logprob=logprob, tokens=tokens, thinking=Thinking(text=thinking_text)
-                    )
-                )
-            else:
-                generated_outputs.append(GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens))
+                thinking_text, output_text = _parse_thinking(output_text, request.model)
+                thinking = Thinking(text=thinking_text)
+            elif hasattr(choice.message, "reasoning_content"):
+                thinking = Thinking(text=choice.message.reasoning_content)
+            generated_outputs.append(
+                GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens, thinking=thinking)
+            )
         return RequestResult(
             success=True,
             cached=cached,
@@ -525,6 +565,7 @@ class TogetherCompletionClient(CachingClient):
             raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
             response = CompletionResponse.model_validate(raw_response)
         except Exception as error:
+            hexception(error)
             return RequestResult(
                 success=False,
                 cached=False,
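
The thinking parsers dispatch on substrings of the model name, since DeepSeek-R1, Qwen3, and GLM-4.5 wrap their reasoning in slightly different <think> framings. A worked example of the behavior (illustrative strings; assumes the together dependencies are installed so the module imports cleanly):

    from helm.clients.together_client import _parse_thinking

    thinking, answer = _parse_thinking("<think>\nLet me check.\n</think>\n\n42", "deepseek-r1")
    assert (thinking, answer) == ("Let me check.", "42")

    # GLM-4.5 emits a leading newline before its <think> block:
    thinking, answer = _parse_thinking("\n<think>step 1</think>The answer is 7", "glm-4.5")
    assert (thinking, answer) == ("step 1", "The answer is 7")

    # Unrecognized model names raise, since no parser is registered for them.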

helm/clients/vertexai_client.py
@@ -4,6 +4,7 @@ from threading import Lock
 from typing import Any, Dict, Mapping, Optional, List, Union, cast
 
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.multimodal_request_utils import get_contents_as_bytes
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
@@ -152,6 +153,7 @@ class VertexAITextClient(VertexAIClient):
 
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except (requests.exceptions.RequestException, AssertionError) as e:
+            hexception(e)
             error: str = f"VertexAITextClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
@@ -276,8 +278,14 @@ class VertexAIChatClient(VertexAIClient):
                 if not candidate.content:
                     raise VertexAIContentBlockedError(f"No content in candidate: {candidate}")
                 if not candidate.content.parts:
-                    raise VertexAIContentBlockedError(f"No content parts in candidate: {candidate}")
-                predictions.append({"text": candidate.content.text})
+                    if candidate.finish_reason == 2:  # MAX_TOKENS
+                        # This means that there is no text output because the maximum number of tokens were
+                        # reached during thinking.
+                        predictions.append({"text": ""})
+                    else:
+                        raise VertexAIContentBlockedError(f"No content parts in candidate: {candidate}")
+                else:
+                    predictions.append({"text": candidate.content.text})
                 # TODO: Extract more information from the response
             return {"predictions": predictions}
 
@@ -304,6 +312,7 @@
                 error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
             )
         except (requests.exceptions.RequestException, AssertionError) as e:
+            hexception(e)
             error: str = f"VertexAITextClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
@@ -434,6 +443,7 @@
             cache_key = self.make_cache_key_with_safety_settings_preset(raw_cache_key, request)
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except requests.exceptions.RequestException as e:
+            hexception(e)
             error: str = f"Gemini Vision error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
         except VertexAIContentBlockedError as e:
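
For context, finish_reason == 2 is FinishReason.MAX_TOKENS in the Vertex AI SDK's enum, so a candidate that spends its whole token budget on thinking now yields an empty completion instead of being misreported as blocked content.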

helm/clients/vision_language/huggingface_vision2seq_client.py
@@ -8,7 +8,7 @@ import torch
 
 from helm.common.cache import CacheConfig
 from helm.common.gpu_utils import get_torch_device_name, is_cuda_available
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
 from helm.common.request import wrap_request_time
@@ -125,6 +125,7 @@ class HuggingFaceVision2SeqClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as model_error:
+            hexception(model_error)
             return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])
 
         for text in result["output"]:

helm/clients/vision_language/huggingface_vlm_client.py
@@ -5,6 +5,7 @@ from transformers import pipeline
 from transformers.pipelines import ImageToTextPipeline
 
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.images_utils import open_image
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
@@ -93,6 +94,7 @@ class HuggingFaceVLMClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as e:
+            hexception(e)
             return RequestResult(success=False, cached=False, error=str(e), completions=[], embedding=[])
 
         output: str = result["generated_text"]

helm/clients/vision_language/idefics_client.py
@@ -8,7 +8,7 @@ from transformers import IdeficsForVisionText2Text, AutoProcessor, IdeficsProces
 from helm.common.cache import CacheConfig
 from helm.common.images_utils import open_image
 from helm.common.gpu_utils import get_torch_device_name
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
@@ -137,6 +137,7 @@ class IDEFICSClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as model_error:
+            hexception(model_error)
             return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])
 
         for text in result["output"]:

helm/clients/vision_language/open_flamingo_client.py
@@ -5,7 +5,7 @@ import torch
 from huggingface_hub import hf_hub_download
 
 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.images_utils import open_image
 from helm.common.gpu_utils import get_torch_device_name
 from helm.common.media_object import TEXT_TYPE
@@ -131,6 +131,7 @@ class OpenFlamingoClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as ex:
+            hexception(ex)
             return RequestResult(success=False, cached=False, error=str(ex), completions=[], embedding=[])
 
         completions: List[GeneratedOutput] = []