crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff compares the contents of two package versions that were publicly released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.

Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
--- /dev/null
+++ b/helm/clients/test_openrouter_client.py
@@ -0,0 +1,69 @@
+import os
+import pytest
+import tempfile
+
+from helm.common.cache import BlackHoleCacheConfig, SqliteCacheConfig
+from helm.common.request import Request
+from helm.clients.openrouter_client import OpenRouterClient
+
+from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
+
+
+class TestOpenRouterClient:
+    def setup_method(self, method):
+        cache_file = tempfile.NamedTemporaryFile(delete=False)
+        self.cache_path: str = cache_file.name
+        self.tokenizer_name = "mistralai/Mistral-7B-v0.1"
+        self.tokenizer = HuggingFaceTokenizer(
+            cache_config=BlackHoleCacheConfig(),
+            tokenizer_name=self.tokenizer_name,
+        )
+
+    def teardown_method(self, method):
+        os.remove(self.cache_path)
+
+    @pytest.mark.parametrize(
+        "model_name,test_input,expected_model",
+        [
+            (
+                "mistralai/mistral-medium-3.1",
+                Request(
+                    model="mistralai/mistral-medium-3.1",
+                    model_deployment="openrouter/mistral-medium-3.1",
+                ),
+                "mistralai/mistral-medium-3.1",
+            ),
+            (
+                None,
+                Request(model="openai/gpt-oss-20b:free", model_deployment="openrouter/gpt-oss-20b:free"),
+                "openai/gpt-oss-20b:free",
+            ),
+        ],
+    )
+    def test_get_model_for_request(self, model_name, test_input, expected_model):
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=SqliteCacheConfig(self.cache_path),
+            model_name=model_name,
+            api_key="test_key",
+        )
+        assert client._get_model_for_request(test_input) == expected_model
+
+    def test_api_key_env_var(self, monkeypatch):
+        monkeypatch.setenv("OPENROUTER_API_KEY", "test_key")
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=SqliteCacheConfig(self.cache_path),
+        )
+        assert client.api_key == "test_key"
+
+    def test_api_key_argument(self):
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=BlackHoleCacheConfig(),
+            api_key="explicit_key",
+        )
+        assert client.api_key == "explicit_key"
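The parametrized test above pins down the model-resolution rule: a model_name passed to the OpenRouterClient constructor takes precedence, and otherwise the model named in the Request is sent to OpenRouter. A minimal sketch of that dispatch, inferred from the test expectations rather than copied from openrouter_client.py:

# Sketch of the model-resolution rule exercised by test_get_model_for_request.
# Inferred from the test cases; the real logic lives in
# helm/clients/openrouter_client.py and may differ in detail.
from typing import Optional

from helm.common.request import Request


def get_model_for_request(request: Request, model_name: Optional[str]) -> str:
    # A constructor-supplied model_name (first test case) overrides the request;
    # otherwise the request's own model (second test case) is sent to OpenRouter.
    return model_name if model_name is not None else request.model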
--- a/helm/clients/together_client.py
+++ b/helm/clients/together_client.py
@@ -9,6 +9,7 @@ import requests
 from retrying import retry
 
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
 from helm.common.object_spec import get_class_by_name
 from helm.common.optional_dependencies import handle_module_not_found_error
@@ -25,8 +26,6 @@ except ModuleNotFoundError as e:
 class _RewriteRequestTags:
     """Tags that indicate that the request for the model must be rewritten before sending to Together."""
 
-    # TODO: Convert to StrEnum after upgrading to Python 3.11
-
     ADD_EOS_TOKEN_AS_STOP_SEQUENCE = "ADD_EOS_TOKEN_AS_STOP_SEQUENCE"
     """Indicates that the EOS token should be added as an extra stop sequence.
 
@@ -101,7 +100,20 @@ class JobNotFinishedError(TogetherClientError):
     pass
 
 
-def _parse_thinking(input: str) -> Tuple[str, str]:
+def _parse_thinking_deepseek_r1(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking_qwen3(input: str) -> Tuple[str, str]:
     """Return a tuple of thinking text and output text."""
     match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
     if match:
@@ -114,6 +126,31 @@ def _parse_thinking(input: str) -> Tuple[str, str]:
     return (input, "")
 
 
+def _parse_thinking_glm_4_5(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"\n<think>(.*)</think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"\n<think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking(input: str, model_name: str) -> Tuple[str, str]:
+    # TODO: Come up with a more sustainable extensible way of doing this.
+    if "deepseek-r1" in model_name:
+        return _parse_thinking_deepseek_r1(input)
+    elif "qwen3" in model_name:
+        return _parse_thinking_qwen3(input)
+    elif "glm-4.5" in model_name:
+        return _parse_thinking_glm_4_5(input)
+    else:
+        raise Exception(f"No thinking parser available for model {model_name}")
+
+
 class TogetherClient(CachingClient):
     """
     Client for the models where we evaluate offline. Since the queries are handled offline, the `TogetherClient` just
@@ -237,6 +274,7 @@ class TogetherClient(CachingClient):
         try:
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it_sync))
         except Exception as error:
+            hexception(error)
             return RequestResult(
                 success=False,
                 cached=False,
@@ -348,9 +386,8 @@ class TogetherChatClient(CachingClient):
         self._client = Together(api_key=api_key)
         self._together_model = together_model
         self._disable_logprobs = bool(disable_logprobs)
-        # self.output_processor is actually a function, not a class
         self._parse_thinking = bool(parse_thinking)
-
+        # self.output_processor is actually a function, not a class
         self.output_processor: Optional[Callable[[str], str]] = (
             get_class_by_name(output_processor) if output_processor else None
         )
@@ -420,6 +457,7 @@ class TogetherChatClient(CachingClient):
             raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
             response = ChatCompletionResponse.model_validate(raw_response)
         except Exception as error:
+            hexception(error)
             return RequestResult(
                 success=False,
                 cached=False,
@@ -446,15 +484,15 @@ class TogetherChatClient(CachingClient):
             if self.output_processor:
                 output_text = self.output_processor(output_text)
 
+            thinking: Optional[Thinking] = None
             if self._parse_thinking:
-                thinking_text, output_text = _parse_thinking(output_text)
-                generated_outputs.append(
-                    GeneratedOutput(
-                        text=output_text, logprob=logprob, tokens=tokens, thinking=Thinking(text=thinking_text)
-                    )
-                )
-            else:
-                generated_outputs.append(GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens))
+                thinking_text, output_text = _parse_thinking(output_text, request.model)
+                thinking = Thinking(text=thinking_text)
+            elif hasattr(choice.message, "reasoning_content"):
+                thinking = Thinking(text=choice.message.reasoning_content)
+            generated_outputs.append(
+                GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens, thinking=thinking)
+            )
 
         return RequestResult(
             success=True,
             cached=cached,
@@ -527,6 +565,7 @@ class TogetherCompletionClient(CachingClient):
             raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
             response = CompletionResponse.model_validate(raw_response)
         except Exception as error:
+            hexception(error)
             return RequestResult(
                 success=False,
                 cached=False,
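The per-model _parse_thinking_* helpers above differ only in the whitespace and tag layout each model family emits around <think> blocks. A quick self-contained illustration of the DeepSeek-R1 variant, reusing the exact regexes from the diff (the sample strings are invented):

import re
from typing import Tuple


def _parse_thinking_deepseek_r1(input: str) -> Tuple[str, str]:
    """Return a tuple of thinking text and output text (regexes copied from the diff above)."""
    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
    if match:
        return (match.group(1), match.group(2))
    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
    if match:
        return (match.group(1), "")
    return (input, "")


# A completed generation yields both parts; a generation truncated before
# </think> is treated as all thinking and no output.
assert _parse_thinking_deepseek_r1("<think>\nstep 1\n</think>\n\n42") == ("step 1", "42")
assert _parse_thinking_deepseek_r1("<think>\nstep 1") == ("step 1", "")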
--- a/helm/clients/vertexai_client.py
+++ b/helm/clients/vertexai_client.py
@@ -1,9 +1,10 @@
 import requests
 from abc import ABC, abstractmethod
 from threading import Lock
-from typing import Any, Dict, Mapping, Optional, List, Union
+from typing import Any, Dict, Mapping, Optional, List, Union, cast
 
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.multimodal_request_utils import get_contents_as_bytes
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
@@ -107,7 +108,7 @@ class VertexAITextClient(VertexAIClient):
 
     def make_request(self, request: Request) -> RequestResult:
         """Make a request"""
-        parameters = {
+        parameters: Dict[str, Any] = {
             "temperature": request.temperature,
             "max_output_tokens": request.max_tokens,
             "top_k": request.top_k_per_token,
@@ -152,6 +153,7 @@ class VertexAITextClient(VertexAIClient):
 
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except (requests.exceptions.RequestException, AssertionError) as e:
+            hexception(e)
            error: str = f"VertexAITextClient error: {e}"
            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
@@ -207,21 +209,23 @@ class VertexAIChatClient(VertexAIClient):
 
     def make_request(self, request: Request) -> RequestResult:
         """Make a request"""
-        contents = [request.prompt]
+        # mypy is unhappy without this cast
+        contents: Union[List[Union[str, Image, Part]], List[Content]] = cast(
+            List[Union[str, Image, Part]], [request.prompt]
+        )
 
         # For the multimodal case, build up the content with the media objects of `request.multimodal_prompt`
         if request.multimodal_prompt is not None:
             return self._make_multimodal_request(request)
 
         if request.messages is not None:
-            contents = []
             role_mapping = {"user": "user", "assistant": "model"}
-            for msg in request.messages:
-                contents.append(
-                    Content(role=role_mapping.get(msg["role"], "user"), parts=[Part.from_text(msg["content"])])
-                )
+            contents = [
+                Content(role=role_mapping.get(msg["role"], "user"), parts=[Part.from_text(msg["content"])])
+                for msg in request.messages
+            ]
 
-        parameters = {
+        parameters: Dict[str, Any] = {
             "temperature": request.temperature,
             "max_output_tokens": request.max_tokens,
             "top_k": request.top_k_per_token,
@@ -274,8 +278,14 @@ class VertexAIChatClient(VertexAIClient):
             if not candidate.content:
                 raise VertexAIContentBlockedError(f"No content in candidate: {candidate}")
             if not candidate.content.parts:
-                raise VertexAIContentBlockedError(f"No content parts in candidate: {candidate}")
-            predictions.append({"text": candidate.content.text})
+                if candidate.finish_reason == 2:  # MAX_TOKENS
+                    # This means that there is no text output because the maximum number of tokens were
+                    # reached during thinking.
+                    predictions.append({"text": ""})
+                else:
+                    raise VertexAIContentBlockedError(f"No content parts in candidate: {candidate}")
+            else:
+                predictions.append({"text": candidate.content.text})
             # TODO: Extract more information from the response
             return {"predictions": predictions}
 
@@ -302,6 +312,7 @@ class VertexAIChatClient(VertexAIClient):
                 error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
             )
         except (requests.exceptions.RequestException, AssertionError) as e:
+            hexception(e)
            error: str = f"VertexAITextClient error: {e}"
            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
@@ -432,6 +443,7 @@ class VertexAIChatClient(VertexAIClient):
             cache_key = self.make_cache_key_with_safety_settings_preset(raw_cache_key, request)
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except requests.exceptions.RequestException as e:
+            hexception(e)
            error: str = f"Gemini Vision error: {e}"
            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
        except VertexAIContentBlockedError as e:
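The new branch in VertexAIChatClient treats finish_reason == 2 as Vertex AI's MAX_TOKENS, i.e. the token budget was exhausted while the model was still thinking, so an empty completion is returned instead of raising VertexAIContentBlockedError. A condensed sketch of that logic (the candidate argument stands in for a Vertex AI candidate object, not a real import):

# Condensed restatement of the new fallback. finish_reason == 2 corresponds to
# MAX_TOKENS in Vertex AI's FinishReason enum; "candidate" is a stand-in for
# the vertexai Candidate object.
def extract_prediction(candidate) -> dict:
    if not candidate.content.parts:
        if candidate.finish_reason == 2:  # MAX_TOKENS: budget spent while thinking
            return {"text": ""}
        raise ValueError(f"No content parts in candidate: {candidate}")
    return {"text": candidate.content.text}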
--- a/helm/clients/vision_language/huggingface_vision2seq_client.py
+++ b/helm/clients/vision_language/huggingface_vision2seq_client.py
@@ -8,7 +8,7 @@ import torch
 
 from helm.common.cache import CacheConfig
 from helm.common.gpu_utils import get_torch_device_name, is_cuda_available
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
 from helm.common.request import wrap_request_time
@@ -125,6 +125,7 @@ class HuggingFaceVision2SeqClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as model_error:
+            hexception(model_error)
             return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])
 
         for text in result["output"]:
--- a/helm/clients/vision_language/huggingface_vlm_client.py
+++ b/helm/clients/vision_language/huggingface_vlm_client.py
@@ -5,6 +5,7 @@ from transformers import pipeline
 from transformers.pipelines import ImageToTextPipeline
 
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.images_utils import open_image
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
@@ -93,6 +94,7 @@ class HuggingFaceVLMClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as e:
+            hexception(e)
             return RequestResult(success=False, cached=False, error=str(e), completions=[], embedding=[])
 
         output: str = result["generated_text"]
--- a/helm/clients/vision_language/idefics_client.py
+++ b/helm/clients/vision_language/idefics_client.py
@@ -8,7 +8,7 @@ from transformers import IdeficsForVisionText2Text, AutoProcessor, IdeficsProcessor
 from helm.common.cache import CacheConfig
 from helm.common.images_utils import open_image
 from helm.common.gpu_utils import get_torch_device_name
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
@@ -137,6 +137,7 @@ class IDEFICSClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as model_error:
+            hexception(model_error)
             return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])
 
         for text in result["output"]:
--- a/helm/clients/vision_language/open_flamingo_client.py
+++ b/helm/clients/vision_language/open_flamingo_client.py
@@ -5,7 +5,7 @@ import torch
 from huggingface_hub import hf_hub_download
 
 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.images_utils import open_image
 from helm.common.gpu_utils import get_torch_device_name
 from helm.common.media_object import TEXT_TYPE
@@ -131,6 +131,7 @@ class OpenFlamingoClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as ex:
+            hexception(ex)
             return RequestResult(success=False, cached=False, error=str(ex), completions=[], embedding=[])
 
         completions: List[GeneratedOutput] = []
--- a/helm/clients/vision_language/paligemma_client.py
+++ b/helm/clients/vision_language/paligemma_client.py
@@ -8,7 +8,7 @@ from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
 from helm.common.cache import CacheConfig
 from helm.common.images_utils import open_image
 from helm.common.gpu_utils import get_torch_device_name
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
@@ -126,6 +126,7 @@ class PaliGemmaClient(CachingClient):
                 result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
                 concat_results.append(result)
             except RuntimeError as model_error:
+                hexception(model_error)
                 return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])
 
         for result in concat_results:
--- a/helm/clients/vision_language/palmyra_vision_client.py
+++ b/helm/clients/vision_language/palmyra_vision_client.py
@@ -5,6 +5,7 @@ import requests
 
 from helm.common.cache import CacheConfig
 from helm.common.images_utils import encode_base64
+from helm.common.hierarchical_logger import hexception
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, ErrorFlags
 from helm.common.request import wrap_request_time
@@ -76,6 +77,7 @@ class PalmyraVisionClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except PalmyraVisionContentBlockedError as ex:
+            hexception(ex)
             return RequestResult(
                 success=False,
                 cached=False,
--- a/helm/clients/vision_language/qwen2_vlm_client.py
+++ b/helm/clients/vision_language/qwen2_vlm_client.py
@@ -8,7 +8,7 @@ import torch
 
 from helm.common.cache import CacheConfig
 from helm.common.gpu_utils import get_torch_device_name
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
 from helm.common.request import wrap_request_time
@@ -157,6 +157,7 @@ class Qwen2VLMClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as model_error:
+            hexception(model_error)
             return RequestResult(
                 success=False,
                 cached=False,
--- a/helm/clients/vision_language/qwen_vlm_client.py
+++ b/helm/clients/vision_language/qwen_vlm_client.py
@@ -7,7 +7,7 @@ from transformers.generation import GenerationConfig
 
 from helm.common.cache import CacheConfig
 from helm.common.gpu_utils import get_torch_device_name
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
 from helm.common.request import wrap_request_time
@@ -139,6 +139,7 @@ class QwenVLMClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as model_error:
+            hexception(model_error)
             return RequestResult(
                 success=False, cached=False, error=str(model_error), completions=[], embedding=[]
             )
--- a/helm/clients/vllm_client.py
+++ b/helm/clients/vllm_client.py
@@ -2,7 +2,7 @@ from typing import Any, Dict, Optional
 
 from helm.common.cache import CacheConfig
 from helm.common.request import Request
-from helm.clients.openai_client import OpenAILegacyCompletionsClient
+from helm.clients.openai_client import OpenAIClient, OpenAILegacyCompletionsClient
 from helm.tokenizers.tokenizer import Tokenizer
 
 
@@ -19,6 +19,8 @@ class VLLMClient(OpenAILegacyCompletionsClient):
         tokenizer_name: str,
         cache_config: CacheConfig,
         base_url: Optional[str] = None,
+        vllm_model_name: Optional[str] = None,
+        **kwargs,
     ):
         super().__init__(
             tokenizer=tokenizer,
@@ -27,18 +29,52 @@ class VLLMClient(OpenAILegacyCompletionsClient):
             api_key="EMPTY",
             org_id=None,
             base_url=base_url,
+            openai_model_name=vllm_model_name,
+            **kwargs,
         )
         self.tokenizer = tokenizer
         self.tokenizer_name = tokenizer_name
-
-    def _get_model_for_request(self, request: Request) -> str:
-        # The `model` parameter for vLLM should be the whole model name including the creator organization,
-        # unlike OpenAI which only uses the model engine.
-        return request.model
+        self.vllm_model_name = vllm_model_name
 
     def _to_raw_completion_request(self, request: Request) -> Dict[str, Any]:
         raw_request = super()._to_raw_completion_request(request)
         # This avoids the error: best_of must be 1 when using greedy sampling
-        if "best_of" in raw_request and raw_request["best_of"] > 1:
+        if (
+            "temperature" in raw_request
+            and raw_request["temperature"] == 0.0
+            and "best_of" in raw_request
+            and raw_request["best_of"] > 1
+        ):
             raw_request["best_of"] = 1
         return raw_request
+
+
+class VLLMChatClient(OpenAIClient):
+    """Sends request to a vLLM server using the OpenAI-compatible API.
+
+    Only uses the Chat Completions API.
+
+    See: https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server"""
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        base_url: Optional[str] = None,
+        vllm_model_name: Optional[str] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            tokenizer=tokenizer,
+            tokenizer_name=tokenizer_name,
+            cache_config=cache_config,
+            api_key="EMPTY",
+            org_id=None,
+            base_url=base_url,
+            openai_model_name=vllm_model_name,
+            **kwargs,
+        )
+        self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer_name
+        self.vllm_model_name = vllm_model_name
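A usage sketch for the new VLLMChatClient, assuming a locally running vLLM OpenAI-compatible server; the base URL and model name below are illustrative placeholders (HELM normally builds clients from model_deployments.yaml rather than by hand like this):

# Illustrative wiring of VLLMChatClient against a local vLLM server.
from helm.clients.vllm_client import VLLMChatClient
from helm.common.cache import BlackHoleCacheConfig
from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

tokenizer_name = "meta-llama/Llama-3.1-8B-Instruct"  # example value only
client = VLLMChatClient(
    tokenizer=HuggingFaceTokenizer(cache_config=BlackHoleCacheConfig(), tokenizer_name=tokenizer_name),
    tokenizer_name=tokenizer_name,
    cache_config=BlackHoleCacheConfig(),
    base_url="http://localhost:8000/v1",  # example value only
    vllm_model_name="meta-llama/Llama-3.1-8B-Instruct",  # forwarded as openai_model_name
)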
--- /dev/null
+++ b/helm/clients/vllm_granite_thinking_client.py
@@ -0,0 +1,56 @@
+from dataclasses import replace
+import re
+from typing import Any, Dict, List, Tuple
+
+from helm.clients.vllm_client import VLLMChatClient
+from helm.common.request import GeneratedOutput, Request, RequestResult, Thinking
+
+
+class VLLMGraniteThinkingClient(VLLMChatClient):
+    """Sends request to a Granite model on vLLM server with thinking enabled.
+
+    From vLLM documentation at
+    https://docs.vllm.ai/en/v0.9.1/features/reasoning_outputs.html
+
+    IBM Granite 3.2 reasoning is disabled by default;
+    to enable it, you must also pass thinking=True in your chat_template_kwargs.
+    """
+
+    def _make_chat_raw_request(self, request: Request) -> Dict[str, Any]:
+        raw_request = super()._make_chat_raw_request(request)
+        raw_request["extra_body"] = {"chat_template_kwargs": {"thinking": True}}
+        return raw_request
+
+    def _parse_thinking(self, input: str) -> Tuple[str, str]:
+        """Return a tuple of thinking text and output text."""
+        match = re.match(r"<think>(.*)</think>\s*<response>(.*)</response>", input, re.DOTALL)
+        if match:
+            return (match.group(1), match.group(2))
+
+        match = re.match(r"<think>(.*)</think>\s*<response>(.*)", input, re.DOTALL)
+        if match:
+            return (match.group(1), match.group(2))
+
+        match = re.match(r"<think>(.*)</think>\s*", input, re.DOTALL)
+        if match:
+            return (match.group(1), "")
+
+        match = re.match(r"<think>(.*)", input, re.DOTALL)
+        if match:
+            return (match.group(1), "")
+
+        return (input, "")
+
+    def _make_chat_request(self, request: Request) -> RequestResult:
+        request_result = super()._make_chat_request(request)
+        modified_completions: List[GeneratedOutput] = []
+        for completion in request_result.completions:
+            thinking, modified_text = self._parse_thinking(completion.text)
+            modified_completions.append(
+                replace(
+                    completion,
+                    text=modified_text,
+                    thinking=Thinking(text=thinking),
+                )
+            )
+        return replace(request_result, completions=modified_completions)
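Unlike the Together parsers, Granite wraps the answer in an explicit <response> block, so _parse_thinking tries the most specific pattern first and degrades gracefully on truncated output. For example, the first pattern above behaves as follows (sample text invented for illustration):

import re

# First pattern from _parse_thinking above, applied to a fully formed Granite completion.
sample = "<think>weigh the options</think>\n<response>final answer</response>"
match = re.match(r"<think>(.*)</think>\s*<response>(.*)</response>", sample, re.DOTALL)
assert match is not None
assert match.group(1) == "weigh the options"
assert match.group(2) == "final answer"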
--- a/helm/clients/writer_client.py
+++ b/helm/clients/writer_client.py
@@ -2,8 +2,10 @@ from typing import Any, Dict, List, Mapping, Optional
 
 from helm.clients.client import CachingClient
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
+from helm.proxy.retry import NonRetriableException
 
 try:
     from writerai import Writer
@@ -19,9 +21,9 @@ class WriterClient(CachingClient):
 
     def _get_messages_from_request(self, request: Request) -> List[Dict]:
         if request.prompt and request.messages:
-            raise ValueError(f"Only one of `prompt` and `messages` may be set in request: {request}")
+            raise NonRetriableException(f"Only one of `prompt` and `messages` may be set in request: {request}")
         if request.multimodal_prompt:
-            raise ValueError("`multimodal_prompt` is not supported by WriterClient")
+            raise NonRetriableException("`multimodal_prompt` is not supported by WriterClient")
         if request.messages:
             return [{"role": message["role"], "content": message["content"]} for message in request.messages]
         else:
@@ -82,6 +84,7 @@ class WriterClient(CachingClient):
             raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
             chat_completion: ChatCompletion = ChatCompletion.model_validate(raw_response)
         except Exception as error:
+            hexception(error)
             return RequestResult(
                 success=False,
                 cached=False,
--- a/helm/common/critique_request.py
+++ b/helm/common/critique_request.py
@@ -6,7 +6,6 @@ from helm.common.media_object import MediaObject
 class QuestionType:
     """String enum of question types."""
 
-    # TODO: Make this a StrEnum after upgrading to Python 3.11
     MULTIPLE_CHOICE: str = "multiple_choice"
     CHECKBOX: str = "checkbox"
     FREE_RESPONSE: str = "free_response"