crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm has been flagged as potentially problematic; see the package's advisory page on the registry for details.

Files changed (394):
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
@@ -8,7 +8,7 @@ from transformers.generation.stopping_criteria import (
 from typing import Any, Dict, List, Optional, TypedDict
 
 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import htrack_block, hlog, hwarn
+from helm.common.hierarchical_logger import hexception, htrack_block, hlog, hwarn
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import (
     wrap_request_time,
@@ -293,12 +293,12 @@ class HuggingFaceClient(CachingClient):
         if self._apply_chat_template:
             with self._wrapped_tokenizer as tokenizer:
                 if request.messages:
-                    prompt = tokenizer.apply_chat_template(request.messages, tokenize=False)
+                    prompt = tokenizer.apply_chat_template(request.messages, tokenize=False, add_generation_prompt=True)
                     assert isinstance(prompt, str)
                     return prompt
                 else:
                     prompt = tokenizer.apply_chat_template(
-                        [{"role": "user", "content": request.prompt}], tokenize=False
+                        [{"role": "user", "content": request.prompt}], tokenize=False, add_generation_prompt=True
                     )
                     assert isinstance(prompt, str)
                     return prompt
@@ -345,6 +345,7 @@ class HuggingFaceClient(CachingClient):
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except Exception as e:  # Do something if error is encountered.
             error: str = f"HuggingFace error: {e}"
+            hexception(e)
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
         completions = []
@@ -1,7 +1,7 @@
 from abc import ABC
 from abc import abstractmethod
 
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hexception, hlog
 from helm.common.cache import CacheConfig
 from helm.common.request import (
     Request,
@@ -249,6 +249,7 @@ class IbmChatClient(IbmClient):
             )
 
         except Exception as e:
+            hexception(e)
             error: str = f"IBM Chat client Model error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
@@ -263,5 +264,6 @@ class IbmTextClient(IbmClient):
                 inference_handler=GenerateInferenceHandler(inference_engine=self.inference_engine), request=request
             )
         except Exception as e:
+            hexception(e)
             error: str = f"IBM Text client Model error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
@@ -1,6 +1,7 @@
 from typing import List, Dict
 
 from helm.common.cache import Cache, CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.request import Request, RequestResult, GeneratedOutput
 from helm.common.tokenization_request import (
     TokenizationRequest,
@@ -54,6 +55,7 @@ class AdobeVisionClient(Client):
 
             response, cached = self._cache.get(cache_key, fail)
         except RuntimeError as e:
+            hexception(e)
             error: str = f"Adobe Vision Client error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
@@ -1,6 +1,7 @@
 from typing import List, Dict
 
 from helm.common.cache import Cache, CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.request import Request, RequestResult, GeneratedOutput
 from helm.common.tokenization_request import (
     TokenizationRequest,
@@ -74,6 +75,7 @@ class AlephAlphaImageGenerationClient(Client):
 
             response, cached = self._cache.get(cache_key, fail)
         except RuntimeError as e:
+            hexception(e)
             error: str = f"AlephAlphaVisionClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
@@ -184,7 +184,7 @@ def sparse_attention_2d_light(
     attention_dropout=None,
     log_attention_weights=None,
     add_scalar=0,
-    **kwargs
+    **kwargs,
 ):
     """
     q0, k0, v0: [batch_size, 1088, hidden_size]
@@ -9,7 +9,7 @@ from torchvision.utils import save_image
 
 from helm.common.cache import CacheConfig, Cache
 from helm.common.file_caches.file_cache import FileCache
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
 from helm.common.tokenization_request import (
@@ -167,6 +167,7 @@ class CogView2Client(Client):
             )
             results, cached = self._cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as e:
+            hexception(e)
             error: str = f"CogView2Client error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
@@ -4,6 +4,7 @@ import base64
 from helm.common.cache import CacheConfig, Cache
 from helm.common.general import hlog
 from helm.common.file_caches.file_cache import FileCache
+from helm.common.hierarchical_logger import hexception
 from helm.common.media_object import MultimediaObject
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
@@ -124,6 +125,7 @@ class DALLE2Client(Client):
124
125
  hlog(f"Failed safety check: {request.prompt}")
125
126
  return self.get_content_policy_violated_result(request)
126
127
  else:
128
+ hexception(error)
127
129
  return RequestResult(
128
130
  success=False, cached=False, error=f"DALL-E error: {error}", completions=[], embedding=[]
129
131
  )
@@ -5,7 +5,7 @@ from functools import partial
5
5
 
6
6
  from helm.common.cache import CacheConfig, Cache
7
7
  from helm.common.file_caches.file_cache import FileCache
8
- from helm.common.hierarchical_logger import hlog, htrack_block
8
+ from helm.common.hierarchical_logger import hexception, hlog, htrack_block
9
9
  from helm.common.optional_dependencies import handle_module_not_found_error
10
10
  from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
11
11
  from helm.common.tokenization_request import (
@@ -166,6 +166,7 @@ class DALLEMiniClient(Client):
166
166
  )
167
167
  results, cached = self._cache.get(cache_key, wrap_request_time(do_it))
168
168
  except RuntimeError as e:
169
+ hexception(e)
169
170
  error: str = f"DALLEMiniClient error: {e}"
170
171
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
171
172
 
@@ -1,6 +1,7 @@
1
1
  from typing import List, Dict
2
2
 
3
3
  from helm.common.cache import Cache, CacheConfig
4
+ from helm.common.hierarchical_logger import hexception
4
5
  from helm.common.request import Request, RequestResult, GeneratedOutput
5
6
  from helm.common.tokenization_request import (
6
7
  TokenizationRequest,
@@ -54,6 +55,7 @@ class DeepFloydClient(Client):
54
55
 
55
56
  response, cached = self._cache.get(cache_key, fail)
56
57
  except RuntimeError as e:
58
+ hexception(e)
57
59
  error: str = f"DeepFloyd Client error: {e}"
58
60
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
59
61
 
@@ -7,7 +7,7 @@ import torch
7
7
  from helm.common.cache import CacheConfig, Cache
8
8
  from helm.common.file_caches.file_cache import FileCache
9
9
  from helm.common.gpu_utils import get_torch_device_name, is_cuda_available
10
- from helm.common.hierarchical_logger import hlog, htrack_block
10
+ from helm.common.hierarchical_logger import hexception, hlog, htrack_block
11
11
  from helm.common.optional_dependencies import handle_module_not_found_error
12
12
  from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
13
13
  from helm.common.tokenization_request import (
@@ -178,6 +178,7 @@ class HuggingFaceDiffusersClient(Client):
178
178
  )
179
179
  results, cached = self._cache.get(cache_key, wrap_request_time(do_it))
180
180
  except RuntimeError as ex:
181
+ hexception(ex)
181
182
  error: str = f"HuggingFaceDiffusersClient error: {ex}"
182
183
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
183
184
 
@@ -5,6 +5,7 @@ import urllib.parse
5
5
 
6
6
  from helm.common.cache import CacheConfig, Cache
7
7
  from helm.common.file_caches.file_cache import FileCache
8
+ from helm.common.hierarchical_logger import hexception
8
9
  from helm.common.images_utils import encode_base64
9
10
  from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
10
11
  from helm.common.tokenization_request import (
@@ -62,6 +63,7 @@ class LexicaClient(Client):
62
63
 
63
64
  response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
64
65
  except RuntimeError as e:
66
+ hexception(e)
65
67
  error: str = f"LexicaClient error: {e}"
66
68
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
67
69
 
@@ -141,7 +141,7 @@ class Encoder(nn.Module):
141
141
  in_channels: int,
142
142
  resolution: int,
143
143
  z_channels: int,
144
- double_z: Optional[bool] = None
144
+ double_z: Optional[bool] = None,
145
145
  ) -> None:
146
146
  super().__init__()
147
147
  self.ch = ch
@@ -232,7 +232,7 @@ class Decoder(nn.Module):
232
232
  in_channels: int,
233
233
  resolution: int,
234
234
  z_channels: int,
235
- double_z: bool
235
+ double_z: bool,
236
236
  ) -> None:
237
237
  super().__init__()
238
238
  self.ch = ch
@@ -5,7 +5,7 @@ import numpy as np
5
5
  from helm.common.cache import CacheConfig, Cache
6
6
  from helm.common.file_caches.file_cache import FileCache
7
7
  from helm.common.gpu_utils import get_torch_device_name
8
- from helm.common.hierarchical_logger import hlog, htrack_block
8
+ from helm.common.hierarchical_logger import hexception, hlog, htrack_block
9
9
  from helm.common.optional_dependencies import handle_module_not_found_error
10
10
  from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
11
11
  from helm.common.tokenization_request import (
@@ -91,6 +91,7 @@ class MinDALLEClient(Client):
91
91
  )
92
92
  results, cached = self._cache.get(cache_key, wrap_request_time(do_it))
93
93
  except RuntimeError as ex:
94
+ hexception(ex)
94
95
  error: str = f"MinDALLEClient error: {ex}"
95
96
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
96
97
 
@@ -4,6 +4,7 @@ import requests
4
4
 
5
5
  from helm.common.cache import CacheConfig, Cache
6
6
  from helm.common.file_caches.file_cache import FileCache
7
+ from helm.common.hierarchical_logger import hexception
7
8
  from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
8
9
  from helm.common.tokenization_request import (
9
10
  TokenizationRequest,
@@ -84,6 +85,7 @@ class TogetherImageGenerationClient(Client):
84
85
 
85
86
  response, cached = self._cache.get(cache_key, wrap_request_time(do_it))
86
87
  except RuntimeError as e:
88
+ hexception(e)
87
89
  error: str = f"TogetherVisionClient error: {e}"
88
90
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
89
91
 
@@ -4,6 +4,7 @@ from typing import Any, Dict, List
4
4
  import traceback
5
5
  from helm.common.cache import CacheConfig
6
6
 
7
+ from helm.common.hierarchical_logger import hexception
7
8
  from helm.common.request import (
8
9
  wrap_request_time,
9
10
  EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
@@ -103,6 +104,7 @@ class MegatronClient(CachingClient):
103
104
  try:
104
105
  return self._make_request(request)
105
106
  except Exception as e:
107
+ hexception(e)
106
108
  return RequestResult(
107
109
  success=False,
108
110
  cached=False,
@@ -1,6 +1,7 @@
1
1
  import requests
2
2
  from typing import Any, Dict, List, Optional, TypedDict, Union
3
3
 
4
+ from helm.common.hierarchical_logger import hexception
4
5
  from helm.proxy.retry import NonRetriableException
5
6
  from helm.common.cache import CacheConfig
6
7
  from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
@@ -156,6 +157,7 @@ class MistralAIClient(CachingClient):
156
157
 
157
158
  response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
158
159
  except (requests.exceptions.RequestException, AssertionError) as e:
160
+ hexception(e)
159
161
  error: str = f"MistralClient error: {e}"
160
162
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
161
163
 
@@ -1,5 +1,6 @@
1
1
  from typing import Any, Dict
2
2
 
3
+ from helm.common.hierarchical_logger import hexception
3
4
  from helm.common.request import wrap_request_time
4
5
  from helm.common.cache import Cache, CacheConfig
5
6
  from helm.common.moderations_api_request import (
@@ -64,6 +65,7 @@ class ModerationAPIClient:
64
65
 
65
66
  response, cached = self.cache.get(raw_request, wrap_request_time(do_it))
66
67
  except openai.OpenAIError as e:
68
+ hexception(e)
67
69
  error: str = f"Moderation API error: {e}"
68
70
  return ModerationAPIRequestResult(
69
71
  success=False, cached=False, error=error, flagged=None, flagged_results=None, scores=None
@@ -10,7 +10,7 @@ from helm.common import multimodal_request_utils
10
10
  from helm.common.cache import CacheConfig
11
11
  from helm.common.media_object import TEXT_TYPE, MultimediaObject, MediaObject
12
12
  from helm.common.request import ErrorFlags, Thinking, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
13
- from helm.common.hierarchical_logger import hlog, hwarn
13
+ from helm.common.hierarchical_logger import hlog, hwarn, hexception
14
14
  from helm.common.object_spec import get_class_by_name
15
15
  from helm.common.optional_dependencies import handle_module_not_found_error
16
16
  from helm.common.tokenization_request import (
@@ -33,9 +33,12 @@ class OpenAIClientUtils:
33
33
  @classmethod
34
34
  def is_reasoning_model(cls, model_engine: str) -> bool:
35
35
  # All OpenAI reasoning models start "o[somenumber]", so we regexp for that to future proof things
36
- return bool(re.match(r"^o\d+", model_engine))
36
+ return bool(re.match(r"^o\d+", model_engine)) or bool(re.match(r"^gpt-5", model_engine))
37
37
 
38
38
  # Error OpenAI throws when the image in the prompt violates their content policy
39
+ HARMFUL_INFORMATION_ERROR: str = (
40
+ "Invalid prompt: we've limited access to this content for safety reasons. This type of information may be used to benefit or to harm people." # noqa: E501
41
+ )
39
42
  INAPPROPRIATE_IMAGE_ERROR: str = "Your input image may contain content that is not allowed by our safety system"
40
43
  INAPPROPRIATE_PROMPT_ERROR: str = "Invalid prompt: your prompt was flagged"
41
44
  INAPPROPRIATE_PROMPT_AZURE_ERROR: str = (
@@ -44,12 +47,10 @@ class OpenAIClientUtils:
44
47
  INAPPROPRIATE_PROMPT_MICROSOFT_ERROR: str = (
45
48
  "The response was filtered due to the prompt triggering Microsoft's content management policy."
46
49
  )
47
-
48
- # OpenAI server error
49
- OPENAI_SERVER_ERROR: str = (
50
- "The server had an error processing your request. Sorry about that! You can retry your request, "
51
- "or contact us through our help center at help.openai.com if you keep seeing this error."
52
- )
50
+ # Grok content safety guidelines error message
51
+ # TODO: Refactor so that this is owned by the Grok client instead.
52
+ SAFETY_GUIDELINES_GROK_ERROR: str = "Content violates safety guidelines."
53
+ USAGE_GUIDELINES_GROK_ERROR: str = "Content violates usage guidelines."
53
54
 
54
55
  # Set the finish reason to this if the prompt violates OpenAI's content policy
55
56
  CONTENT_POLICY_VIOLATED_FINISH_REASON: str = (
@@ -74,21 +75,14 @@ class OpenAIClientUtils:
74
75
  completions=[empty_completion] * request.num_completions,
75
76
  embedding=[],
76
77
  )
77
- elif cls.OPENAI_SERVER_ERROR in str(e):
78
- # Handle these errors by returning an empty completion to unblock
79
- hwarn(f"OpenAI server error for request: {str(request)}")
80
- empty_completion = GeneratedOutput(
81
- text="",
82
- logprob=0,
83
- tokens=[],
84
- finish_reason={"reason": cls.OPENAI_SERVER_ERROR},
85
- )
78
+ elif cls.HARMFUL_INFORMATION_ERROR in str(e):
86
79
  return RequestResult(
87
- success=True,
80
+ success=False,
88
81
  cached=False,
89
- request_time=0,
90
- completions=[empty_completion] * request.num_completions,
82
+ error="Prompt blocked by OpenAI's safety filter",
83
+ completions=[],
91
84
  embedding=[],
85
+ error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
92
86
  )
93
87
  elif cls.INAPPROPRIATE_PROMPT_AZURE_ERROR in str(e) or cls.INAPPROPRIATE_PROMPT_MICROSOFT_ERROR in str(e):
94
88
  return RequestResult(
@@ -99,7 +93,26 @@ class OpenAIClientUtils:
99
93
  embedding=[],
100
94
  error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
101
95
  )
96
+ elif cls.SAFETY_GUIDELINES_GROK_ERROR in str(e):
97
+ return RequestResult(
98
+ success=False,
99
+ cached=False,
100
+ error="Grok API error: Content violates safety guidelines",
101
+ completions=[],
102
+ embedding=[],
103
+ error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
104
+ )
105
+ elif cls.USAGE_GUIDELINES_GROK_ERROR in str(e):
106
+ return RequestResult(
107
+ success=False,
108
+ cached=False,
109
+ error="Grok API error: Content violates usage guidelines",
110
+ completions=[],
111
+ embedding=[],
112
+ error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
113
+ )
102
114
 
115
+ hexception(e)
103
116
  error: str = f"OpenAI error: {e}"
104
117
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
105
118
 
@@ -118,11 +131,12 @@ class OpenAIClient(CachingClient):
118
131
  reasoning_effort: Optional[str] = None,
119
132
  openai_model_name: Optional[str] = None,
120
133
  output_processor: Optional[str] = None,
134
+ **kwargs,
121
135
  ):
122
136
  super().__init__(cache_config=cache_config)
123
137
  self.tokenizer = tokenizer
124
138
  self.tokenizer_name = tokenizer_name
125
- self.client = OpenAI(api_key=api_key, organization=org_id, base_url=base_url)
139
+ self.client = OpenAI(api_key=api_key, organization=org_id, base_url=base_url, **kwargs)
126
140
  self.reasoning_effort = reasoning_effort
127
141
  self.openai_model_name = openai_model_name
128
142
  self.output_processor: Optional[Callable[[str], str]] = (
@@ -157,6 +171,7 @@ class OpenAIClient(CachingClient):
157
171
  cache_key = self._get_cache_key(raw_request, request)
158
172
  response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
159
173
  except openai.OpenAIError as e:
174
+ hexception(e)
160
175
  error: str = f"OpenAI error: {e}"
161
176
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
162
177
 
@@ -423,6 +438,7 @@ class OpenAIClient(CachingClient):
423
438
  cache_key = self._get_cache_key(raw_request, request)
424
439
  response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
425
440
  except openai.OpenAIError as e:
441
+ hexception(e)
426
442
  error: str = f"OpenAI error: {e}"
427
443
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
428
444
 
@@ -478,6 +494,7 @@ class OpenAIClient(CachingClient):
478
494
  cache_key = self._get_cache_key({"audio": audio_path, "model": model}, request)
479
495
  response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
480
496
  except openai.OpenAIError as e:
497
+ hexception(e)
481
498
  error: str = f"OpenAI error: {e}"
482
499
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
483
500
 
@@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional, Union
5
5
 
6
6
  from helm.clients.openai_client import OpenAIClientUtils
7
7
  from helm.common.cache import CacheConfig
8
+ from helm.common.hierarchical_logger import hwarn
8
9
  from helm.common.media_object import TEXT_TYPE
9
10
  from helm.common.request import (
10
11
  Thinking,
@@ -60,7 +61,28 @@ class OpenAIResponseClient(CachingClient):
60
61
 
61
62
  def _make_raw_request(self, request: Request) -> dict[str, Any]:
62
63
  input: Union[str, List[Dict[str, Any]]]
63
- if request.multimodal_prompt is not None:
64
+
65
+ if (
66
+ (request.prompt and request.messages)
67
+ or (request.prompt and request.multimodal_prompt)
68
+ or (request.messages and request.multimodal_prompt)
69
+ ):
70
+ raise ValueError(
71
+ f"More than one of `prompt`, `messages` and `multimodal_prompt` was set in request: {request}"
72
+ )
73
+
74
+ if request.messages is not None:
75
+ # Checks that all messages have a role and some content
76
+ for message in request.messages:
77
+ if not message.get("role") or not message.get("content"):
78
+ raise ValueError("All messages must have a role and content")
79
+ # Checks that the last role is "user"
80
+ if request.messages[-1]["role"] != "user":
81
+ raise ValueError("Last message must have role 'user'")
82
+ if request.prompt != "":
83
+ hwarn("Since message is set, prompt will be ignored")
84
+ input = request.messages
85
+ elif request.multimodal_prompt is not None:
64
86
  content = []
65
87
  request.validate()
66
88
  for media_object in request.multimodal_prompt.media_objects:
@@ -101,6 +123,8 @@ class OpenAIResponseClient(CachingClient):
101
123
  # Plus other changes
102
124
  model_engine: str = request.model_engine
103
125
  if OpenAIClientUtils.is_reasoning_model(model_engine):
126
+ if "reasoning" not in raw_request:
127
+ raw_request["reasoning"] = {}
104
128
  raw_request["reasoning"]["summary"] = "detailed"
105
129
  # Avoid error:
106
130
  # "Error code: 400 - {'error': {'message': "Unsupported parameter: 'temperature' is
@@ -145,13 +169,15 @@ class OpenAIResponseClient(CachingClient):
145
169
  if request.echo_prompt:
146
170
  text_output += request.prompt
147
171
  for output in response["output"]:
148
- output_type = output["type"] # one of "message" or "reasoning" from API observation
149
- is_reasoning_output = output_type == "reasoning"
150
-
151
- if is_reasoning_output:
152
- reasoning_output += "\n".join([raw_output["text"] for raw_output in output["summary"]])
153
- else:
154
- text_output += "\n".join([raw_output["text"] for raw_output in output["content"]])
172
+ output_type = output[
173
+ "type"
174
+ ] # one of "message" or "reasoning" from API observation, but can also include tool calls
175
+
176
+ if output_type == "reasoning":
177
+ reasoning_output += "\n\n".join([raw_output["text"] for raw_output in output["summary"]])
178
+ elif output_type == "message":
179
+ text_output += "\n\n".join([raw_output["text"] for raw_output in output["content"]])
180
+ # (Other output types are ignored)
155
181
 
156
182
  completion = truncate_and_tokenize_response_text(
157
183
  text_output,
@@ -0,0 +1,31 @@
1
+ import os
2
+ from typing import Optional
3
+ from helm.clients.openai_client import OpenAIClient
4
+ from helm.common.cache import CacheConfig
5
+ from helm.tokenizers.tokenizer import Tokenizer
6
+
7
+
8
+ class OpenRouterClient(OpenAIClient):
9
+ def __init__(
10
+ self,
11
+ tokenizer_name: str,
12
+ tokenizer: Tokenizer,
13
+ cache_config: CacheConfig,
14
+ api_key: Optional[str] = None,
15
+ model_name: Optional[str] = None,
16
+ output_processor: Optional[str] = None,
17
+ ):
18
+ self.api_key = api_key or os.getenv("OPENROUTER_API_KEY")
19
+ self.base_url = "https://openrouter.ai/api/v1/"
20
+ super().__init__(
21
+ tokenizer,
22
+ tokenizer_name,
23
+ cache_config=cache_config,
24
+ output_processor=output_processor,
25
+ base_url=self.base_url,
26
+ api_key=self.api_key,
27
+ )
28
+ self.model_name = model_name
29
+
30
+ def _get_model_for_request(self, request):
31
+ return self.model_name or request.model
@@ -5,7 +5,7 @@ from typing import Any, Dict, List
5
5
 
6
6
  from helm.clients.openai_client import OpenAIClient
7
7
  from helm.common.cache import CacheConfig
8
- from helm.common.hierarchical_logger import hwarn
8
+ from helm.common.hierarchical_logger import hexception, hwarn
9
9
  from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token, ErrorFlags
10
10
  from helm.common.tokenization_request import (
11
11
  TokenizationRequest,
@@ -99,6 +99,7 @@ class PalmyraClient(CachingClient):
99
99
 
100
100
  response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
101
101
  except (requests.exceptions.RequestException, AssertionError) as e:
102
+ hexception(e)
102
103
  error: str = f"PalmyraClient error: {e}"
103
104
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
104
105
 
@@ -6,7 +6,7 @@ from helm.proxy.retry import NonRetriableException
6
6
  from helm.common.cache import CacheConfig
7
7
  from helm.common.media_object import TEXT_TYPE
8
8
  from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput
9
- from helm.common.hierarchical_logger import hwarn
9
+ from helm.common.hierarchical_logger import hexception, hwarn
10
10
  from helm.common.optional_dependencies import handle_module_not_found_error
11
11
  from helm.tokenizers.tokenizer import Tokenizer
12
12
  from helm.clients.client import CachingClient, truncate_and_tokenize_response_text
@@ -167,6 +167,7 @@ class RekaClient(CachingClient):
167
167
 
168
168
  response, cached = self.cache.get(raw_request, wrap_request_time(do_it))
169
169
  except (requests.exceptions.RequestException, AssertionError) as e:
170
+ hexception(e)
170
171
  error: str = f"RekaClient error: {e}"
171
172
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
172
173
 
@@ -39,7 +39,7 @@ class StanfordHealthCareAzureOpenAIClient(AzureOpenAIClient):
39
39
  tokenizer=tokenizer,
40
40
  tokenizer_name=tokenizer_name,
41
41
  cache_config=cache_config,
42
- api_key="unused",
42
+ api_key=api_key,
43
43
  base_url=base_url,
44
44
  azure_openai_deployment_name=openai_model_name,
45
45
  api_version=api_version,
@@ -50,7 +50,7 @@ class StanfordHealthCareAzureOpenAIClient(AzureOpenAIClient):
50
50
  tokenizer=tokenizer,
51
51
  tokenizer_name=tokenizer_name,
52
52
  cache_config=cache_config,
53
- api_key="unused",
53
+ api_key=api_key,
54
54
  endpoint=endpoint,
55
55
  azure_openai_deployment_name=openai_model_name,
56
56
  api_version=api_version,
@@ -5,6 +5,7 @@ from dataclasses import asdict
5
5
  from typing import Any, Dict, List, Optional
6
6
 
7
7
  from helm.common.cache import CacheConfig
8
+ from helm.common.hierarchical_logger import hexception
8
9
  from helm.common.request import (
9
10
  wrap_request_time,
10
11
  Request,
@@ -82,6 +83,7 @@ class StanfordHealthCareHTTPModelClient(CachingClient, ABC):
82
83
  request_time=response["request_time"],
83
84
  )
84
85
  except requests.exceptions.RequestException as e:
86
+ hexception(e)
85
87
  return RequestResult(success=False, cached=False, error=f"Request error: {e}", completions=[], embedding=[])
86
88
 
87
89
  @abstractmethod
@@ -9,7 +9,7 @@ from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
9
9
  class TestHuggingFaceClient:
10
10
  def test_gpt2(self):
11
11
  tokenizer = HuggingFaceTokenizer(
12
- BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai/gpt2"
12
+ BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai-community/gpt2"
13
13
  )
14
14
  client = HuggingFaceClient(
15
15
  cache_config=BlackHoleCacheConfig(),
@@ -36,7 +36,7 @@ class TestHuggingFaceClient:
36
36
  @pytest.mark.skip(reason="GPT-J 6B is 22 GB and extremely slow without a GPU.")
37
37
  def test_gptj_6b(self):
38
38
  tokenizer = HuggingFaceTokenizer(
39
- BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai/gpt2"
39
+ BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai-community/gpt2"
40
40
  )
41
41
  client = HuggingFaceClient(
42
42
  cache_config=BlackHoleCacheConfig(),
@@ -57,7 +57,7 @@ class TestHuggingFaceClient:
57
57
 
58
58
  def test_logprob(self):
59
59
  tokenizer = HuggingFaceTokenizer(
60
- BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai/gpt2"
60
+ BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai-community/gpt2"
61
61
  )
62
62
  client = HuggingFaceClient(
63
63
  cache_config=BlackHoleCacheConfig(),