crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. See the registry's advisory page for more details.

Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
@@ -1,18 +1,23 @@
1
1
  # mypy: check_untyped_defs = False
2
2
  from dataclasses import replace
3
- from typing import Any, Dict, List, Optional, cast, Union
3
+ import re
4
+ from typing import Any, Dict, List, Optional, cast, Union, Callable
5
+
6
+ from openai import OpenAIError
4
7
 
5
8
  from helm.benchmark.model_metadata_registry import is_vlm
9
+ from helm.common import multimodal_request_utils
6
10
  from helm.common.cache import CacheConfig
7
- from helm.common.media_object import TEXT_TYPE
8
- from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
9
- from helm.common.hierarchical_logger import hlog
11
+ from helm.common.media_object import TEXT_TYPE, MultimediaObject, MediaObject
12
+ from helm.common.request import ErrorFlags, Thinking, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
13
+ from helm.common.hierarchical_logger import hlog, hwarn
14
+ from helm.common.object_spec import get_class_by_name
10
15
  from helm.common.optional_dependencies import handle_module_not_found_error
11
16
  from helm.common.tokenization_request import (
12
17
  TokenizationRequest,
13
18
  TokenizationRequestResult,
14
19
  )
15
- from .client import CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
20
+ from helm.clients.client import Client, CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
16
21
  from helm.tokenizers.tokenizer import Tokenizer
17
22
 
18
23
  try:
@@ -22,11 +27,29 @@ except ModuleNotFoundError as e:
22
27
  handle_module_not_found_error(e, ["openai"])
23
28
 
24
29
 
25
- class OpenAIClient(CachingClient):
26
- END_OF_TEXT: str = "<|endoftext|>"
30
+ class OpenAIClientUtils:
31
+ """Methods used by both the chat completions client and the responses API client"""
32
+
33
+ @classmethod
34
+ def is_reasoning_model(cls, model_engine: str) -> bool:
35
+ # All OpenAI reasoning models start "o[somenumber]", so we regexp for that to future proof things
36
+ return bool(re.match(r"^o\d+", model_engine))
27
37
 
28
38
  # Error OpenAI throws when the image in the prompt violates their content policy
29
39
  INAPPROPRIATE_IMAGE_ERROR: str = "Your input image may contain content that is not allowed by our safety system"
40
+ INAPPROPRIATE_PROMPT_ERROR: str = "Invalid prompt: your prompt was flagged"
41
+ INAPPROPRIATE_PROMPT_AZURE_ERROR: str = (
42
+ "The response was filtered due to the prompt triggering Azure OpenAI's content management policy."
43
+ )
44
+ INAPPROPRIATE_PROMPT_MICROSOFT_ERROR: str = (
45
+ "The response was filtered due to the prompt triggering Microsoft's content management policy."
46
+ )
47
+
48
+ # OpenAI server error
49
+ OPENAI_SERVER_ERROR: str = (
50
+ "The server had an error processing your request. Sorry about that! You can retry your request, "
51
+ "or contact us through our help center at help.openai.com if you keep seeing this error."
52
+ )
30
53
 
31
54
  # Set the finish reason to this if the prompt violates OpenAI's content policy
32
55
  CONTENT_POLICY_VIOLATED_FINISH_REASON: str = (
@@ -34,6 +57,56 @@ class OpenAIClient(CachingClient):
34
57
  "See https://labs.openai.com/policies/content-policy for more information."
35
58
  )
36
59
 
60
+ @classmethod
61
+ def handle_openai_error(cls, e: OpenAIError, request: Request):
62
+ if cls.INAPPROPRIATE_IMAGE_ERROR in str(e) or cls.INAPPROPRIATE_PROMPT_ERROR in str(e):
63
+ hwarn(f"Failed safety check: {str(request)}")
64
+ empty_completion = GeneratedOutput(
65
+ text="",
66
+ logprob=0,
67
+ tokens=[],
68
+ finish_reason={"reason": cls.CONTENT_POLICY_VIOLATED_FINISH_REASON},
69
+ )
70
+ return RequestResult(
71
+ success=True,
72
+ cached=False,
73
+ request_time=0,
74
+ completions=[empty_completion] * request.num_completions,
75
+ embedding=[],
76
+ )
77
+ elif cls.OPENAI_SERVER_ERROR in str(e):
78
+ # Handle these errors by returning an empty completion to unblock
79
+ hwarn(f"OpenAI server error for request: {str(request)}")
80
+ empty_completion = GeneratedOutput(
81
+ text="",
82
+ logprob=0,
83
+ tokens=[],
84
+ finish_reason={"reason": cls.OPENAI_SERVER_ERROR},
85
+ )
86
+ return RequestResult(
87
+ success=True,
88
+ cached=False,
89
+ request_time=0,
90
+ completions=[empty_completion] * request.num_completions,
91
+ embedding=[],
92
+ )
93
+ elif cls.INAPPROPRIATE_PROMPT_AZURE_ERROR in str(e) or cls.INAPPROPRIATE_PROMPT_MICROSOFT_ERROR in str(e):
94
+ return RequestResult(
95
+ success=False,
96
+ cached=False,
97
+ error="Content blocked by Azure's content management filter",
98
+ completions=[],
99
+ embedding=[],
100
+ error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
101
+ )
102
+
103
+ error: str = f"OpenAI error: {e}"
104
+ return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
105
+
106
+
107
+ class OpenAIClient(CachingClient):
108
+ END_OF_TEXT: str = "<|endoftext|>"
109
+
37
110
  def __init__(
38
111
  self,
39
112
  tokenizer: Tokenizer,
@@ -42,28 +115,31 @@ class OpenAIClient(CachingClient):
42
115
  api_key: Optional[str] = None,
43
116
  org_id: Optional[str] = None,
44
117
  base_url: Optional[str] = None,
118
+ reasoning_effort: Optional[str] = None,
119
+ openai_model_name: Optional[str] = None,
120
+ output_processor: Optional[str] = None,
45
121
  ):
46
122
  super().__init__(cache_config=cache_config)
47
123
  self.tokenizer = tokenizer
48
124
  self.tokenizer_name = tokenizer_name
49
125
  self.client = OpenAI(api_key=api_key, organization=org_id, base_url=base_url)
50
-
51
- def _is_chat_model_engine(self, model_engine: str) -> bool:
52
- if model_engine == "gpt-3.5-turbo-instruct":
53
- return False
54
- elif model_engine.startswith("gpt-3.5") or model_engine.startswith("gpt-4") or model_engine.startswith("o1"):
55
- return True
56
- return False
126
+ self.reasoning_effort = reasoning_effort
127
+ self.openai_model_name = openai_model_name
128
+ self.output_processor: Optional[Callable[[str], str]] = (
129
+ get_class_by_name(output_processor) if output_processor else None
130
+ )
57
131
 
58
132
  def _get_model_for_request(self, request: Request) -> str:
59
- return request.model_engine
133
+ return self.openai_model_name or request.model_engine
60
134
 
61
135
  def _get_cache_key(self, raw_request: Dict, request: Request):
62
136
  cache_key = CachingClient.make_cache_key(raw_request, request)
63
137
  if request.multimodal_prompt:
64
138
  prompt_key: str = generate_uid_for_multimodal_prompt(request.multimodal_prompt)
65
139
  cache_key = {**cache_key, "multimodal_prompt": prompt_key}
66
- del cache_key["messages"]
140
+
141
+ if "messages" in cache_key:
142
+ del cache_key["messages"]
67
143
  return cache_key
68
144
 
69
145
  def _make_embedding_request(self, request: Request) -> RequestResult:
@@ -100,7 +176,7 @@ class OpenAIClient(CachingClient):
100
176
  embedding=embedding,
101
177
  )
102
178
 
103
- def _make_chat_request(self, request: Request) -> RequestResult:
179
+ def _make_chat_raw_request(self, request: Request) -> Dict[str, Any]:
104
180
  messages: Optional[List[Dict[str, Union[str, Any]]]] = request.messages
105
181
  if (
106
182
  (request.prompt and request.messages)
@@ -119,7 +195,7 @@ class OpenAIClient(CachingClient):
119
195
  if request.messages[-1]["role"] != "user":
120
196
  raise ValueError("Last message must have role 'user'")
121
197
  if request.prompt != "":
122
- hlog("WARNING: Since message is set, prompt will be ignored")
198
+ hwarn("Since message is set, prompt will be ignored")
123
199
  else:
124
200
  # Convert prompt into a single message
125
201
  # For now, put the whole prompt in a single user message, and expect the response
@@ -140,6 +216,19 @@ class OpenAIClient(CachingClient):
140
216
  base64_image: str = encode_base64(media_object.location)
141
217
  image_object: Dict[str, str] = {"url": f"data:image/jpeg;base64,{base64_image}"}
142
218
  content.append({"type": "image_url", "image_url": image_object})
219
+ elif media_object.is_type("audio") and media_object.location:
220
+ base64_audio: str = multimodal_request_utils.get_contents_as_base64(media_object.location)
221
+ format: str = media_object.content_type.split("/")[1]
222
+ if format == "mpeg":
223
+ # OpenAI expects "mp3" for mpeg audio
224
+ format = "mp3"
225
+
226
+ content.append(
227
+ {
228
+ "type": "input_audio",
229
+ "input_audio": {"data": base64_audio, "format": format},
230
+ }
231
+ )
143
232
  elif media_object.is_type(TEXT_TYPE):
144
233
  content.append({"type": media_object.type, "text": media_object.text})
145
234
  else:
@@ -164,15 +253,35 @@ class OpenAIClient(CachingClient):
164
253
  "frequency_penalty": request.frequency_penalty,
165
254
  }
166
255
 
167
- # OpenAI's vision API doesn't allow None values for stop.
168
- # Fails with "body -> stop: none is not an allowed value" if None is passed.
169
- if is_vlm(request.model) and raw_request["stop"] is None:
170
- raw_request.pop("stop")
256
+ if request.response_format and request.response_format.json_schema:
257
+ # Copy and modify JSON schema to conform to OpenAI's requirements
258
+ json_schema = dict(request.response_format.json_schema)
259
+
260
+ # additionalProperties: false must always be set in objects
261
+ # https://platform.openai.com/docs/guides/structured-outputs#additionalproperties-false-must-always-be-set-in-objects
262
+ if "additionalProperties" not in json_schema:
263
+ json_schema["additionalProperties"] = False
264
+
265
+ # All fields must be required
266
+ # https://platform.openai.com/docs/guides/structured-outputs#all-fields-must-be-required
267
+ if "required" not in json_schema:
268
+ json_schema["required"] = list(json_schema["properties"].keys())
269
+
270
+ raw_request["response_format"] = {
271
+ "type": "json_schema",
272
+ "json_schema": {
273
+ "name": "response",
274
+ "description": "",
275
+ "schema": json_schema,
276
+ "strict": True,
277
+ },
278
+ }
171
279
 
172
280
  # Special handling for o1 models.
173
281
  # Refer to the "Reasoning models" documentation further discussion of o1 model limitations:
174
282
  # https://platform.openai.com/docs/guides/reasoning
175
- if request.model_engine.startswith("o1"):
283
+ model_engine: str = request.model_engine
284
+ if OpenAIClientUtils.is_reasoning_model(model_engine):
176
285
  # Avoid error:
177
286
  # "Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead." # noqa: E501
178
287
  # Note that openai>=1.45 is needed for this
@@ -184,6 +293,39 @@ class OpenAIClient(CachingClient):
184
293
  if raw_request["stop"] is None:
185
294
  raw_request.pop("stop")
186
295
 
296
+ # Avoid error:
297
+ # "Error code: 400 - {'error': {'message': "Unsupported parameter: 'temperature' is
298
+ # not supported with this model.", 'type': 'invalid_request_error', 'param': 'temperature',
299
+ # 'code': 'unsupported_parameter'}}"
300
+ raw_request.pop("temperature", None)
301
+
302
+ # The following parameters also happen to be unsupported by the o-series (code unsupported_parameter)
303
+ raw_request.pop("top_p", None)
304
+ raw_request.pop("frequency_penalty", None)
305
+ raw_request.pop("presence_penalty", None)
306
+
307
+ if self.reasoning_effort:
308
+ raw_request["reasoning_effort"] = self.reasoning_effort
309
+ elif is_vlm(request.model):
310
+ # Avoid error:
311
+ # "Invalid type for 'stop': expected an unsupported value, but got null instead."
312
+ if raw_request["stop"] is None:
313
+ raw_request.pop("stop")
314
+
315
+ # Special handling for gpt-4o-audio-preview
316
+ # See: https://platform.openai.com/docs/guides/audio
317
+ if model_engine.startswith("gpt-4o-audio-preview") or model_engine.startswith("gpt-4o-mini-audio-preview"):
318
+ raw_request["modalities"] = ["text"]
319
+
320
+ # Avoid error:
321
+ # OpenAI error: Error code: 400 - {'error': {'message': "[{'type': 'string_type', 'loc': ('body', 'stop', 'str'), 'msg': 'Input should be a valid string', 'input': None}, {'type': 'list_type', 'loc': ('body', 'stop', 'list[str]'), 'msg': 'Input should be a valid list', 'input': None}, {'type': 'list_type', 'loc': ('body', 'stop', 'list[list[int]]'), 'msg': 'Input should be a valid list', 'input': None}]", 'type': 'invalid_request_error', 'param': None, 'code': None}} # noqa: 3501
322
+ if raw_request["stop"] is None:
323
+ raw_request.pop("stop")
324
+ return raw_request
325
+
326
+ def _make_chat_request(self, request: Request) -> RequestResult:
327
+ raw_request = self._make_chat_raw_request(request)
328
+
187
329
  def do_it() -> Dict[str, Any]:
188
330
  return self.client.chat.completions.create(**raw_request).model_dump(mode="json")
189
331
 
@@ -191,30 +333,27 @@ class OpenAIClient(CachingClient):
191
333
  cache_key = self._get_cache_key(raw_request, request)
192
334
  response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
193
335
  except openai.OpenAIError as e:
194
- if self.INAPPROPRIATE_IMAGE_ERROR in str(e):
195
- hlog(f"Failed safety check: {str(request)}")
196
- empty_completion = GeneratedOutput(
197
- text="",
198
- logprob=0,
199
- tokens=[],
200
- finish_reason={"reason": self.CONTENT_POLICY_VIOLATED_FINISH_REASON},
201
- )
336
+ return OpenAIClientUtils.handle_openai_error(e, request)
337
+
338
+ completions: List[GeneratedOutput] = []
339
+ for raw_completion in response["choices"]:
340
+ # Handle Azure OpenAI content filter
341
+ # See: https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/content-filter
342
+ if raw_completion["finish_reason"] == "content_filter":
343
+ hlog(f"Content blocked by OpenAI filter: {str(raw_request)}")
202
344
  return RequestResult(
203
- success=True,
345
+ success=False,
204
346
  cached=False,
205
- request_time=0,
206
- completions=[empty_completion] * request.num_completions,
347
+ error="Content blocked by OpenAI filter",
348
+ completions=[],
207
349
  embedding=[],
350
+ error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
208
351
  )
209
-
210
- error: str = f"OpenAI error: {e}"
211
- return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
212
-
213
- completions: List[GeneratedOutput] = []
214
- for raw_completion in response["choices"]:
215
352
  # The OpenAI chat completion API doesn't support echo.
216
353
  # If `echo_prompt` is true, combine the prompt and completion.
217
354
  raw_completion_content = raw_completion["message"]["content"]
355
+ if self.output_processor:
356
+ raw_completion_content = self.output_processor(raw_completion_content)
218
357
  text: str = request.prompt + raw_completion_content if request.echo_prompt else raw_completion_content
219
358
  # The OpenAI chat completion API doesn't return us tokens or logprobs, so we tokenize ourselves.
220
359
  tokenization_result: TokenizationRequestResult = self.tokenizer.tokenize(
@@ -224,11 +363,20 @@ class OpenAIClient(CachingClient):
224
363
  tokens: List[Token] = [
225
364
  Token(text=cast(str, raw_token), logprob=0) for raw_token in tokenization_result.raw_tokens
226
365
  ]
366
+ # vLLM has a optional `reasoning_content` field in the message
367
+ # that is not in the standard OpenAI API.
368
+ # This field is also used by some model providers such as Grok.
369
+ thinking = (
370
+ Thinking(text=raw_completion["message"]["reasoning_content"])
371
+ if "reasoning_content" in raw_completion["message"]
372
+ else None
373
+ )
227
374
  completion = GeneratedOutput(
228
375
  text=text,
229
376
  logprob=0, # OpenAI does not provide logprobs
230
377
  tokens=tokens,
231
378
  finish_reason={"reason": raw_completion["finish_reason"]},
379
+ thinking=thinking,
232
380
  )
233
381
  completions.append(truncate_sequence(completion, request)) # Truncate the text by stop sequences
234
382
 
@@ -313,10 +461,127 @@ class OpenAIClient(CachingClient):
313
461
  embedding=[],
314
462
  )
315
463
 
464
+ def _make_transcription_request(self, request: Request) -> RequestResult:
465
+ assert (
466
+ request.multimodal_prompt is not None and request.multimodal_prompt.size == 1
467
+ ), "Expected just a single audio file."
468
+ media_object = request.multimodal_prompt.media_objects[0]
469
+ assert media_object.is_type("audio") and media_object.location, "Expected an audio file."
470
+ audio_path: str = media_object.location
471
+ model: str = self._get_model_for_request(request)
472
+
473
+ def do_it() -> Dict[str, Any]:
474
+ transcription = self.client.audio.transcriptions.create(model=model, file=open(audio_path, "rb"))
475
+ return {"transcription": transcription.text}
476
+
477
+ try:
478
+ cache_key = self._get_cache_key({"audio": audio_path, "model": model}, request)
479
+ response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
480
+ except openai.OpenAIError as e:
481
+ error: str = f"OpenAI error: {e}"
482
+ return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
483
+
484
+ return RequestResult(
485
+ success=True,
486
+ cached=cached,
487
+ request_time=response["request_time"],
488
+ request_datetime=response.get("request_datetime"),
489
+ completions=[GeneratedOutput(text=response["transcription"], logprob=0, tokens=[])],
490
+ embedding=[],
491
+ )
492
+
316
493
  def make_request(self, request: Request) -> RequestResult:
317
494
  if request.embedding:
318
495
  return self._make_embedding_request(request)
319
- elif self._is_chat_model_engine(request.model_engine):
320
- return self._make_chat_request(request)
496
+ elif "whisper" in request.model_engine or "transcribe" in request.model_engine:
497
+ return self._make_transcription_request(request)
321
498
  else:
322
- return self._make_completion_request(request)
499
+ return self._make_chat_request(request)
500
+
501
+
502
+ class OpenAILegacyCompletionsClient(OpenAIClient):
503
+ def make_request(self, request: Request) -> RequestResult:
504
+ return self._make_completion_request(request)
505
+
506
+
507
+ class OpenAITranscriptionThenCompletionClient(Client):
508
+ """
509
+ Wrapper around `OpenAIClient` that transcribes audio to text with a
510
+ speech-to-text model (e.g., Whisper) before making a completion request.
511
+ """
512
+
513
+ @staticmethod
514
+ def wrap_transcribed_indicator(transcription: str) -> str:
515
+ return f"\n[TRANSCRIBED AUDIO START]\n{transcription}\n[TRANSCRIBED AUDIO END]\n"
516
+
517
+ def __init__(
518
+ self,
519
+ tokenizer: Tokenizer,
520
+ tokenizer_name: str,
521
+ cache_config: CacheConfig,
522
+ api_key: Optional[str] = None,
523
+ org_id: Optional[str] = None,
524
+ ):
525
+ self._openai_client = OpenAIClient(
526
+ tokenizer=tokenizer,
527
+ tokenizer_name=tokenizer_name,
528
+ cache_config=cache_config,
529
+ api_key=api_key,
530
+ org_id=org_id,
531
+ )
532
+
533
+ def make_request(self, request: Request) -> RequestResult:
534
+ # Ensure that there is only one _ in the model engine name as the format is
535
+ # `transcription-model_completion-model`
536
+ assert request.model_engine.count("_") == 1, f"Invalid model name: {request.model_engine}"
537
+ # Use `model_engine` to determine both the models for transcription and completion
538
+ transcription_model, completion_model = request.model_engine.split("_")
539
+
540
+ # Only multimodal prompts are supported
541
+ assert request.multimodal_prompt is not None, "Expected a multimodal prompt"
542
+
543
+ # Gather all the text content and transcribe any audio to text
544
+ text_content: List[str] = []
545
+ for media_object in request.multimodal_prompt.media_objects:
546
+ if media_object.is_type("audio") and media_object.location:
547
+ request = Request(
548
+ model=f"openai/{transcription_model}",
549
+ multimodal_prompt=MultimediaObject(media_objects=[media_object]),
550
+ )
551
+ response = self._openai_client.make_request(request)
552
+
553
+ transcribed_text: str
554
+ if response.success and response.completions:
555
+ transcribed_text = response.completions[0].text
556
+ else:
557
+ transcribed_text = ""
558
+ hlog(f"Failed to transcribe audio: {response.error}")
559
+
560
+ text_content.append(self.wrap_transcribed_indicator(transcribed_text))
561
+ elif media_object.is_type(TEXT_TYPE):
562
+ assert media_object.text is not None, "Expected text content"
563
+ text_content.append(media_object.text)
564
+ else:
565
+ raise ValueError(f"Unrecognized media type: {media_object.type}")
566
+
567
+ text_prompt: str = "\n".join(text_content)
568
+ hlog(f"Transcribed prompt:\n{text_prompt}")
569
+
570
+ # Now make the request to the completion model with just a text-only prompt and no audio
571
+ # Use the same decoding parameters as the original request
572
+ # Ensure to set multimodal_prompt to None so the request is treated as text-only.
573
+ request_result: RequestResult = self._openai_client.make_request(
574
+ replace(request, prompt=text_prompt, model=f"openai/{completion_model}", multimodal_prompt=None)
575
+ )
576
+
577
+ # Also include the generated transcript to the request result
578
+ completions_with_transcript: List[GeneratedOutput] = [
579
+ replace(
580
+ completion,
581
+ multimodal_content=MultimediaObject(
582
+ media_objects=[MediaObject(text=text_prompt, content_type="text/plain")]
583
+ ),
584
+ )
585
+ for completion in request_result.completions
586
+ ]
587
+ return replace(request_result, completions=completions_with_transcript)
@@ -0,0 +1,174 @@
1
+ # mypy: check_untyped_defs = False
2
+ import dataclasses
3
+ from typing import Any, Dict, List, Optional, Union
4
+
5
+
6
+ from helm.clients.openai_client import OpenAIClientUtils
7
+ from helm.common.cache import CacheConfig
8
+ from helm.common.media_object import TEXT_TYPE
9
+ from helm.common.request import (
10
+ Thinking,
11
+ wrap_request_time,
12
+ Request,
13
+ RequestResult,
14
+ GeneratedOutput,
15
+ )
16
+ from helm.common.optional_dependencies import handle_module_not_found_error
17
+ from helm.clients.client import (
18
+ CachingClient,
19
+ truncate_and_tokenize_response_text,
20
+ generate_uid_for_multimodal_prompt,
21
+ )
22
+ from helm.tokenizers.tokenizer import Tokenizer
23
+
24
+ try:
25
+ import openai
26
+ from openai import OpenAI
27
+ except ModuleNotFoundError as e:
28
+ handle_module_not_found_error(e, ["openai"])
29
+
30
+
31
class OpenAIResponseClient(CachingClient):
    """Client for OpenAI's Responses API.

    Supports plain-text prompts and multimodal (text + image) prompts, optional
    reasoning settings for o-series models, and caching of raw API responses.
    API reference: https://platform.openai.com/docs/api-reference/responses/create
    """

    def __init__(
        self,
        tokenizer: Tokenizer,
        tokenizer_name: str,
        cache_config: CacheConfig,
        api_key: Optional[str] = None,
        org_id: Optional[str] = None,
        base_url: Optional[str] = None,
        reasoning_effort: Optional[str] = None,
        openai_model_name: Optional[str] = None,
    ):
        super().__init__(cache_config=cache_config)
        self.tokenizer = tokenizer
        self.tokenizer_name = tokenizer_name
        self.client = OpenAI(
            api_key=api_key,
            organization=org_id,
            base_url=base_url,
        )
        # Optional reasoning effort (e.g. "low"/"medium"/"high") for reasoning models.
        self.reasoning_effort = reasoning_effort
        # Optional override of the model name sent to the API; falls back to the
        # request's model engine when unset (see _get_model_for_request).
        self.openai_model_name = openai_model_name

    def _get_cache_key(self, raw_request: Dict, request: Request):
        """Build the cache key for `raw_request`, adding a UID for any multimodal prompt."""
        cache_key = CachingClient.make_cache_key(raw_request, request)
        if request.multimodal_prompt:
            prompt_key: str = generate_uid_for_multimodal_prompt(request.multimodal_prompt)
            cache_key = {**cache_key, "multimodal_prompt": prompt_key}
        return cache_key

    def _make_raw_request(self, request: Request) -> Dict[str, Any]:
        """Convert a HELM `Request` into keyword arguments for `client.responses.create`.

        Raises:
            ValueError: if the multimodal prompt contains a media type other than
                image or text.
        """
        input: Union[str, List[Dict[str, Any]]]
        if request.multimodal_prompt is not None:
            # Build a single user message whose content interleaves text and images.
            content = []
            request.validate()
            for media_object in request.multimodal_prompt.media_objects:
                if media_object.is_type("image") and media_object.location:
                    from helm.common.images_utils import encode_base64

                    base64_image: str = encode_base64(media_object.location)
                    content.append(
                        {
                            "type": "input_image",
                            "image_url": f"data:image/jpeg;base64,{base64_image}",
                        }
                    )
                elif media_object.is_type(TEXT_TYPE):
                    assert media_object.text is not None
                    content.append({"type": "input_text", "text": media_object.text})
                else:
                    raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
            input = [{"role": "user", "content": content}]
        else:
            input = request.prompt

        raw_request: Dict[str, Any] = {
            "model": self._get_model_for_request(request),
            "input": input,
            "top_p": request.top_p,
            # API errors if max_output_tokens is less than 16
            # (Error you get: "Invalid 'max_output_tokens': integer below minimum value.
            # Expected a value >= 16, but got 5 instead.")
            "max_output_tokens": max(16, request.max_tokens),
            "temperature": request.temperature,
            # Don't store responses for later retrieval
            "store": False,
        }
        if self.reasoning_effort:
            raw_request["reasoning"] = {"effort": self.reasoning_effort}
        # If o-series (reasoning) model, request reasoning summaries and drop
        # decoding parameters that the API rejects for these models.
        model_engine: str = request.model_engine
        if OpenAIClientUtils.is_reasoning_model(model_engine):
            # Use setdefault so this does not raise KeyError when reasoning_effort
            # was not configured (the "reasoning" key would otherwise be absent).
            raw_request.setdefault("reasoning", {})["summary"] = "detailed"
            # Avoid error:
            # "Error code: 400 - {'error': {'message': "Unsupported parameter: 'temperature' is
            # not supported with this model.", 'type': 'invalid_request_error', 'param': 'temperature',
            # 'code': 'unsupported_parameter'}}"
            raw_request.pop("temperature", None)

            # top_p also happens to be unsupported by the o-series (code unsupported_parameter)
            raw_request.pop("top_p", None)

        return raw_request

    def _get_model_for_request(self, request: Request) -> str:
        """Return the model name to send to the API: the explicit override if set,
        otherwise the request's model engine."""
        return self.openai_model_name or request.model_engine

    def make_request(self, request: Request) -> RequestResult:
        """Send the request to the Responses API and convert the result to a RequestResult.

        Content can either be text or a list of multimodal content made up of text and
        images: https://platform.openai.com/docs/api-reference/responses/create
        """
        raw_request = self._make_raw_request(request)

        # The Responses API does not support a "num_completions" parameter,
        # so we need to handle it ourselves with a simple loop
        completions: List[GeneratedOutput] = []
        for completion_index in range(request.num_completions):

            def do_it() -> Dict[str, Any]:
                raw_response = self.client.responses.create(**raw_request).model_dump(mode="json")
                assert not raw_response.get("error", None), f"Error in response: {raw_response}"
                return raw_response

            try:
                cache_key = self._get_cache_key(raw_request, request)
                # Differentiate cache keys across the loop; otherwise every iteration
                # hits the same cache entry and all completions come back identical.
                # The first completion keeps the legacy key so existing caches stay valid.
                if completion_index > 0:
                    cache_key = {**cache_key, "completion_index": completion_index}
                response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
            except openai.OpenAIError as e:
                return OpenAIClientUtils.handle_openai_error(e, request)

            # We can only return one completion per API call, but we get an array of
            # output messages back, so we need to concatenate them.
            reasoning_output = ""
            text_output = ""

            if request.echo_prompt:
                text_output += request.prompt
            for output in response["output"]:
                output_type = output["type"]  # one of "message" or "reasoning" from API observation
                if output_type == "reasoning":
                    reasoning_output += "\n".join([raw_output["text"] for raw_output in output["summary"]])
                else:
                    text_output += "\n".join([raw_output["text"] for raw_output in output["content"]])

            completion = truncate_and_tokenize_response_text(
                text_output,
                request,
                self.tokenizer,
                self.tokenizer_name,
                original_finish_reason="",
            )
            if reasoning_output:
                completion = dataclasses.replace(completion, thinking=Thinking(text=reasoning_output))
            completions.append(completion)

        # NOTE: `cached` and `response` reflect only the last loop iteration, matching
        # the original behavior; request.num_completions is assumed to be >= 1.
        return RequestResult(
            success=True,
            cached=cached,
            request_time=response["request_time"],
            request_datetime=response.get("request_datetime"),
            completions=completions,
            embedding=[],
        )