crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.

Files changed (580)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  31. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  32. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  33. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  34. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  35. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  36. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  37. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  38. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  39. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  40. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  41. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  42. helm/benchmark/annotation/medalign_annotator.py +100 -0
  43. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  44. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  45. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  46. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  47. helm/benchmark/annotation/model_as_judge.py +218 -6
  48. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  49. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  50. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  51. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  52. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  53. helm/benchmark/annotation/spider_annotator.py +18 -0
  54. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  55. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  56. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  57. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  58. helm/benchmark/annotation_executor.py +35 -15
  59. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  60. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  61. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  62. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  63. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  64. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  65. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  66. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  67. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  68. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  69. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  70. helm/benchmark/augmentations/perturbation.py +1 -1
  71. helm/benchmark/augmentations/space_perturbation.py +2 -2
  72. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  73. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  74. helm/benchmark/augmentations/test_perturbation.py +16 -13
  75. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  76. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  77. helm/benchmark/data_preprocessor.py +2 -2
  78. helm/benchmark/huggingface_registration.py +2 -7
  79. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  80. helm/benchmark/metrics/basic_metrics.py +6 -6
  81. helm/benchmark/metrics/bbq_metrics.py +2 -2
  82. helm/benchmark/metrics/bias_metrics.py +12 -3
  83. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  84. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  85. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  86. helm/benchmark/metrics/classification_metrics.py +76 -12
  87. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  88. helm/benchmark/metrics/code_metrics.py +5 -5
  89. helm/benchmark/metrics/comet_metric.py +125 -0
  90. helm/benchmark/metrics/common_metric_specs.py +9 -2
  91. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  92. helm/benchmark/metrics/copyright_metrics.py +4 -4
  93. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  94. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  95. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  96. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  97. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  98. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  99. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  100. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  101. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  102. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  103. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  104. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  105. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  106. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  107. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  108. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  109. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  110. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  111. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  112. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  113. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  114. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  115. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  116. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  117. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  118. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  120. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  121. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  122. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  123. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  124. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  125. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  126. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  127. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  128. helm/benchmark/metrics/medalign_metrics.py +34 -0
  129. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  130. helm/benchmark/metrics/medec_metrics.py +101 -0
  131. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  132. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  133. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  134. helm/benchmark/metrics/metric.py +3 -3
  135. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  136. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  137. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  138. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  139. helm/benchmark/metrics/nltk_helper.py +32 -0
  140. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  141. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  142. helm/benchmark/metrics/output_processing_metric.py +60 -0
  143. helm/benchmark/metrics/output_processors.py +15 -0
  144. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  145. helm/benchmark/metrics/ranking_metrics.py +3 -3
  146. helm/benchmark/metrics/reference_metric.py +3 -3
  147. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  148. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  149. helm/benchmark/metrics/spider_metrics.py +7 -0
  150. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  151. helm/benchmark/metrics/statistic.py +1 -1
  152. helm/benchmark/metrics/summac/model_summac.py +1 -1
  153. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  154. helm/benchmark/metrics/summarization_metrics.py +19 -9
  155. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  156. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  157. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  158. helm/benchmark/metrics/test_metric.py +1 -1
  159. helm/benchmark/metrics/test_statistic.py +2 -2
  160. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  161. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  162. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  163. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  164. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  165. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  167. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  169. helm/benchmark/metrics/unitxt_metrics.py +4 -1
  170. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  171. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  172. helm/benchmark/model_metadata_registry.py +16 -0
  173. helm/benchmark/presentation/summarize.py +23 -10
  174. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  175. helm/benchmark/reeval_run.py +203 -0
  176. helm/benchmark/reeval_runner.py +355 -0
  177. helm/benchmark/run.py +8 -17
  178. helm/benchmark/run_expander.py +78 -8
  179. helm/benchmark/run_spec_factory.py +12 -0
  180. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  181. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  182. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  183. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  184. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  185. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  186. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  187. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  188. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  189. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  190. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  191. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  192. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  193. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  194. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  195. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  196. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  197. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  198. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  199. helm/benchmark/run_specs/vlm_run_specs.py +75 -2
  200. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  201. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  202. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  203. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  204. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  205. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  206. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  207. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  208. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  209. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  210. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  211. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  212. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  213. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  214. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  215. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  216. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  217. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  218. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  219. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  220. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  221. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  222. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  223. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  224. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  225. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  226. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  227. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  228. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  229. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  230. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  231. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  232. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  233. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  234. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  235. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  236. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  237. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  238. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  239. helm/benchmark/scenarios/bold_scenario.py +1 -1
  240. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  241. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  242. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  243. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  244. helm/benchmark/scenarios/clear_scenario.py +153 -0
  245. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  246. helm/benchmark/scenarios/code_scenario.py +17 -4
  247. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  248. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  249. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  250. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  251. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  252. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  253. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  254. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  255. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  256. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  257. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  258. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  259. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  260. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  261. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  262. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  263. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  264. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  265. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  266. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  267. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  268. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  269. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  270. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  271. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  272. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  273. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  274. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  275. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  276. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  277. helm/benchmark/scenarios/ice_scenario.py +8 -4
  278. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  279. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  280. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  281. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  282. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  283. helm/benchmark/scenarios/koala_scenario.py +1 -1
  284. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  285. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  286. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  287. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  288. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  289. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  290. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  291. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  292. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  293. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  294. helm/benchmark/scenarios/math_scenario.py +9 -1
  295. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  296. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  297. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  298. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  299. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  300. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  301. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  302. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  303. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  304. helm/benchmark/scenarios/medec_scenario.py +120 -0
  305. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  306. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  307. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  308. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  309. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  310. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  311. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  312. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  313. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  314. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  315. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  316. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  317. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  318. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  319. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  320. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  321. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  322. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  323. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  324. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  325. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  326. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  328. helm/benchmark/scenarios/quac_scenario.py +10 -1
  329. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  330. helm/benchmark/scenarios/raft_scenario.py +17 -2
  331. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  332. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  333. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  334. helm/benchmark/scenarios/scenario.py +9 -1
  335. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  336. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  337. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  338. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  339. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  340. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  341. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  342. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  343. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  344. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  345. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  346. helm/benchmark/scenarios/spider_scenario.py +91 -0
  347. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  348. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  349. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  350. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  351. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  352. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  353. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  354. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  355. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  356. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  357. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  358. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  359. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  360. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  361. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  362. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  363. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  364. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  366. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  367. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  368. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  369. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  370. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  371. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  372. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  373. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  374. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  375. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  376. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  377. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  378. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  379. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  380. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  381. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  382. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  383. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  384. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  385. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  386. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  387. helm/benchmark/server.py +11 -0
  388. helm/benchmark/slurm_runner.py +1 -1
  389. helm/benchmark/static/schema_audio.yaml +752 -0
  390. helm/benchmark/static/schema_autobencher.yaml +150 -0
  391. helm/benchmark/static/schema_call_center.yaml +97 -60
  392. helm/benchmark/static/schema_capabilities.yaml +254 -0
  393. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  394. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  395. helm/benchmark/static/schema_enterprise.yaml +298 -0
  396. helm/benchmark/static/schema_finance.yaml +14 -12
  397. helm/benchmark/static/schema_heim.yaml +1389 -0
  398. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  399. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  400. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  401. helm/benchmark/static/schema_safety.yaml +18 -1
  402. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  403. helm/benchmark/static/schema_social_audio.yaml +224 -0
  404. helm/benchmark/static/schema_sql.yaml +171 -0
  405. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  406. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  407. helm/benchmark/static/schema_vhelm.yaml +109 -36
  408. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  409. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  410. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  411. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  412. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  413. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  414. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  415. helm/benchmark/static_build/config.js +1 -1
  416. helm/benchmark/static_build/index.html +5 -5
  417. helm/benchmark/window_services/default_window_service.py +1 -1
  418. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  419. helm/benchmark/window_services/ice_window_service.py +1 -1
  420. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  421. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  422. helm/benchmark/window_services/local_window_service.py +2 -2
  423. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  424. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  425. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  426. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  427. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  428. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  429. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  430. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  431. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  432. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  433. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  434. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  435. helm/benchmark/window_services/test_utils.py +1 -1
  436. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  437. helm/benchmark/window_services/yalm_window_service.py +1 -1
  438. helm/clients/ai21_client.py +3 -3
  439. helm/clients/aleph_alpha_client.py +1 -1
  440. helm/clients/audio_language/__init__.py +0 -0
  441. helm/clients/audio_language/diva_llama_client.py +118 -0
  442. helm/clients/audio_language/llama_omni_client.py +198 -0
  443. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  444. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  445. helm/clients/auto_client.py +4 -2
  446. helm/clients/azure_openai_client.py +55 -0
  447. helm/clients/bedrock_client.py +201 -7
  448. helm/clients/bedrock_utils.py +33 -0
  449. helm/clients/clip_scorers/clip_scorer.py +1 -1
  450. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  451. helm/clients/cohere_client.py +3 -3
  452. helm/clients/google_client.py +1 -1
  453. helm/clients/http_model_client.py +1 -1
  454. helm/clients/huggingface_client.py +10 -18
  455. helm/clients/ibm_client.py +267 -0
  456. helm/clients/image_generation/adobe_vision_client.py +1 -1
  457. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  458. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  459. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  460. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  461. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  462. helm/clients/image_generation/cogview2_client.py +1 -1
  463. helm/clients/image_generation/dalle2_client.py +1 -1
  464. helm/clients/image_generation/dalle3_client.py +2 -2
  465. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  466. helm/clients/image_generation/dalle_mini/data.py +1 -1
  467. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  468. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  469. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  470. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  471. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  472. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  473. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  474. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  475. helm/clients/image_generation/dalle_mini_client.py +1 -1
  476. helm/clients/image_generation/deep_floyd_client.py +1 -1
  477. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  478. helm/clients/image_generation/lexica_client.py +1 -1
  479. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  480. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  481. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  482. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  483. helm/clients/image_generation/mindalle_client.py +1 -1
  484. helm/clients/image_generation/together_image_generation_client.py +1 -1
  485. helm/clients/lit_gpt_client.py +2 -2
  486. helm/clients/mistral_client.py +62 -18
  487. helm/clients/nvidia_nim_client.py +0 -3
  488. helm/clients/openai_client.py +241 -22
  489. helm/clients/palmyra_client.py +1 -4
  490. helm/clients/reka_client.py +1 -1
  491. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  492. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  493. helm/clients/stanfordhealthcare_google_client.py +43 -0
  494. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  495. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  496. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  497. helm/clients/test_client.py +1 -1
  498. helm/clients/test_together_client.py +6 -1
  499. helm/clients/together_client.py +47 -7
  500. helm/clients/upstage_client.py +23 -0
  501. helm/clients/vertexai_client.py +39 -13
  502. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  503. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  504. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  505. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  506. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  507. helm/clients/vllm_client.py +4 -6
  508. helm/clients/yi_client.py +0 -3
  509. helm/common/audio_utils.py +111 -0
  510. helm/common/file_caches/local_file_cache.py +1 -1
  511. helm/common/file_caches/test_local_file_cache.py +1 -1
  512. helm/common/images_utils.py +2 -2
  513. helm/common/media_object.py +2 -2
  514. helm/common/multimodal_request_utils.py +26 -0
  515. helm/common/reeval_parameters.py +12 -0
  516. helm/common/request.py +6 -2
  517. helm/common/response_format.py +18 -0
  518. helm/common/test_media_object.py +1 -1
  519. helm/config/model_deployments.yaml +1112 -19
  520. helm/config/model_metadata.yaml +985 -44
  521. helm/config/tokenizer_configs.yaml +379 -3
  522. helm/proxy/cli.py +2 -2
  523. helm/proxy/example_queries.py +1 -1
  524. helm/proxy/server.py +11 -4
  525. helm/proxy/services/remote_service.py +1 -1
  526. helm/proxy/services/server_service.py +1 -1
  527. helm/proxy/services/test_remote_service.py +2 -2
  528. helm/proxy/services/test_service.py +1 -1
  529. helm/proxy/static/general.js +122 -0
  530. helm/proxy/static/help.html +99 -0
  531. helm/proxy/static/index.css +57 -0
  532. helm/proxy/static/index.html +40 -0
  533. helm/proxy/static/index.js +456 -0
  534. helm/proxy/static/info-icon.png +0 -0
  535. helm/proxy/test_retry.py +1 -1
  536. helm/proxy/token_counters/auto_token_counter.py +1 -1
  537. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  538. helm/tokenizers/caching_tokenizer.py +2 -30
  539. helm/tokenizers/http_model_tokenizer.py +1 -1
  540. helm/tokenizers/huggingface_tokenizer.py +2 -2
  541. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  542. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  543. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  544. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  545. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  546. helm/tokenizers/tokenizer.py +3 -1
  547. helm/tokenizers/yalm_tokenizer.py +3 -3
  548. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  549. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  550. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  551. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  552. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  553. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  554. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  555. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  556. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  557. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  558. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  559. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  560. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  561. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  562. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  563. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  564. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  565. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  566. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  567. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  568. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  569. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  570. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  571. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  572. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  573. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  574. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  575. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  576. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  577. helm/tokenizers/anthropic_tokenizer.py +0 -52
  578. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  579. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  580. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0

helm/clients/mistral_client.py
@@ -1,16 +1,17 @@
 import requests
-from typing import Any, Dict, List, Optional, TypedDict
+from typing import Any, Dict, List, Optional, TypedDict, Union
 
 from helm.proxy.retry import NonRetriableException
 from helm.common.cache import CacheConfig
+from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput
 from helm.tokenizers.tokenizer import Tokenizer
-from .client import CachingClient, truncate_and_tokenize_response_text
+from helm.clients.client import CachingClient, truncate_and_tokenize_response_text
 
 try:
-    from mistralai.client import MistralClient
-    from mistralai.models.chat_completion import ChatMessage, ChatCompletionResponse
+    from mistralai import Mistral
+    from mistralai.models import ChatCompletionResponse
 except ModuleNotFoundError as e:
     handle_module_not_found_error(e, ["mistral"])
 
@@ -19,11 +20,14 @@ class MistralAIRequest(TypedDict):
     """Data passed between make_request and _send_request. Used as the cache key."""
 
     model: str
-    prompt: str
+    # The prompt can be either a string or a list of messages that can be multimodal
+    prompt: Optional[Union[str, List[Dict[str, str]]]]
+    messages: Optional[List[Dict[str, Any]]]
     max_tokens: int
     temperature: float
     top_p: float
     random_seed: Optional[int]
+    stop: Optional[List[str]]
 
 
 class MistralAIClient(CachingClient):
@@ -43,21 +47,24 @@ class MistralAIClient(CachingClient):
         self.api_key: str = api_key
         self.tokenizer = tokenizer
         self.tokenizer_name = tokenizer_name
-        self._client = MistralClient(api_key=self.api_key)
+        self._client = Mistral(api_key=self.api_key)
         self.mistral_model = mistral_model
 
     def _send_request(self, raw_request: MistralAIRequest) -> Dict[str, Any]:
-        messages = [ChatMessage(role="user", content=raw_request["prompt"])]
-
-        chat_response: ChatCompletionResponse = self._client.chat(
+        if raw_request["messages"] is not None:
+            messages = raw_request["messages"]
+        else:
+            messages = [{"role": "user", "content": raw_request["prompt"]}]
+        chat_response: Optional[ChatCompletionResponse] = self._client.chat.complete(
             model=raw_request["model"],
-            messages=messages,
+            messages=messages,  # type: ignore
             temperature=raw_request["temperature"],
             max_tokens=raw_request["max_tokens"],
             top_p=raw_request["top_p"],
             random_seed=raw_request["random_seed"],
             safe_prompt=False,  # Disable safe_prompt
         )
+        assert chat_response is not None
         # Documentation: "If mode is 'json', the output will only contain JSON serializable types."
         # Source: https://docs.pydantic.dev/latest/api/base_model/#pydantic.BaseModel.model_dump
         #
@@ -86,17 +93,54 @@ class MistralAIClient(CachingClient):
         """Make a request"""
         completions: List[GeneratedOutput] = []
 
+        prompt: Union[str, List[Dict[str, str]]] = request.prompt
+        if request.multimodal_prompt:
+            # Following https://docs.mistral.ai/capabilities/vision
+            multimodal_content: List[Dict[str, str]] = []
+            for media_object in request.multimodal_prompt.media_objects:
+                if media_object.is_type(IMAGE_TYPE) and media_object.location:
+                    assert media_object.location
+                    if media_object.is_local_file:
+                        from helm.common.images_utils import encode_base64
+
+                        base64_image: str = encode_base64(media_object.location)
+                        image_url = f"data:image/jpeg;base64,{base64_image}"
+                    else:
+                        image_url = media_object.location
+                    multimodal_content.append({"type": "image_url", "image_url": image_url})
+                elif media_object.is_type(TEXT_TYPE):
+                    assert media_object.text
+                    multimodal_content.append({"type": "text", "text": media_object.text})
+                else:
+                    raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
+
+            prompt = multimodal_content
+
         # `num_completions` is not supported, so instead make `num_completions` separate requests.
         for completion_index in range(request.num_completions):
             try:
-                raw_request: MistralAIRequest = {
-                    "model": self.mistral_model or request.model_engine,
-                    "prompt": request.prompt,
-                    "max_tokens": request.max_tokens,
-                    "temperature": request.temperature,
-                    "top_p": request.top_p,
-                    "random_seed": self._get_random_seed(request, completion_index),
-                }
+                if request.messages:
+                    raw_request: MistralAIRequest = {
+                        "model": self.mistral_model or request.model_engine,
+                        "prompt": None,
+                        "messages": request.messages,
+                        "max_tokens": request.max_tokens,
+                        "temperature": request.temperature,
+                        "top_p": request.top_p,
+                        "random_seed": self._get_random_seed(request, completion_index),
+                        "stop": request.stop_sequences or None,
+                    }
+                else:
+                    raw_request = {
+                        "model": self.mistral_model or request.model_engine,
+                        "prompt": prompt,
+                        "messages": None,
+                        "max_tokens": request.max_tokens,
+                        "temperature": request.temperature,
+                        "top_p": request.top_p,
+                        "random_seed": self._get_random_seed(request, completion_index),
+                        "stop": request.stop_sequences or None,
+                    }
 
                 def do_it() -> Dict[str, Any]:
                     result: Dict[str, Any] = self._send_request(raw_request)
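
For reference, a minimal standalone sketch of the v1 mistralai SDK call that the rewritten _send_request above now performs. The environment variable, model name, and prompt are illustrative assumptions, not values taken from this diff.

import os

from mistralai import Mistral  # v1 SDK, replacing mistralai.client.MistralClient

# Assumes MISTRAL_API_KEY is set; the model name is only an example.
client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])
chat_response = client.chat.complete(
    model="mistral-small-latest",
    messages=[{"role": "user", "content": "Say hello."}],  # plain dicts replace ChatMessage objects
    temperature=0.0,
    max_tokens=64,
    top_p=1.0,
    random_seed=0,
    safe_prompt=False,  # same flag the client passes above
)
print(chat_response.choices[0].message.content)
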

helm/clients/nvidia_nim_client.py
@@ -30,6 +30,3 @@ class NvidiaNimClient(OpenAIClient):
 
     def _get_model_for_request(self, request: Request) -> str:
         return request.model
-
-    def _is_chat_model_engine(self, model_engine: str) -> bool:
-        return True

helm/clients/openai_client.py
@@ -1,18 +1,20 @@
 # mypy: check_untyped_defs = False
 from dataclasses import replace
-from typing import Any, Dict, List, Optional, cast, Union
+from typing import Any, Dict, List, Optional, cast, Union, Callable
 
 from helm.benchmark.model_metadata_registry import is_vlm
+from helm.common import multimodal_request_utils
 from helm.common.cache import CacheConfig
-from helm.common.media_object import TEXT_TYPE
-from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
+from helm.common.media_object import TEXT_TYPE, MultimediaObject
+from helm.common.request import ErrorFlags, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
 from helm.common.hierarchical_logger import hlog
+from helm.common.object_spec import get_class_by_name
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.tokenization_request import (
     TokenizationRequest,
     TokenizationRequestResult,
 )
-from .client import CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
+from helm.clients.client import Client, CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
 from helm.tokenizers.tokenizer import Tokenizer
 
 try:
@@ -27,6 +29,19 @@ class OpenAIClient(CachingClient):
 
     # Error OpenAI throws when the image in the prompt violates their content policy
     INAPPROPRIATE_IMAGE_ERROR: str = "Your input image may contain content that is not allowed by our safety system"
+    INAPPROPRIATE_PROMPT_ERROR: str = "Invalid prompt: your prompt was flagged"
+    INAPPROPRIATE_PROMPT_AZURE_ERROR: str = (
+        "The response was filtered due to the prompt triggering Azure OpenAI's content management policy."
+    )
+    INAPPROPRIATE_PROMPT_MICROSOFT_ERROR: str = (
+        "The response was filtered due to the prompt triggering Microsoft's content management policy."
+    )
+
+    # OpenAI server error
+    OPENAI_SERVER_ERROR: str = (
+        "The server had an error processing your request. Sorry about that! You can retry your request, "
+        "or contact us through our help center at help.openai.com if you keep seeing this error."
+    )
 
     # Set the finish reason to this if the prompt violates OpenAI's content policy
     CONTENT_POLICY_VIOLATED_FINISH_REASON: str = (
@@ -42,28 +57,31 @@ class OpenAIClient(CachingClient):
         api_key: Optional[str] = None,
         org_id: Optional[str] = None,
         base_url: Optional[str] = None,
+        reasoning_effort: Optional[str] = None,
+        openai_model_name: Optional[str] = None,
+        output_processor: Optional[str] = None,
     ):
         super().__init__(cache_config=cache_config)
         self.tokenizer = tokenizer
         self.tokenizer_name = tokenizer_name
         self.client = OpenAI(api_key=api_key, organization=org_id, base_url=base_url)
-
-    def _is_chat_model_engine(self, model_engine: str) -> bool:
-        if model_engine == "gpt-3.5-turbo-instruct":
-            return False
-        elif model_engine.startswith("gpt-3.5") or model_engine.startswith("gpt-4") or model_engine.startswith("o1"):
-            return True
-        return False
+        self.reasoning_effort = reasoning_effort
+        self.openai_model_name = openai_model_name
+        self.output_processor: Optional[Callable[[str], str]] = (
+            get_class_by_name(output_processor) if output_processor else None
+        )
 
     def _get_model_for_request(self, request: Request) -> str:
-        return request.model_engine
+        return self.openai_model_name or request.model_engine
 
     def _get_cache_key(self, raw_request: Dict, request: Request):
         cache_key = CachingClient.make_cache_key(raw_request, request)
         if request.multimodal_prompt:
             prompt_key: str = generate_uid_for_multimodal_prompt(request.multimodal_prompt)
             cache_key = {**cache_key, "multimodal_prompt": prompt_key}
-            del cache_key["messages"]
+
+        if "messages" in cache_key:
+            del cache_key["messages"]
         return cache_key
 
     def _make_embedding_request(self, request: Request) -> RequestResult:
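
The new output_processor argument is a dotted name resolved with get_class_by_name and applied to each completion's text (see the @@ -206,15 +281,54 @@ hunk below). A minimal sketch of what such a processor could look like; the function below is a hypothetical example, not one of the processors shipped in helm/benchmark/metrics/output_processors.py.

def strip_markdown_fences(text: str) -> str:
    """Hypothetical output processor: drop a surrounding ``` fence pair, if present."""
    lines = text.strip().splitlines()
    if len(lines) >= 2 and lines[0].startswith("```") and lines[-1].strip() == "```":
        lines = lines[1:-1]
    return "\n".join(lines)

# It would then be referenced by its dotted module path when constructing the client,
# e.g. output_processor="my_module.strip_markdown_fences" (illustrative name).
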
@@ -140,6 +158,19 @@ class OpenAIClient(CachingClient):
                     base64_image: str = encode_base64(media_object.location)
                     image_object: Dict[str, str] = {"url": f"data:image/jpeg;base64,{base64_image}"}
                     content.append({"type": "image_url", "image_url": image_object})
+                elif media_object.is_type("audio") and media_object.location:
+                    base64_audio: str = multimodal_request_utils.get_contents_as_base64(media_object.location)
+                    format: str = media_object.content_type.split("/")[1]
+                    if format == "mpeg":
+                        # OpenAI expects "mp3" for mpeg audio
+                        format = "mp3"
+
+                    content.append(
+                        {
+                            "type": "input_audio",
+                            "input_audio": {"data": base64_audio, "format": format},
+                        }
+                    )
                 elif media_object.is_type(TEXT_TYPE):
                     content.append({"type": media_object.type, "text": media_object.text})
                 else:
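
A self-contained sketch of the input_audio content entry that the new audio branch builds, using the standard library in place of helm.common.multimodal_request_utils.get_contents_as_base64; the file path and MIME type in the usage comment are assumptions.

import base64
from typing import Any, Dict

def input_audio_content(audio_path: str, content_type: str) -> Dict[str, Any]:
    # Read and base64-encode the audio file (what get_contents_as_base64 does for the client).
    with open(audio_path, "rb") as f:
        data = base64.b64encode(f.read()).decode("utf-8")
    audio_format = content_type.split("/")[1]
    if audio_format == "mpeg":
        audio_format = "mp3"  # OpenAI expects "mp3" for mpeg audio
    return {"type": "input_audio", "input_audio": {"data": data, "format": audio_format}}

# Example (hypothetical file): input_audio_content("clip.mp3", "audio/mpeg")
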
@@ -164,15 +195,35 @@ class OpenAIClient(CachingClient):
             "frequency_penalty": request.frequency_penalty,
         }
 
-        # OpenAI's vision API doesn't allow None values for stop.
-        # Fails with "body -> stop: none is not an allowed value" if None is passed.
-        if is_vlm(request.model) and raw_request["stop"] is None:
-            raw_request.pop("stop")
+        if request.response_format and request.response_format.json_schema:
+            # Copy and modify JSON schema to conform to OpenAI's requirements
+            json_schema = dict(request.response_format.json_schema)
+
+            # additionalProperties: false must always be set in objects
+            # https://platform.openai.com/docs/guides/structured-outputs#additionalproperties-false-must-always-be-set-in-objects
+            if "additionalProperties" not in json_schema:
+                json_schema["additionalProperties"] = False
+
+            # All fields must be required
+            # https://platform.openai.com/docs/guides/structured-outputs#all-fields-must-be-required
+            if "required" not in json_schema:
+                json_schema["required"] = list(json_schema["properties"].keys())
+
+            raw_request["response_format"] = {
+                "type": "json_schema",
+                "json_schema": {
+                    "name": "response",
+                    "description": "",
+                    "schema": json_schema,
+                    "strict": True,
+                },
+            }
 
         # Special handling for o1 models.
         # Refer to the "Reasoning models" documentation further discussion of o1 model limitations:
         # https://platform.openai.com/docs/guides/reasoning
-        if request.model_engine.startswith("o1"):
+        model_engine: str = request.model_engine
+        if model_engine.startswith("o1") or model_engine.startswith("o3"):
             # Avoid error:
             # "Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead."  # noqa: E501
             # Note that openai>=1.45 is needed for this
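
To make the two schema fix-ups above concrete, here is a small sketch with a hypothetical JSON schema showing the response_format payload that would be sent; the field names are invented for the example.

# A hypothetical schema, as it might appear in request.response_format.json_schema.
json_schema = {
    "type": "object",
    "properties": {"answer": {"type": "string"}, "confidence": {"type": "number"}},
}

# The same two fix-ups the client applies before sending the schema to OpenAI.
if "additionalProperties" not in json_schema:
    json_schema["additionalProperties"] = False
if "required" not in json_schema:
    json_schema["required"] = list(json_schema["properties"].keys())

response_format = {
    "type": "json_schema",
    "json_schema": {"name": "response", "description": "", "schema": json_schema, "strict": True},
}
# json_schema["required"] is now ["answer", "confidence"].
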
@@ -184,6 +235,30 @@ class OpenAIClient(CachingClient):
             if raw_request["stop"] is None:
                 raw_request.pop("stop")
 
+            # Avoid error:
+            # "Error code: 400 - {'error': {'message': "Unsupported parameter: 'temperature' is
+            # not supported with this model.", 'type': 'invalid_request_error', 'param': 'temperature',
+            # 'code': 'unsupported_parameter'}}"
+            raw_request.pop("temperature", None)
+
+            if self.reasoning_effort:
+                raw_request["reasoning_effort"] = "self.reasoning_effort"
+        elif is_vlm(request.model):
+            # Avoid error:
+            # "Invalid type for 'stop': expected an unsupported value, but got null instead."
+            if raw_request["stop"] is None:
+                raw_request.pop("stop")
+
+        # Special handling for gpt-4o-audio-preview
+        # See: https://platform.openai.com/docs/guides/audio
+        if model_engine.startswith("gpt-4o-audio-preview") or model_engine.startswith("gpt-4o-mini-audio-preview"):
+            raw_request["modalities"] = ["text"]
+
+        # Avoid error:
+        # OpenAI error: Error code: 400 - {'error': {'message': "[{'type': 'string_type', 'loc': ('body', 'stop', 'str'), 'msg': 'Input should be a valid string', 'input': None}, {'type': 'list_type', 'loc': ('body', 'stop', 'list[str]'), 'msg': 'Input should be a valid list', 'input': None}, {'type': 'list_type', 'loc': ('body', 'stop', 'list[list[int]]'), 'msg': 'Input should be a valid list', 'input': None}]", 'type': 'invalid_request_error', 'param': None, 'code': None}}  # noqa: 3501
+        if raw_request["stop"] is None:
+            raw_request.pop("stop")
+
 
         def do_it() -> Dict[str, Any]:
             return self.client.chat.completions.create(**raw_request).model_dump(mode="json")
@@ -191,7 +266,7 @@ class OpenAIClient(CachingClient):
             cache_key = self._get_cache_key(raw_request, request)
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except openai.OpenAIError as e:
-            if self.INAPPROPRIATE_IMAGE_ERROR in str(e):
+            if self.INAPPROPRIATE_IMAGE_ERROR in str(e) or self.INAPPROPRIATE_PROMPT_ERROR in str(e):
                 hlog(f"Failed safety check: {str(request)}")
                 empty_completion = GeneratedOutput(
                     text="",
@@ -206,15 +281,54 @@ class OpenAIClient(CachingClient):
                     completions=[empty_completion] * request.num_completions,
                     embedding=[],
                 )
+            elif self.OPENAI_SERVER_ERROR in str(e):
+                # Handle these errors by returning an empty completion to unblock
+                hlog(f"OpenAI server error for request: {str(request)}")
+                empty_completion = GeneratedOutput(
+                    text="",
+                    logprob=0,
+                    tokens=[],
+                    finish_reason={"reason": self.OPENAI_SERVER_ERROR},
+                )
+                return RequestResult(
+                    success=True,
+                    cached=False,
+                    request_time=0,
+                    completions=[empty_completion] * request.num_completions,
+                    embedding=[],
+                )
+            elif self.INAPPROPRIATE_PROMPT_AZURE_ERROR in str(e) or self.INAPPROPRIATE_PROMPT_MICROSOFT_ERROR in str(e):
+                return RequestResult(
+                    success=False,
+                    cached=False,
+                    error="Content blocked by Azure's content management filter",
+                    completions=[],
+                    embedding=[],
+                    error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
+                )
 
             error: str = f"OpenAI error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
         completions: List[GeneratedOutput] = []
         for raw_completion in response["choices"]:
+            # Handle Azure OpenAI content filter
+            # See: https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/content-filter
+            if raw_completion["finish_reason"] == "content_filter":
+                hlog(f"Content blocked by OpenAI filter: {str(raw_request)}")
+                return RequestResult(
+                    success=False,
+                    cached=False,
+                    error="Content blocked by OpenAI filter",
+                    completions=[],
+                    embedding=[],
+                    error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
+                )
             # The OpenAI chat completion API doesn't support echo.
             # If `echo_prompt` is true, combine the prompt and completion.
             raw_completion_content = raw_completion["message"]["content"]
+            if self.output_processor:
+                raw_completion_content = self.output_processor(raw_completion_content)
             text: str = request.prompt + raw_completion_content if request.echo_prompt else raw_completion_content
             # The OpenAI chat completion API doesn't return us tokens or logprobs, so we tokenize ourselves.
             tokenization_result: TokenizationRequestResult = self.tokenizer.tokenize(
@@ -313,10 +427,115 @@ class OpenAIClient(CachingClient):
             embedding=[],
         )

+    def _make_transcription_request(self, request: Request) -> RequestResult:
+        assert (
+            request.multimodal_prompt is not None and request.multimodal_prompt.size == 1
+        ), "Expected just a single audio file."
+        media_object = request.multimodal_prompt.media_objects[0]
+        assert media_object.is_type("audio") and media_object.location, "Expected an audio file."
+        audio_path: str = media_object.location
+        model: str = self._get_model_for_request(request)
+
+        def do_it() -> Dict[str, Any]:
+            transcription = self.client.audio.transcriptions.create(model=model, file=open(audio_path, "rb"))
+            return {"transcription": transcription.text}
+
+        try:
+            cache_key = self._get_cache_key({"audio": audio_path, "model": model}, request)
+            response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+        except openai.OpenAIError as e:
+            error: str = f"OpenAI error: {e}"
+            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
+
+        return RequestResult(
+            success=True,
+            cached=cached,
+            request_time=response["request_time"],
+            request_datetime=response.get("request_datetime"),
+            completions=[GeneratedOutput(text=response["transcription"], logprob=0, tokens=[])],
+            embedding=[],
+        )
+
     def make_request(self, request: Request) -> RequestResult:
         if request.embedding:
             return self._make_embedding_request(request)
-        elif self._is_chat_model_engine(request.model_engine):
-            return self._make_chat_request(request)
+        elif "whisper" in request.model_engine:
+            return self._make_transcription_request(request)
         else:
-            return self._make_completion_request(request)
+            return self._make_chat_request(request)
+
+
+class OpenAILegacyCompletionsClient(OpenAIClient):
+    def make_request(self, request: Request) -> RequestResult:
+        return self._make_completion_request(request)
+
+
+class OpenAITranscriptionThenCompletionClient(Client):
+    """
+    Wrapper around `OpenAIClient` that transcribes audio to text with a
+    speech-to-text model (e.g., Whisper) before making a completion request.
+    """
+
+    @staticmethod
+    def wrap_transcribed_indicator(transcription: str) -> str:
+        return f"\n[TRANSCRIBED AUDIO START]\n{transcription}\n[TRANSCRIBED AUDIO END]\n"
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        api_key: Optional[str] = None,
+        org_id: Optional[str] = None,
+    ):
+        self._openai_client = OpenAIClient(
+            tokenizer=tokenizer,
+            tokenizer_name=tokenizer_name,
+            cache_config=cache_config,
+            api_key=api_key,
+            org_id=org_id,
+        )
+
+    def make_request(self, request: Request) -> RequestResult:
+        # Ensure that there is only one _ in the model engine name as the format is
+        # `transcription-model_completion-model`
+        assert request.model_engine.count("_") == 1, f"Invalid model name: {request.model_engine}"
+        # Use `model_engine` to determine both the models for transcription and completion
+        transcription_model, completion_model = request.model_engine.split("_")
+
+        # Only multimodal prompts are supported
+        assert request.multimodal_prompt is not None, "Expected a multimodal prompt"
+
+        # Gather all the text content and transcribe any audio to text
+        text_content: List[str] = []
+        for media_object in request.multimodal_prompt.media_objects:
+            if media_object.is_type("audio") and media_object.location:
+                request = Request(
+                    model=f"openai/{transcription_model}",
+                    multimodal_prompt=MultimediaObject(media_objects=[media_object]),
+                )
+                response = self._openai_client.make_request(request)
+
+                transcribed_text: str
+                if response.success and response.completions:
+                    transcribed_text = response.completions[0].text
+                else:
+                    transcribed_text = ""
+                    hlog(f"Failed to transcribe audio: {response.error}")
+
+                text_content.append(self.wrap_transcribed_indicator(transcribed_text))
+            elif media_object.is_type(TEXT_TYPE):
+                assert media_object.text is not None, "Expected text content"
+                text_content.append(media_object.text)
+            else:
+                raise ValueError(f"Unrecognized media type: {media_object.type}")
+
+        text_prompt: str = "\n".join(text_content)
+        hlog(f"Transcribed prompt:\n{text_prompt}")
+
+        # Now make the request to the completion model with just a text-only prompt and no audio
+        # Use the same decoding parameters as the original request
+        # Ensure to set multimodal_prompt to None so the request is treated as text-only.
+        return self._openai_client.make_request(
+            replace(request, prompt=text_prompt, model=f"openai/{completion_model}", multimodal_prompt=None)
+        )
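The new `OpenAITranscriptionThenCompletionClient` above encodes both models in a single engine name of the form `transcription-model_completion-model`, splits it on the underscore, transcribes each audio object, and wraps the transcript in `[TRANSCRIBED AUDIO START]`/`[TRANSCRIBED AUDIO END]` markers before sending a text-only prompt to the completion model. A small sketch of that naming convention; the engine names below are illustrative, not taken from this diff:

```python
# Hypothetical engine name pairing a Whisper transcriber with a chat model.
model_engine = "whisper-1_gpt-4o"
assert model_engine.count("_") == 1  # the client enforces exactly one underscore
transcription_model, completion_model = model_engine.split("_")
print(transcription_model)  # whisper-1 -> used for the audio transcription request
print(completion_model)     # gpt-4o    -> receives the wrapped, text-only prompt
```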
@@ -12,7 +12,7 @@ from helm.common.tokenization_request import (
     TokenizationRequestResult,
 )
 from helm.tokenizers.tokenizer import Tokenizer
-from .client import CachingClient, truncate_sequence
+from helm.clients.client import CachingClient, truncate_sequence


 _CONTENT_MODERATION_KEY = "fail.content.moderation.failed"
@@ -163,6 +163,3 @@ class PalmyraChatClient(OpenAIClient):
             org_id=None,
             base_url="https://api.writer.com/v1/chat",
         )
-
-    def _is_chat_model_engine(self, model_engine: str) -> bool:
-        return True
@@ -9,7 +9,7 @@ from helm.common.request import wrap_request_time, Request, RequestResult, Gener
 from helm.common.hierarchical_logger import hlog
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.tokenizers.tokenizer import Tokenizer
-from .client import CachingClient, truncate_and_tokenize_response_text
+from helm.clients.client import CachingClient, truncate_and_tokenize_response_text

 try:
     import reka
@@ -0,0 +1,58 @@
+from typing import Optional
+
+from helm.clients.azure_openai_client import AzureOpenAIClient
+from helm.common.cache import CacheConfig
+from helm.proxy.retry import NonRetriableException
+from helm.tokenizers.tokenizer import Tokenizer
+
+
+class StanfordHealthCareAzureOpenAIClient(AzureOpenAIClient):
+    """
+    Client for accessing OpenAI models hosted on Stanford Health Care's model API.
+
+    Configure by setting the following in prod_env/credentials.conf:
+
+    ```
+    stanfordhealthcareEndpoint: https://your-domain-name/
+    stanfordhealthcareApiKey: your-private-key
+    ```
+    """
+
+    CREDENTIAL_HEADER_NAME = "Ocp-Apim-Subscription-Key"
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        openai_model_name: str,
+        api_version: str,
+        api_key: Optional[str] = None,
+        endpoint: Optional[str] = None,
+        base_url: Optional[str] = None,
+    ):
+        if not api_key:
+            raise NonRetriableException("Must provide API key through credentials.conf")
+        if base_url:
+            base_url = base_url.format(endpoint=endpoint)
+            super().__init__(
+                tokenizer=tokenizer,
+                tokenizer_name=tokenizer_name,
+                cache_config=cache_config,
+                api_key="unused",
+                base_url=base_url,
+                azure_openai_deployment_name=openai_model_name,
+                api_version=api_version,
+                default_headers={StanfordHealthCareAzureOpenAIClient.CREDENTIAL_HEADER_NAME: api_key},
+            )
+        else:
+            super().__init__(
+                tokenizer=tokenizer,
+                tokenizer_name=tokenizer_name,
+                cache_config=cache_config,
+                api_key="unused",
+                endpoint=endpoint,
+                azure_openai_deployment_name=openai_model_name,
+                api_version=api_version,
+                default_headers={StanfordHealthCareAzureOpenAIClient.CREDENTIAL_HEADER_NAME: api_key},
+            )
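The constructor above accepts either a `base_url` template or a plain `endpoint`; when a template is supplied it is filled in with the configured endpoint before being handed to the Azure OpenAI client, and the subscription key is always sent via the `Ocp-Apim-Subscription-Key` header rather than as the OpenAI API key. A sketch of the template handling; the template string below is a placeholder, not a value from this diff:

```python
# Placeholder values standing in for stanfordhealthcareEndpoint / a per-model base_url template.
endpoint = "https://your-domain-name/"
base_url_template = "{endpoint}gpt-4o/v1"  # hypothetical template
base_url = base_url_template.format(endpoint=endpoint)
print(base_url)  # https://your-domain-name/gpt-4o/v1
```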
@@ -0,0 +1,31 @@
+from typing import Any, Dict, List
+
+from helm.common.request import (
+    Request,
+    GeneratedOutput,
+)
+from helm.clients.stanfordhealthcare_http_model_client import StanfordHealthCareHTTPModelClient
+
+
+class StanfordHealthCareClaudeClient(StanfordHealthCareHTTPModelClient):
+    """
+    Client for accessing Claude models hosted on Stanford Health Care's model API.
+
+    Configure by setting the following in prod_env/credentials.conf:
+
+    ```
+    stanfordhealthcareEndpoint: https://your-domain-name/
+    stanfordhealthcareApiKey: your-private-key
+    ```
+    """
+
+    def get_request(self, request: Request) -> Dict[str, Any]:
+        return {
+            "model_id": self.model,
+            "prompt_text": request.prompt,
+            "max_tokens": request.max_tokens,
+            "temperature": request.temperature,
+        }
+
+    def parse_response(self, response: Dict[str, Any]) -> List[GeneratedOutput]:
+        return [GeneratedOutput(text=item["text"], logprob=0, tokens=[]) for item in response["content"]]
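`parse_response` above expects an Anthropic Messages-style payload: a top-level `content` list whose items each carry a `text` field. A minimal illustration with made-up values:

```python
# Made-up response payload matching the shape parse_response iterates over.
sample_response = {
    "content": [
        {"type": "text", "text": "Summary: the care plan was updated."},
    ]
}
texts = [item["text"] for item in sample_response["content"]]
print(texts)  # ['Summary: the care plan was updated.']
```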
@@ -0,0 +1,43 @@
+from typing import Any, Dict, List
+
+from helm.common.request import (
+    Request,
+    GeneratedOutput,
+)
+from helm.clients.stanfordhealthcare_http_model_client import StanfordHealthCareHTTPModelClient
+
+
+class StanfordHealthCareGoogleClient(StanfordHealthCareHTTPModelClient):
+    """
+    Client for accessing Google models hosted on Stanford Health Care's model API.
+
+    Configure by setting the following in prod_env/credentials.conf:
+
+    ```
+    stanfordhealthcareEndpoint: https://your-domain-name/
+    stanfordhealthcareApiKey: your-private-key
+    ```
+    """
+
+    def get_request(self, request: Request) -> Dict[str, Any]:
+        return {
+            "contents": {
+                "role": "user",
+                "parts": {"text": request.prompt},
+            },
+            "generation_config": {
+                "temperature": request.temperature,
+                "topP": request.top_p,
+                "topK": request.top_k_per_token,
+                "candidateCount": 1,
+                "maxOutputTokens": request.max_tokens,
+                "stopSequences": request.stop_sequences,
+            },
+        }
+
+    def parse_response(self, response: Dict[str, Any]) -> List[GeneratedOutput]:
+        completion = ""
+        for item in response["content"]:
+            if "content" in item["candidates"][0]:
+                completion += item["candidates"][0]["content"]["parts"][0]["text"]
+        return [GeneratedOutput(text=completion, logprob=0, tokens=[])]
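`parse_response` above reassembles a Gemini-style response: a `content` list of chunks, each holding a `candidates` entry whose `content.parts[0].text` fragments are concatenated in order, while chunks without a `content` key (for example, a final chunk carrying only a finish reason) are skipped. A minimal illustration with made-up values:

```python
# Made-up chunked response matching the shape parse_response expects.
sample_response = {
    "content": [
        {"candidates": [{"content": {"parts": [{"text": "The lab results "}]}}]},
        {"candidates": [{"content": {"parts": [{"text": "are within normal limits."}]}}]},
        {"candidates": [{"finishReason": "STOP"}]},  # no "content" key, so it is skipped
    ]
}
completion = ""
for item in sample_response["content"]:
    if "content" in item["candidates"][0]:
        completion += item["candidates"][0]["content"]["parts"][0]["text"]
print(completion)  # The lab results are within normal limits.
```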