crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. See the registry's advisory page for more details.

Files changed (580)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  31. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  32. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  33. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  34. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  35. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  36. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  37. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  38. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  39. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  40. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  41. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  42. helm/benchmark/annotation/medalign_annotator.py +100 -0
  43. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  44. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  45. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  46. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  47. helm/benchmark/annotation/model_as_judge.py +218 -6
  48. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  49. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  50. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  51. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  52. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  53. helm/benchmark/annotation/spider_annotator.py +18 -0
  54. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  55. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  56. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  57. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  58. helm/benchmark/annotation_executor.py +35 -15
  59. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  60. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  61. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  62. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  63. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  64. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  65. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  66. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  67. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  68. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  69. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  70. helm/benchmark/augmentations/perturbation.py +1 -1
  71. helm/benchmark/augmentations/space_perturbation.py +2 -2
  72. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  73. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  74. helm/benchmark/augmentations/test_perturbation.py +16 -13
  75. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  76. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  77. helm/benchmark/data_preprocessor.py +2 -2
  78. helm/benchmark/huggingface_registration.py +2 -7
  79. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  80. helm/benchmark/metrics/basic_metrics.py +6 -6
  81. helm/benchmark/metrics/bbq_metrics.py +2 -2
  82. helm/benchmark/metrics/bias_metrics.py +12 -3
  83. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  84. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  85. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  86. helm/benchmark/metrics/classification_metrics.py +76 -12
  87. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  88. helm/benchmark/metrics/code_metrics.py +5 -5
  89. helm/benchmark/metrics/comet_metric.py +125 -0
  90. helm/benchmark/metrics/common_metric_specs.py +9 -2
  91. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  92. helm/benchmark/metrics/copyright_metrics.py +4 -4
  93. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  94. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  95. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  96. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  97. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  98. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  99. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  100. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  101. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  102. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  103. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  104. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  105. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  106. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  107. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  108. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  109. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  110. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  111. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  112. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  113. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  114. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  115. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  116. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  117. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  118. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  120. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  121. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  122. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  123. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  124. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  125. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  126. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  127. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  128. helm/benchmark/metrics/medalign_metrics.py +34 -0
  129. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  130. helm/benchmark/metrics/medec_metrics.py +101 -0
  131. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  132. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  133. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  134. helm/benchmark/metrics/metric.py +3 -3
  135. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  136. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  137. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  138. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  139. helm/benchmark/metrics/nltk_helper.py +32 -0
  140. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  141. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  142. helm/benchmark/metrics/output_processing_metric.py +60 -0
  143. helm/benchmark/metrics/output_processors.py +15 -0
  144. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  145. helm/benchmark/metrics/ranking_metrics.py +3 -3
  146. helm/benchmark/metrics/reference_metric.py +3 -3
  147. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  148. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  149. helm/benchmark/metrics/spider_metrics.py +7 -0
  150. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  151. helm/benchmark/metrics/statistic.py +1 -1
  152. helm/benchmark/metrics/summac/model_summac.py +1 -1
  153. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  154. helm/benchmark/metrics/summarization_metrics.py +19 -9
  155. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  156. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  157. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  158. helm/benchmark/metrics/test_metric.py +1 -1
  159. helm/benchmark/metrics/test_statistic.py +2 -2
  160. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  161. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  162. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  163. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  164. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  165. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  167. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  169. helm/benchmark/metrics/unitxt_metrics.py +4 -1
  170. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  171. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  172. helm/benchmark/model_metadata_registry.py +16 -0
  173. helm/benchmark/presentation/summarize.py +23 -10
  174. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  175. helm/benchmark/reeval_run.py +203 -0
  176. helm/benchmark/reeval_runner.py +355 -0
  177. helm/benchmark/run.py +8 -17
  178. helm/benchmark/run_expander.py +78 -8
  179. helm/benchmark/run_spec_factory.py +12 -0
  180. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  181. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  182. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  183. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  184. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  185. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  186. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  187. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  188. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  189. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  190. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  191. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  192. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  193. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  194. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  195. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  196. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  197. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  198. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  199. helm/benchmark/run_specs/vlm_run_specs.py +75 -2
  200. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  201. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  202. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  203. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  204. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  205. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  206. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  207. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  208. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  209. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  210. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  211. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  212. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  213. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  214. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  215. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  216. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  217. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  218. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  219. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  220. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  221. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  222. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  223. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  224. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  225. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  226. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  227. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  228. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  229. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  230. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  231. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  232. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  233. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  234. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  235. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  236. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  237. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  238. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  239. helm/benchmark/scenarios/bold_scenario.py +1 -1
  240. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  241. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  242. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  243. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  244. helm/benchmark/scenarios/clear_scenario.py +153 -0
  245. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  246. helm/benchmark/scenarios/code_scenario.py +17 -4
  247. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  248. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  249. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  250. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  251. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  252. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  253. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  254. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  255. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  256. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  257. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  258. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  259. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  260. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  261. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  262. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  263. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  264. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  265. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  266. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  267. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  268. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  269. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  270. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  271. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  272. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  273. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  274. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  275. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  276. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  277. helm/benchmark/scenarios/ice_scenario.py +8 -4
  278. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  279. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  280. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  281. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  282. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  283. helm/benchmark/scenarios/koala_scenario.py +1 -1
  284. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  285. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  286. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  287. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  288. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  289. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  290. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  291. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  292. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  293. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  294. helm/benchmark/scenarios/math_scenario.py +9 -1
  295. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  296. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  297. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  298. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  299. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  300. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  301. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  302. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  303. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  304. helm/benchmark/scenarios/medec_scenario.py +120 -0
  305. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  306. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  307. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  308. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  309. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  310. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  311. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  312. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  313. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  314. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  315. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  316. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  317. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  318. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  319. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  320. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  321. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  322. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  323. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  324. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  325. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  326. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  328. helm/benchmark/scenarios/quac_scenario.py +10 -1
  329. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  330. helm/benchmark/scenarios/raft_scenario.py +17 -2
  331. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  332. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  333. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  334. helm/benchmark/scenarios/scenario.py +9 -1
  335. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  336. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  337. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  338. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  339. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  340. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  341. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  342. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  343. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  344. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  345. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  346. helm/benchmark/scenarios/spider_scenario.py +91 -0
  347. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  348. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  349. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  350. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  351. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  352. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  353. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  354. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  355. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  356. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  357. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  358. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  359. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  360. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  361. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  362. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  363. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  364. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  366. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  367. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  368. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  369. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  370. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  371. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  372. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  373. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  374. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  375. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  376. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  377. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  378. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  379. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  380. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  381. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  382. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  383. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  384. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  385. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  386. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  387. helm/benchmark/server.py +11 -0
  388. helm/benchmark/slurm_runner.py +1 -1
  389. helm/benchmark/static/schema_audio.yaml +752 -0
  390. helm/benchmark/static/schema_autobencher.yaml +150 -0
  391. helm/benchmark/static/schema_call_center.yaml +97 -60
  392. helm/benchmark/static/schema_capabilities.yaml +254 -0
  393. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  394. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  395. helm/benchmark/static/schema_enterprise.yaml +298 -0
  396. helm/benchmark/static/schema_finance.yaml +14 -12
  397. helm/benchmark/static/schema_heim.yaml +1389 -0
  398. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  399. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  400. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  401. helm/benchmark/static/schema_safety.yaml +18 -1
  402. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  403. helm/benchmark/static/schema_social_audio.yaml +224 -0
  404. helm/benchmark/static/schema_sql.yaml +171 -0
  405. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  406. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  407. helm/benchmark/static/schema_vhelm.yaml +109 -36
  408. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  409. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  410. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  411. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  412. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  413. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  414. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  415. helm/benchmark/static_build/config.js +1 -1
  416. helm/benchmark/static_build/index.html +5 -5
  417. helm/benchmark/window_services/default_window_service.py +1 -1
  418. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  419. helm/benchmark/window_services/ice_window_service.py +1 -1
  420. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  421. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  422. helm/benchmark/window_services/local_window_service.py +2 -2
  423. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  424. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  425. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  426. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  427. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  428. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  429. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  430. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  431. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  432. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  433. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  434. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  435. helm/benchmark/window_services/test_utils.py +1 -1
  436. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  437. helm/benchmark/window_services/yalm_window_service.py +1 -1
  438. helm/clients/ai21_client.py +3 -3
  439. helm/clients/aleph_alpha_client.py +1 -1
  440. helm/clients/audio_language/__init__.py +0 -0
  441. helm/clients/audio_language/diva_llama_client.py +118 -0
  442. helm/clients/audio_language/llama_omni_client.py +198 -0
  443. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  444. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  445. helm/clients/auto_client.py +4 -2
  446. helm/clients/azure_openai_client.py +55 -0
  447. helm/clients/bedrock_client.py +201 -7
  448. helm/clients/bedrock_utils.py +33 -0
  449. helm/clients/clip_scorers/clip_scorer.py +1 -1
  450. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  451. helm/clients/cohere_client.py +3 -3
  452. helm/clients/google_client.py +1 -1
  453. helm/clients/http_model_client.py +1 -1
  454. helm/clients/huggingface_client.py +10 -18
  455. helm/clients/ibm_client.py +267 -0
  456. helm/clients/image_generation/adobe_vision_client.py +1 -1
  457. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  458. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  459. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  460. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  461. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  462. helm/clients/image_generation/cogview2_client.py +1 -1
  463. helm/clients/image_generation/dalle2_client.py +1 -1
  464. helm/clients/image_generation/dalle3_client.py +2 -2
  465. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  466. helm/clients/image_generation/dalle_mini/data.py +1 -1
  467. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  468. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  469. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  470. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  471. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  472. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  473. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  474. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  475. helm/clients/image_generation/dalle_mini_client.py +1 -1
  476. helm/clients/image_generation/deep_floyd_client.py +1 -1
  477. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  478. helm/clients/image_generation/lexica_client.py +1 -1
  479. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  480. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  481. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  482. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  483. helm/clients/image_generation/mindalle_client.py +1 -1
  484. helm/clients/image_generation/together_image_generation_client.py +1 -1
  485. helm/clients/lit_gpt_client.py +2 -2
  486. helm/clients/mistral_client.py +62 -18
  487. helm/clients/nvidia_nim_client.py +0 -3
  488. helm/clients/openai_client.py +241 -22
  489. helm/clients/palmyra_client.py +1 -4
  490. helm/clients/reka_client.py +1 -1
  491. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  492. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  493. helm/clients/stanfordhealthcare_google_client.py +43 -0
  494. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  495. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  496. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  497. helm/clients/test_client.py +1 -1
  498. helm/clients/test_together_client.py +6 -1
  499. helm/clients/together_client.py +47 -7
  500. helm/clients/upstage_client.py +23 -0
  501. helm/clients/vertexai_client.py +39 -13
  502. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  503. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  504. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  505. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  506. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  507. helm/clients/vllm_client.py +4 -6
  508. helm/clients/yi_client.py +0 -3
  509. helm/common/audio_utils.py +111 -0
  510. helm/common/file_caches/local_file_cache.py +1 -1
  511. helm/common/file_caches/test_local_file_cache.py +1 -1
  512. helm/common/images_utils.py +2 -2
  513. helm/common/media_object.py +2 -2
  514. helm/common/multimodal_request_utils.py +26 -0
  515. helm/common/reeval_parameters.py +12 -0
  516. helm/common/request.py +6 -2
  517. helm/common/response_format.py +18 -0
  518. helm/common/test_media_object.py +1 -1
  519. helm/config/model_deployments.yaml +1112 -19
  520. helm/config/model_metadata.yaml +985 -44
  521. helm/config/tokenizer_configs.yaml +379 -3
  522. helm/proxy/cli.py +2 -2
  523. helm/proxy/example_queries.py +1 -1
  524. helm/proxy/server.py +11 -4
  525. helm/proxy/services/remote_service.py +1 -1
  526. helm/proxy/services/server_service.py +1 -1
  527. helm/proxy/services/test_remote_service.py +2 -2
  528. helm/proxy/services/test_service.py +1 -1
  529. helm/proxy/static/general.js +122 -0
  530. helm/proxy/static/help.html +99 -0
  531. helm/proxy/static/index.css +57 -0
  532. helm/proxy/static/index.html +40 -0
  533. helm/proxy/static/index.js +456 -0
  534. helm/proxy/static/info-icon.png +0 -0
  535. helm/proxy/test_retry.py +1 -1
  536. helm/proxy/token_counters/auto_token_counter.py +1 -1
  537. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  538. helm/tokenizers/caching_tokenizer.py +2 -30
  539. helm/tokenizers/http_model_tokenizer.py +1 -1
  540. helm/tokenizers/huggingface_tokenizer.py +2 -2
  541. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  542. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  543. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  544. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  545. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  546. helm/tokenizers/tokenizer.py +3 -1
  547. helm/tokenizers/yalm_tokenizer.py +3 -3
  548. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  549. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  550. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  551. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  552. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  553. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  554. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  555. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  556. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  557. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  558. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  559. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  560. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  561. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  562. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  563. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  564. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  565. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  566. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  567. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  568. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  569. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  570. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  571. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  572. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  573. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  574. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  575. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  576. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  577. helm/tokenizers/anthropic_tokenizer.py +0 -52
  578. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  579. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  580. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
@@ -16,6 +16,161 @@ model_deployments:
16
16
  client_spec:
17
17
  class_name: "helm.clients.simple_client.SimpleClient"
18
18
 
19
+ # Stanford Health Care
20
+ # Placed earlier in the file to make them non-default
21
+ - name: stanfordhealthcare/claude-3-5-sonnet-20241022
22
+ model_name: anthropic/claude-3-5-sonnet-20241022
23
+ tokenizer_name: anthropic/claude
24
+ max_sequence_length: 200000
25
+ client_spec:
26
+ class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
27
+ args:
28
+ model: anthropic.claude-3-5-sonnet-20241022-v2:0
29
+ deployment: Claude35Sonnetv2/awssig4fa
30
+
31
+ - name: stanfordhealthcare/claude-3-7-sonnet-20250219
32
+ model_name: anthropic/claude-3-7-sonnet-20250219
33
+ tokenizer_name: anthropic/claude
34
+ max_sequence_length: 200000
35
+ client_spec:
36
+ class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
37
+ args:
38
+ model: arn:aws:bedrock:us-west-2:679683451337:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0
39
+ deployment: awssig4claude37/aswsig4claude37
40
+
41
+ - name: stanfordhealthcare/gemini-1.5-pro-001
42
+ model_name: google/gemini-1.5-pro-001
43
+ tokenizer_name: google/gemma-2b
44
+ max_sequence_length: 1000000
45
+ client_spec:
46
+ class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
47
+ args:
48
+ deployment: gcpgemini/apim-gcp-oauth-fa
49
+
50
+ - name: stanfordhealthcare/gemini-2.0-flash-001
51
+ model_name: google/gemini-2.0-flash-001
52
+ tokenizer_name: google/gemma-2b
53
+ max_sequence_length: 1000000
54
+ client_spec:
55
+ class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
56
+ args:
57
+ deployment: gcp-gem20flash-fa/apim-gcp-gem20flash-fa
58
+
59
+ - name: stanfordhealthcare/gpt-4o-mini-2024-07-18
60
+ model_name: openai/gpt-4o-mini-2024-07-18
61
+ tokenizer_name: openai/o200k_base
62
+ max_sequence_length: 128000
63
+ client_spec:
64
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
65
+ args:
66
+ openai_model_name: gpt-4o-mini
67
+ api_version: 2023-05-15
68
+
69
+ - name: stanfordhealthcare/gpt-4o-2024-05-13
70
+ model_name: openai/gpt-4o-2024-05-13
71
+ tokenizer_name: openai/o200k_base
72
+ max_sequence_length: 128000
73
+ client_spec:
74
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
75
+ args:
76
+ openai_model_name: gpt-4o
77
+ api_version: 2023-05-15
78
+
79
+ - name: stanfordhealthcare/gpt-4-0613
80
+ model_name: openai/gpt-4-0613
81
+ tokenizer_name: openai/o200k_base
82
+ max_sequence_length: 8192
83
+ client_spec:
84
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
85
+ args:
86
+ openai_model_name: gpt-4
87
+ api_version: 2023-05-15
88
+
89
+ - name: stanfordhealthcare/gpt-4-turbo-2024-04-09
90
+ model_name: openai/gpt-4-turbo-2024-04-09
91
+ tokenizer_name: openai/cl100k_base
92
+ max_sequence_length: 128000
93
+ client_spec:
94
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
95
+ args:
96
+ openai_model_name: gpt-4-turbo
97
+ api_version: 2023-05-15
98
+
99
+ - name: stanfordhealthcare/o3-mini-2025-01-31
100
+ model_name: openai/o3-mini-2025-01-31
101
+ tokenizer_name: openai/cl100k_base
102
+ max_sequence_length: 200000
103
+ client_spec:
104
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
105
+ args:
106
+ openai_model_name: o3-mini
107
+ api_version: 2024-12-01-preview
108
+ base_url: "{endpoint}/openai-eastus2"
109
+
110
+ - name: stanfordhealthcare/o1-2024-12-17
111
+ model_name: openai/o1-2024-12-17
112
+ tokenizer_name: openai/cl100k_base
113
+ max_sequence_length: 128000
114
+ client_spec:
115
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
116
+ args:
117
+ openai_model_name: o1
118
+ api_version: 2024-12-01-preview
119
+ base_url: "{endpoint}/openai-eastus2"
120
+
121
+ - name: stanfordhealthcare/deepseek-r1
122
+ model_name: deepseek-ai/deepseek-r1
123
+ tokenizer_name: deepseek-ai/deepseek-r1
124
+ max_sequence_length: 128000
125
+ client_spec:
126
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
127
+ args:
128
+ openai_model_name: deepseek-chat
129
+ output_processor: helm.benchmark.metrics.output_processors.remove_deepseek_r1_thinking
130
+ base_url: "{endpoint}/deepseekr1/v1"
131
+
132
+ - name: stanfordhealthcare/llama-3.3-70b-instruct
133
+ model_name: meta/llama-3.3-70b-instruct
134
+ tokenizer_name: meta/llama-3.3-70b-instruct
135
+ max_sequence_length: 128000
136
+ client_spec:
137
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
138
+ args:
139
+ base_url: "{endpoint}/llama3370b/v1"
140
+
141
+ - name: stanfordhealthcare/phi-3.5-mini-instruct
142
+ model_name: microsoft/phi-3.5-mini-instruct
143
+ tokenizer_name: microsoft/phi-3.5-mini-instruct
144
+ max_sequence_length: 131072
145
+ client_spec:
146
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
147
+ args:
148
+ base_url: "{endpoint}/phi35mi/v1"
149
+
150
+ - name: stanfordhealthcare_shc/gpt-4o-2024-05-13
151
+ model_name: openai/gpt-4o-2024-05-13
152
+ tokenizer_name: openai/o200k_base
153
+ max_sequence_length: 128000
154
+ client_spec:
155
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
156
+ deployment: gpt-4o
157
+
158
+ - name: stanfordhealthcare_shc/gpt-4o-mini-2024-07-18
159
+ model_name: openai/gpt-4o-mini-2024-07-18
160
+ tokenizer_name: openai/o200k_base
161
+ max_sequence_length: 128000
162
+ client_spec:
163
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
164
+ deployment: gpt-4o-mini
165
+
166
+ - name: stanfordhealthcare_shc/gpt-4-turbo-2024-04-09
167
+ model_name: openai/gpt-4-turbo-2024-04-09
168
+ tokenizer_name: openai/cl100k_base
169
+ max_sequence_length: 128000
170
+ client_spec:
171
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
172
+ deployment: gpt-4-turbo-2024-04-09
173
+
19
174
  # Adobe
20
175
  - name: adobe/giga-gan
21
176
  model_name: adobe/giga-gan
@@ -104,7 +259,31 @@ model_deployments:
104
259
  class_name: "helm.benchmark.window_services.image_generation.clip_window_service.CLIPWindowService"
105
260
 
106
261
 
107
- # Amazon
262
+ # Amazon nova models
263
+
264
+ - name: amazon/nova-pro-v1:0
265
+ model_name: amazon/nova-pro-v1:0
266
+ tokenizer_name: huggingface/gpt2
267
+ max_sequence_length: 300000
268
+ client_spec:
269
+ class_name: "helm.clients.bedrock_client.BedrockNovaClient"
270
+
271
+ - name: amazon/nova-lite-v1:0
272
+ model_name: amazon/nova-lite-v1:0
273
+ tokenizer_name: huggingface/gpt2
274
+ max_sequence_length: 300000
275
+ client_spec:
276
+ class_name: "helm.clients.bedrock_client.BedrockNovaClient"
277
+
278
+ - name: amazon/nova-micro-v1:0
279
+ model_name: amazon/nova-micro-v1:0
280
+ tokenizer_name: huggingface/gpt2
281
+ max_sequence_length: 128000
282
+ client_spec:
283
+ class_name: "helm.clients.bedrock_client.BedrockNovaClient"
284
+
285
+ # Titan on Amazon Bedrock
286
+
108
287
  - name: amazon/titan-text-lite-v1
109
288
  model_name: amazon/titan-text-lite-v1
110
289
  tokenizer_name: huggingface/gpt2
@@ -112,20 +291,87 @@ model_deployments:
112
291
  client_spec:
113
292
  class_name: "helm.clients.bedrock_client.BedrockTitanClient"
114
293
 
115
- - name: amazon/titan-tg1-large
116
- model_name: amazon/titan-tg1-large
294
+ - name: amazon/titan-text-express-v1
295
+ model_name: amazon/titan-text-express-v1
117
296
  tokenizer_name: huggingface/gpt2
118
297
  max_sequence_length: 8000
119
298
  client_spec:
120
299
  class_name: "helm.clients.bedrock_client.BedrockTitanClient"
300
+
301
+ # Mistral on Amazon Bedrock
121
302
 
122
- - name: amazon/titan-text-express-v1
123
- model_name: amazon/titan-text-express-v1
303
+ - name: amazon/mistral-7b-instruct-v0:2
304
+ model_name: mistralai/amazon-mistral-7b-instruct-v0:2
124
305
  tokenizer_name: huggingface/gpt2
125
306
  max_sequence_length: 8000
126
307
  client_spec:
127
- class_name: "helm.clients.bedrock_client.BedrockTitanClient"
308
+ class_name: "helm.clients.bedrock_client.BedrockMistralClient"
309
+
310
+ - name: amazon/mixtral-8x7b-instruct-v0:1
311
+ model_name: mistralai/amazon-mixtral-8x7b-instruct-v0:1
312
+ tokenizer_name: huggingface/gpt2
313
+ max_sequence_length: 4000
314
+ client_spec:
315
+ class_name: "helm.clients.bedrock_client.BedrockMistralClient"
316
+
317
+ - name: amazon/mistral-large-2402-v1:0
318
+ model_name: mistralai/amazon-mistral-large-2402-v1:0
319
+ tokenizer_name: huggingface/gpt2
320
+ max_sequence_length: 8000
321
+ client_spec:
322
+ class_name: "helm.clients.bedrock_client.BedrockMistralClient"
323
+
324
+ - name: amazon/mistral-small-2402-v1:0
325
+ model_name: mistralai/amazon-mistral-small-2402-v1:0
326
+ tokenizer_name: huggingface/gpt2
327
+ max_sequence_length: 8000
328
+ client_spec:
329
+ class_name: "helm.clients.bedrock_client.BedrockMistralClient"
330
+
331
+ - name: amazon/mistral-large-2407-v1:0
332
+ model_name: mistralai/amazon-mistral-large-2407-v1:0
333
+ tokenizer_name: huggingface/gpt2
334
+ max_sequence_length: 8000
335
+ client_spec:
336
+ class_name: "helm.clients.bedrock_client.BedrockMistralClient"
128
337
 
338
+ # Llama 3 on Amazon Bedrock
339
+
340
+ - name: amazon/llama3-8b-instruct-v1:0
341
+ model_name: meta/amazon-llama3-8b-instruct-v1:0
342
+ tokenizer_name: huggingface/gpt2
343
+ max_sequence_length: 2000
344
+ client_spec:
345
+ class_name: "helm.clients.bedrock_client.BedrockLlamaClient"
346
+
347
+ - name: amazon/llama3-70b-instruct-v1:0
348
+ model_name: meta/amazon-llama3-70b-instruct-v1:0
349
+ tokenizer_name: huggingface/gpt2
350
+ max_sequence_length: 2000
351
+ client_spec:
352
+ class_name: "helm.clients.bedrock_client.BedrockLlamaClient"
353
+
354
+ - name: amazon/llama3-1-405b-instruct-v1:0
355
+ model_name: meta/amazon-llama3-1-405b-instruct-v1:0
356
+ tokenizer_name: huggingface/gpt2
357
+ max_sequence_length: 2000
358
+ client_spec:
359
+ class_name: "helm.clients.bedrock_client.BedrockLlamaClient"
360
+
361
+ - name: amazon/llama3-1-70b-instruct-v1:0
362
+ model_name: meta/amazon-llama3-1-70b-instruct-v1:0
363
+ tokenizer_name: huggingface/gpt2
364
+ max_sequence_length: 2000
365
+ client_spec:
366
+ class_name: "helm.clients.bedrock_client.BedrockLlamaClient"
367
+
368
+
369
+ - name: amazon/llama3-1-8b-instruct-v1:0
370
+ model_name: meta/amazon-llama3-1-8b-instruct-v1:0
371
+ tokenizer_name: huggingface/gpt2
372
+ max_sequence_length: 2000
373
+ client_spec:
374
+ class_name: "helm.clients.bedrock_client.BedrockLlamaClient"
129
375
 
130
376
  # Anthropic
131
377
  - name: anthropic/claude-v1.3
@@ -189,6 +435,13 @@ model_deployments:
189
435
  client_spec:
190
436
  class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
191
437
 
438
+ - name: anthropic/claude-3-5-haiku-20241022
439
+ model_name: anthropic/claude-3-5-haiku-20241022
440
+ tokenizer_name: anthropic/claude
441
+ max_sequence_length: 200000
442
+ client_spec:
443
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
444
+
192
445
  - name: anthropic/claude-3-5-sonnet-20240620
193
446
  model_name: anthropic/claude-3-5-sonnet-20240620
194
447
  tokenizer_name: anthropic/claude
@@ -196,6 +449,20 @@ model_deployments:
196
449
  client_spec:
197
450
  class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
198
451
 
452
+ - name: anthropic/claude-3-5-sonnet-20241022
453
+ model_name: anthropic/claude-3-5-sonnet-20241022
454
+ tokenizer_name: anthropic/claude
455
+ max_sequence_length: 200000
456
+ client_spec:
457
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
458
+
459
+ - name: anthropic/claude-3-7-sonnet-20250219
460
+ model_name: anthropic/claude-3-7-sonnet-20250219
461
+ tokenizer_name: anthropic/claude
462
+ max_sequence_length: 200000
463
+ client_spec:
464
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
465
+
199
466
  - name: anthropic/stanford-online-all-v4-s3
200
467
  deprecated: true # Closed model, not accessible via API
201
468
  model_name: anthropic/stanford-online-all-v4-s3
@@ -307,6 +574,35 @@ model_deployments:
307
574
  client_spec:
308
575
  class_name: "helm.clients.together_client.TogetherChatClient"
309
576
 
577
+ - name: together/deepseek-v3
578
+ model_name: deepseek-ai/deepseek-v3
579
+ tokenizer_name: deepseek-ai/deepseek-v3
580
+ max_sequence_length: 16384
581
+ client_spec:
582
+ class_name: "helm.clients.together_client.TogetherChatClient"
583
+ args:
584
+ disable_logprobs: True
585
+
586
+ - name: together/deepseek-r1
587
+ model_name: deepseek-ai/deepseek-r1
588
+ tokenizer_name: deepseek-ai/deepseek-r1
589
+ max_sequence_length: 32768
590
+ client_spec:
591
+ class_name: "helm.clients.together_client.TogetherChatClient"
592
+ args:
593
+ disable_logprobs: True
594
+
595
+ - name: together/deepseek-r1-hide-reasoning
596
+ model_name: deepseek-ai/deepseek-r1-hide-reasoning
597
+ tokenizer_name: deepseek-ai/deepseek-r1
598
+ max_sequence_length: 32768
599
+ client_spec:
600
+ class_name: "helm.clients.together_client.TogetherChatClient"
601
+ args:
602
+ together_model: deepseek-ai/deepseek-r1
603
+ disable_logprobs: True
604
+ output_processor: helm.benchmark.metrics.output_processors.remove_deepseek_r1_thinking
605
+
310
606
  # Gooseai
311
607
 
312
608
  # TODO: Migrate these models to use OpenAIClient
@@ -422,6 +718,7 @@ model_deployments:
422
718
  client_spec:
423
719
  class_name: "helm.clients.vertexai_client.VertexAIChatClient"
424
720
  args:
721
+ vertexai_model: gemini-1.5-pro-001
425
722
  safety_settings_preset: default
426
723
 
427
724
  - name: google/gemini-1.5-pro-001-safety-block-none
@@ -432,6 +729,7 @@ model_deployments:
432
729
  client_spec:
433
730
  class_name: "helm.clients.vertexai_client.VertexAIChatClient"
434
731
  args:
732
+ vertexai_model: gemini-1.5-pro-001
435
733
  safety_settings_preset: block_none
436
734
 
437
735
  - name: google/gemini-1.5-flash-001-safety-default
@@ -442,6 +740,7 @@ model_deployments:
442
740
  client_spec:
443
741
  class_name: "helm.clients.vertexai_client.VertexAIChatClient"
444
742
  args:
743
+ vertexai_model: gemini-1.5-flash-001
445
744
  safety_settings_preset: default
446
745
 
447
746
  - name: google/gemini-1.5-flash-001-safety-block-none
@@ -452,8 +751,99 @@ model_deployments:
452
751
  client_spec:
453
752
  class_name: "helm.clients.vertexai_client.VertexAIChatClient"
454
753
  args:
754
+ vertexai_model: gemini-1.5-flash-001
455
755
  safety_settings_preset: block_none
456
756
 
757
+ - name: google/gemini-1.5-pro-002
758
+ model_name: google/gemini-1.5-pro-002
759
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
760
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
761
+ # TODO: Max output tokens: 8192
762
+ client_spec:
763
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
764
+
765
+ - name: google/gemini-1.5-flash-002
766
+ model_name: google/gemini-1.5-flash-002
767
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
768
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
769
+ # TODO: Max output tokens: 8192
770
+ client_spec:
771
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
772
+
773
+ - name: google/gemini-2.0-pro-exp-02-05
774
+ model_name: google/gemini-2.0-pro-exp-02-05
775
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
776
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
777
+ # TODO: Max output tokens: 8192
778
+ client_spec:
779
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
780
+
781
+ - name: google/gemini-2.0-flash-exp
782
+ model_name: google/gemini-2.0-flash-exp
783
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
784
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
785
+ # TODO: Max output tokens: 8192
786
+ client_spec:
787
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
788
+
789
+ - name: google/gemini-2.0-flash-001
790
+ model_name: google/gemini-2.0-flash-001
791
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
792
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
793
+ # TODO: Max output tokens: 8192
794
+ client_spec:
795
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
796
+
797
+ - name: google/gemini-2.0-flash-lite-preview-02-05
798
+ model_name: google/gemini-2.0-flash-lite-preview-02-05
799
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
800
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
801
+ # TODO: Max output tokens: 8192
802
+ client_spec:
803
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
804
+
805
+ - name: google/gemini-2.0-flash-thinking-exp-01-21
806
+ model_name: google/gemini-2.0-flash-thinking-exp-01-21
807
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
808
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
809
+ client_spec:
810
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
811
+
812
+ - name: google/gemini-1.5-flash-8b-001
813
+ model_name: google/gemini-1.5-flash-8b-001
814
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
815
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
816
+ # TODO: Max output tokens: 8192
817
+ client_spec:
818
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
819
+
820
+ - name: google/llama-3.1-8b-instruct
821
+ model_name: meta/llama-3.1-8b-instruct
822
+ tokenizer_name: meta/llama-3.1-8b-instruct
823
+ max_sequence_length: 128000
824
+ client_spec:
825
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
826
+ args:
827
+ vertexai_model: publishers/meta/models/llama-3.1-8b-instruct-maas
828
+
829
+ - name: google/llama-3.1-70b-instruct
830
+ model_name: meta/llama-3.1-70b-instruct
831
+ tokenizer_name: meta/llama-3.1-8b-instruct
832
+ max_sequence_length: 128000
833
+ client_spec:
834
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
835
+ args:
836
+ vertexai_model: publishers/meta/models/llama-3.1-70b-instruct-maas
837
+
838
+ - name: google/llama-3.1-405b-instruct
839
+ model_name: meta/llama-3.1-405b-instruct
840
+ tokenizer_name: meta/llama-3.1-8b-instruct
841
+ max_sequence_length: 128000
842
+ client_spec:
843
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
844
+ args:
845
+ vertexai_model: publishers/meta/models/llama-3.1-405b-instruct-maas
846
+
457
847
  ## Gemma
458
848
  - name: together/gemma-2b
459
849
  model_name: google/gemma-2b
@@ -646,6 +1036,66 @@ model_deployments:
646
1036
  device_map: auto
647
1037
  torch_dtype: torch.bfloat16
648
1038
 
1039
+ - name: huggingface/gemma2-9b-cpt-sea-lionv3-base
1040
+ model_name: aisingapore/gemma2-9b-cpt-sea-lionv3-base
1041
+ tokenizer_name: google/gemma-2-9b
1042
+ max_sequence_length: 8192
1043
+ client_spec:
1044
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1045
+ args:
1046
+ device_map: auto
1047
+ torch_dtype: torch.bfloat16
1048
+
1049
+ - name: huggingface/gemma2-9b-cpt-sea-lionv3-instruct
1050
+ model_name: aisingapore/gemma2-9b-cpt-sea-lionv3-instruct
1051
+ tokenizer_name: google/gemma-2-9b
1052
+ max_sequence_length: 8192
1053
+ client_spec:
1054
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1055
+ args:
1056
+ device_map: auto
1057
+ torch_dtype: torch.bfloat16
1058
+
1059
+ - name: huggingface/llama3.1-8b-cpt-sea-lionv3-base
1060
+ model_name: aisingapore/llama3.1-8b-cpt-sea-lionv3-base
1061
+ tokenizer_name: meta/llama-3.1-8b
1062
+ max_sequence_length: 128000
1063
+ client_spec:
1064
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1065
+ args:
1066
+ device_map: auto
1067
+ torch_dtype: torch.bfloat16
1068
+
1069
+ - name: huggingface/llama3.1-8b-cpt-sea-lionv3-instruct
1070
+ model_name: aisingapore/llama3.1-8b-cpt-sea-lionv3-instruct
1071
+ tokenizer_name: meta/llama-3.1-8b
1072
+ max_sequence_length: 128000
1073
+ client_spec:
1074
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1075
+ args:
1076
+ device_map: auto
1077
+ torch_dtype: torch.bfloat16
1078
+
1079
+ - name: huggingface/llama3.1-70b-cpt-sea-lionv3-base
1080
+ model_name: aisingapore/llama3.1-70b-cpt-sea-lionv3-base
1081
+ tokenizer_name: meta/llama-3.1-8b
1082
+ max_sequence_length: 128000
1083
+ client_spec:
1084
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1085
+ args:
1086
+ device_map: auto
1087
+ torch_dtype: torch.bfloat16
1088
+
1089
+ - name: huggingface/llama3.1-70b-cpt-sea-lionv3-instruct
1090
+ model_name: aisingapore/llama3.1-70b-cpt-sea-lionv3-instruct
1091
+ tokenizer_name: meta/llama-3.1-8b
1092
+ max_sequence_length: 128000
1093
+ client_spec:
1094
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1095
+ args:
1096
+ device_map: auto
1097
+ torch_dtype: torch.bfloat16
1098
+
649
1099
  ## Bigcode
650
1100
  - name: huggingface/santacoder
651
1101
  model_name: bigcode/santacoder
@@ -803,6 +1253,24 @@ model_deployments:
803
1253
  class_name: "helm.clients.huggingface_client.HuggingFaceClient"
804
1254
 
805
1255
  ## Meta
1256
+ - name: huggingface/llama-3.1-8b-instruct
1257
+ model_name: meta/llama-3.1-8b-instruct
1258
+ tokenizer_name: meta/llama-3.1-8b-instruct
1259
+ max_sequence_length: 131072
1260
+ client_spec:
1261
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1262
+ args:
1263
+ pretrained_model_name_or_path: meta-llama/Llama-3.1-8B-Instruct
1264
+
1265
+ - name: huggingface/llama-3.2-1b-instruct
1266
+ model_name: meta/llama-3.2-1b-instruct
1267
+ tokenizer_name: meta/llama-3.2-1b-instruct
1268
+ max_sequence_length: 131072
1269
+ client_spec:
1270
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1271
+ args:
1272
+ pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
1273
+
806
1274
  - name: huggingface/opt-175b
807
1275
  model_name: meta/opt-175b
808
1276
  tokenizer_name: facebook/opt-66b
@@ -881,7 +1349,22 @@ model_deployments:
881
1349
  max_sequence_length: 2048
882
1350
  client_spec:
883
1351
  class_name: "helm.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient"
1352
+
1353
+ ## NECTEC
1354
+ - name: huggingface/Pathumma-llm-text-1.0.0
1355
+ model_name: nectec/Pathumma-llm-text-1.0.0
1356
+ tokenizer_name: nectec/Pathumma-llm-text-1.0.0
1357
+ max_sequence_length: 8192
1358
+ client_spec:
1359
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
884
1360
 
1361
+ - name: huggingface/OpenThaiLLM-Prebuilt-7B
1362
+ model_name: nectec/OpenThaiLLM-Prebuilt-7B
1363
+ tokenizer_name: nectec/OpenThaiLLM-Prebuilt-7B
1364
+ max_sequence_length: 4096
1365
+ client_spec:
1366
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1367
+
885
1368
  ## KAIST AI
886
1369
  - name: huggingface/prometheus-vision-13b-v1.0-hf
887
1370
  model_name: kaistai/prometheus-vision-13b-v1.0-hf
@@ -929,7 +1412,37 @@ model_deployments:
929
1412
  args:
930
1413
  device_map: auto
931
1414
  torch_dtype: auto
932
-
1415
+
1416
+ - name: huggingface/phi-3.5-mini-instruct-4bit
1417
+ model_name: microsoft/phi-3.5-mini-instruct
1418
+ tokenizer_name: microsoft/phi-3.5-mini-instruct
1419
+ max_sequence_length: 131072
1420
+ client_spec:
1421
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1422
+ args:
1423
+ device_map: auto
1424
+ torch_dtype: "float16"
1425
+ quantization_config:
1426
+ load_in_4bit: true
1427
+ attn_implementation: "flash_attention_2"
1428
+
1429
+ - name: huggingface/phi-3.5-mini-instruct
1430
+ model_name: microsoft/phi-3.5-mini-instruct
1431
+ tokenizer_name: microsoft/phi-3.5-mini-instruct
1432
+ max_sequence_length: 131072
1433
+ client_spec:
1434
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1435
+
1436
+ - name: huggingface/phi-3.5-moe-instruct
1437
+ model_name: microsoft/phi-3.5-moe-instruct
1438
+ tokenizer_name: microsoft/phi-3.5-mini-instruct
1439
+ max_sequence_length: 131072
1440
+ client_spec:
1441
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1442
+ args:
1443
+ device_map: auto
1444
+ torch_dtype: auto
1445
+
933
1446
  ## Mistral AI
934
1447
  - name: huggingface/bakLlava-v1-hf
935
1448
  model_name: mistralai/bakLlava-v1-hf
@@ -1172,6 +1685,17 @@ model_deployments:
1172
1685
  client_spec:
1173
1686
  class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1174
1687
 
1688
+ # Upstage
1689
+ - name: huggingface/solar-pro-preview-instruct
1690
+ model_name: upstage/solar-pro-preview-instruct
1691
+ tokenizer_name: upstage/solar-pro-preview-instruct
1692
+ max_sequence_length: 4096
1693
+ client_spec:
1694
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1695
+ args:
1696
+ torch_dtype: auto
1697
+ trust_remote_code: true
1698
+
1175
1699
  ## Text-to-Image Diffusion Models
1176
1700
 
1177
1701
  - name: huggingface/dreamlike-diffusion-v1-0
@@ -1406,14 +1930,20 @@ model_deployments:
1406
1930
  precision: bf16-true
1407
1931
 
1408
1932
  # Mistral AI
1409
- - name: mistralai/mistral-tiny
1410
- model_name: mistralai/mistral-7b-v0.1
1411
- tokenizer_name: mistralai/Mistral-7B-v0.1
1412
- max_sequence_length: 32000
1933
+
1934
+ - name: mistralai/ministral-3b-2410
1935
+ model_name: mistralai/ministral-3b-2410
1936
+ tokenizer_name: mistralai/Ministral-8B-Instruct-2410
1937
+ max_sequence_length: 128000
1938
+ client_spec:
1939
+ class_name: "helm.clients.mistral_client.MistralAIClient"
1940
+
1941
+ - name: mistralai/ministral-8b-2410
1942
+ model_name: mistralai/ministral-8b-2410
1943
+ tokenizer_name: mistralai/Ministral-8B-Instruct-2410
1944
+ max_sequence_length: 128000
1413
1945
  client_spec:
1414
1946
  class_name: "helm.clients.mistral_client.MistralAIClient"
1415
- args:
1416
- mistral_model: "mistral-tiny"
1417
1947
 
1418
1948
  - name: mistralai/mistral-small-2402
1419
1949
  model_name: mistralai/mistral-small-2402
@@ -1422,6 +1952,27 @@ model_deployments:
1422
1952
  client_spec:
1423
1953
  class_name: "helm.clients.mistral_client.MistralAIClient"
1424
1954
 
1955
+ - name: mistralai/mistral-small-2409
1956
+ model_name: mistralai/mistral-small-2409
1957
+ tokenizer_name: mistralai/Mistral-7B-v0.1
1958
+ max_sequence_length: 32000
1959
+ client_spec:
1960
+ class_name: "helm.clients.mistral_client.MistralAIClient"
1961
+
1962
+ - name: mistralai/mistral-small-2501
1963
+ model_name: mistralai/mistral-small-2501
1964
+ tokenizer_name: mistralai/Mistral-Small-24B-Instruct-2501
1965
+ max_sequence_length: 32000
1966
+ client_spec:
1967
+ class_name: "helm.clients.mistral_client.MistralAIClient"
1968
+
1969
+ - name: mistralai/mistral-small-2503
1970
+ model_name: mistralai/mistral-small-2503
1971
+ tokenizer_name: mistralai/Mistral-Small-24B-Instruct-2501
1972
+ max_sequence_length: 128000
1973
+ client_spec:
1974
+ class_name: "helm.clients.mistral_client.MistralAIClient"
1975
+
1425
1976
  - name: mistralai/mistral-medium-2312
1426
1977
  model_name: mistralai/mistral-medium-2312
1427
1978
  tokenizer_name: mistralai/Mistral-7B-v0.1
@@ -1443,13 +1994,35 @@ model_deployments:
1443
1994
  client_spec:
1444
1995
  class_name: "helm.clients.mistral_client.MistralAIClient"
1445
1996
 
1446
- - name: mistralai/open-mistral-nemo-2407
1447
- model_name: mistralai/open-mistral-nemo-2407
1448
- tokenizer_name: mistralai/Mistral-Nemo-Base-2407
1997
+ - name: mistralai/mistral-large-2411
1998
+ model_name: mistralai/mistral-large-2411
1999
+ tokenizer_name: mistralai/Mistral-Large-Instruct-2411
2000
+ max_sequence_length: 128000
2001
+ client_spec:
2002
+ class_name: "helm.clients.mistral_client.MistralAIClient"
2003
+
2004
+ - name: mistralai/open-mistral-nemo-2407
2005
+ model_name: mistralai/open-mistral-nemo-2407
2006
+ tokenizer_name: mistralai/Mistral-Nemo-Base-2407
2007
+ max_sequence_length: 128000
2008
+ client_spec:
2009
+ class_name: "helm.clients.mistral_client.MistralAIClient"
2010
+
2011
+ - name: mistralai/pixtral-12b-2409
2012
+ model_name: mistralai/pixtral-12b-2409
2013
+ tokenizer_name: mistralai/Mistral-7B-v0.1
2014
+ max_sequence_length: 128000
2015
+ client_spec:
2016
+ class_name: "helm.clients.mistral_client.MistralAIClient"
2017
+
2018
+ - name: mistralai/pixtral-large-2411
2019
+ model_name: mistralai/pixtral-large-2411
2020
+ tokenizer_name: mistralai/Mistral-Large-Instruct-2407
1449
2021
  max_sequence_length: 128000
1450
2022
  client_spec:
1451
2023
  class_name: "helm.clients.mistral_client.MistralAIClient"
1452
2024
 
2025
+
1453
2026
  # Neurips
1454
2027
  - name: neurips/local
1455
2028
  model_name: neurips/local
@@ -1484,7 +2057,7 @@ model_deployments:
1484
2057
  # to provide a margin of error.
1485
2058
  max_sequence_length: 16000
1486
2059
  client_spec:
1487
- class_name: "helm.clients.openai_client.OpenAIClient"
2060
+ class_name: "helm.clients.openai_client.OpenAILegacyCompletionsClient"
1488
2061
 
1489
2062
  - name: openai/babbage-002
1490
2063
  model_name: openai/babbage-002
@@ -1493,7 +2066,7 @@ model_deployments:
1493
2066
  # to provide a margin of error.
1494
2067
  max_sequence_length: 16000
1495
2068
  client_spec:
1496
- class_name: "helm.clients.openai_client.OpenAIClient"
2069
+ class_name: "helm.clients.openai_client.OpenAILegacyCompletionsClient"
1497
2070
 
1498
2071
  ## GPT 3.5 Turbo Models
1499
2072
  # ChatGPT: https://openai.com/blog/chatgpt
@@ -1504,7 +2077,7 @@ model_deployments:
1504
2077
  max_sequence_length: 4096
1505
2078
  max_request_length: 4097
1506
2079
  client_spec:
1507
- class_name: "helm.clients.openai_client.OpenAIClient"
2080
+ class_name: "helm.clients.openai_client.OpenAILegacyCompletionsClient"
1508
2081
 
1509
2082
  # The claimed sequence length is 4096, but as of 2023-03-07, the empirical usable
1510
2083
  # sequence length is smaller at 4087 with one user input message and one assistant
@@ -1636,6 +2209,13 @@ model_deployments:
1636
2209
  client_spec:
1637
2210
  class_name: "helm.clients.openai_client.OpenAIClient"
1638
2211
 
2212
+ - name: openai/gpt-4o-2024-11-20
2213
+ model_name: openai/gpt-4o-2024-11-20
2214
+ tokenizer_name: openai/o200k_base
2215
+ max_sequence_length: 128000
2216
+ client_spec:
2217
+ class_name: "helm.clients.openai_client.OpenAIClient"
2218
+
1639
2219
  - name: openai/gpt-4o-mini-2024-07-18
1640
2220
  model_name: openai/gpt-4o-mini-2024-07-18
1641
2221
  tokenizer_name: openai/o200k_base
@@ -1643,6 +2223,34 @@ model_deployments:
1643
2223
  client_spec:
1644
2224
  class_name: "helm.clients.openai_client.OpenAIClient"
1645
2225
 
2226
+ - name: openai/whisper-1_gpt-4o-2024-11-20
2227
+ model_name: openai/whisper-1_gpt-4o-2024-11-20
2228
+ tokenizer_name: openai/o200k_base
2229
+ max_sequence_length: 128000
2230
+ client_spec:
2231
+ class_name: "helm.clients.openai_client.OpenAITranscriptionThenCompletionClient"
2232
+
2233
+ - name: openai/gpt-4o-audio-preview-2024-10-01
2234
+ model_name: openai/gpt-4o-audio-preview-2024-10-01
2235
+ tokenizer_name: openai/o200k_base
2236
+ max_sequence_length: 128000
2237
+ client_spec:
2238
+ class_name: "helm.clients.openai_client.OpenAIClient"
2239
+
2240
+ - name: openai/gpt-4o-audio-preview-2024-12-17
2241
+ model_name: openai/gpt-4o-audio-preview-2024-12-17
2242
+ tokenizer_name: openai/o200k_base
2243
+ max_sequence_length: 128000
2244
+ client_spec:
2245
+ class_name: "helm.clients.openai_client.OpenAIClient"
2246
+
2247
+ - name: openai/gpt-4o-mini-audio-preview-2024-12-17
2248
+ model_name: openai/gpt-4o-mini-audio-preview-2024-12-17
2249
+ tokenizer_name: openai/o200k_base
2250
+ max_sequence_length: 128000
2251
+ client_spec:
2252
+ class_name: "helm.clients.openai_client.OpenAIClient"
2253
+
1646
2254
  - name: openai/gpt-4-vision-preview
1647
2255
  model_name: openai/gpt-4-vision-preview
1648
2256
  tokenizer_name: openai/cl100k_base
@@ -1661,7 +2269,42 @@ model_deployments:
1661
2269
  client_spec:
1662
2270
  class_name: "helm.clients.openai_client.OpenAIClient"
1663
2271
 
2272
+ ## GPT-4.5
2273
+ - name: openai/gpt-4.5-preview-2025-02-27
2274
+ model_name: openai/gpt-4.5-preview-2025-02-27
2275
+ tokenizer_name: openai/o200k_base
2276
+ max_sequence_length: 128000
2277
+ client_spec:
2278
+ class_name: "helm.clients.openai_client.OpenAIClient"
2279
+
1664
2280
  ## o1 Models
2281
+ - name: openai/o1-2024-12-17
2282
+ model_name: openai/o1-2024-12-17
2283
+ tokenizer_name: openai/cl100k_base
2284
+ max_sequence_length: 128000
2285
+ client_spec:
2286
+ class_name: "helm.clients.openai_client.OpenAIClient"
2287
+
2288
+ - name: openai/o1-2024-12-17-low-reasoning-effort
2289
+ model_name: openai/o1-2024-12-17-low-reasoning-effort
2290
+ tokenizer_name: openai/cl100k_base
2291
+ max_sequence_length: 128000
2292
+ client_spec:
2293
+ class_name: "helm.clients.openai_client.OpenAIClient"
2294
+ args:
2295
+ openai_model_name: o1-2024-12-17
2296
+ reasoning_effort: low
2297
+
2298
+ - name: openai/o1-2024-12-17-high-reasoning-effort
2299
+ model_name: openai/o1-2024-12-17-high-reasoning-effort
2300
+ tokenizer_name: openai/cl100k_base
2301
+ max_sequence_length: 128000
2302
+ client_spec:
2303
+ class_name: "helm.clients.openai_client.OpenAIClient"
2304
+ args:
2305
+ openai_model_name: o1-2024-12-17
2306
+ reasoning_effort: high
2307
+
1665
2308
  - name: openai/o1-preview-2024-09-12
1666
2309
  model_name: openai/o1-preview-2024-09-12
1667
2310
  tokenizer_name: openai/cl100k_base
@@ -1676,6 +2319,33 @@ model_deployments:
1676
2319
  client_spec:
1677
2320
  class_name: "helm.clients.openai_client.OpenAIClient"
1678
2321
 
2322
+ - name: openai/o3-mini-2025-01-31
2323
+ model_name: openai/o3-mini-2025-01-31
2324
+ tokenizer_name: openai/cl100k_base
2325
+ max_sequence_length: 200000
2326
+ client_spec:
2327
+ class_name: "helm.clients.openai_client.OpenAIClient"
2328
+
2329
+ - name: openai/o3-mini-2025-01-31-low-reasoning-effort
2330
+ model_name: openai/o3-mini-2025-01-31-low-reasoning-effort
2331
+ tokenizer_name: openai/cl100k_base
2332
+ max_sequence_length: 200000
2333
+ client_spec:
2334
+ class_name: "helm.clients.openai_client.OpenAIClient"
2335
+ args:
2336
+ openai_model_name: o3-mini-2025-01-31
2337
+ reasoning_effort: low
2338
+
2339
+ - name: openai/o3-mini-2025-01-31-high-reasoning-effort
2340
+ model_name: openai/o3-mini-2025-01-31-high-reasoning-effort
2341
+ tokenizer_name: openai/cl100k_base
2342
+ max_sequence_length: 200000
2343
+ client_spec:
2344
+ class_name: "helm.clients.openai_client.OpenAIClient"
2345
+ args:
2346
+ openai_model_name: o3-mini-2025-01-31
2347
+ reasoning_effort: high
2348
+
1679
2349
  ## Text Similarity Models
1680
2350
  # OpenAI similarity embedding models: https://beta.openai.com/docs/guides/embeddings
1681
2351
  # The number of parameters is guessed based on the number of parameters of the
@@ -1985,6 +2655,15 @@ model_deployments:
1985
2655
  args:
1986
2656
  together_model: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo
1987
2657
 
2658
+ - name: together/llama-3.3-70b-instruct-turbo
2659
+ model_name: meta/llama-3.3-70b-instruct-turbo
2660
+ tokenizer_name: meta/llama-3.3-70b-instruct
2661
+ max_sequence_length: 128000
2662
+ client_spec:
2663
+ class_name: "helm.clients.together_client.TogetherChatClient"
2664
+ args:
2665
+ together_model: meta-llama/Llama-3.3-70B-Instruct-Turbo
2666
+
1988
2667
  - name: together/llama-guard-7b
1989
2668
  model_name: meta/llama-guard-7b
1990
2669
  tokenizer_name: meta-llama/Llama-2-7b-hf
@@ -2012,6 +2691,16 @@ model_deployments:
2012
2691
  args:
2013
2692
  together_model: meta-llama/Meta-Llama-Guard-3-8B
2014
2693
 
2694
+ # NVIDIA
2695
+ - name: together/llama-3.1-nemotron-70b-instruct
2696
+ model_name: nvidia/llama-3.1-nemotron-70b-instruct
2697
+ tokenizer_name: nvidia/llama-3.1-nemotron-70b-instruct
2698
+ max_sequence_length: 32768
2699
+ client_spec:
2700
+ class_name: "helm.clients.together_client.TogetherClient"
2701
+ args:
2702
+ together_model: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF
2703
+
2015
2704
  # 01.AI
2016
2705
  - name: together/yi-6b
2017
2706
  model_name: 01-ai/yi-6b
@@ -2119,12 +2808,22 @@ model_deployments:
2119
2808
  client_spec:
2120
2809
  class_name: "helm.clients.together_client.TogetherChatClient"
2121
2810
 
2811
+ - name: huggingface/mistral-7b-instruct-v0.3
2812
+ model_name: mistralai/mistral-7b-instruct-v0.3-hf
2813
+ tokenizer_name: mistralai/Mistral-7B-Instruct-v0.3
2814
+ max_sequence_length: 32000
2815
+ client_spec:
2816
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
2817
+ args:
2818
+ pretrained_model_name_or_path: mistralai/Mistral-7B-Instruct-v0.3
2819
+
2122
2820
  - name: together/mistral-7b-instruct-v0.3
2123
2821
  model_name: mistralai/mistral-7b-instruct-v0.3
2124
2822
  tokenizer_name: mistralai/Mistral-7B-Instruct-v0.3
2125
2823
  max_sequence_length: 32000
2126
2824
  client_spec:
2127
2825
  class_name: "helm.clients.together_client.TogetherChatClient"
2826
+
2128
2827
 
2129
2828
  - name: together/mixtral-8x7b-32kseqlen
2130
2829
  model_name: mistralai/mixtral-8x7b-32kseqlen
@@ -2395,6 +3094,27 @@ model_deployments:
2395
3094
  client_spec:
2396
3095
  class_name: "helm.clients.palmyra_client.PalmyraChatClient"
2397
3096
 
3097
+ - name: writer/palmyra-fin
3098
+ model_name: writer/palmyra-fin
3099
+ tokenizer_name: meta/llama-3-8b-instruct
3100
+ max_sequence_length: 128000
3101
+ client_spec:
3102
+ class_name: "helm.clients.palmyra_client.PalmyraChatClient"
3103
+
3104
+
3105
+ # xAI
3106
+
3107
+ - name: xai/grok-beta
3108
+ model_name: xai/grok-beta
3109
+ # No public information on tokenizer, so just pick an arbitrary one.
3110
+ # It shouldn't matter since the context is long.
3111
+ tokenizer_name: openai/o200k_base
3112
+ max_sequence_length: 128000
3113
+ client_spec:
3114
+ class_name: "helm.clients.openai_client.OpenAIClient"
3115
+ args:
3116
+ base_url: https://api.x.ai/v1
3117
+
2398
3118
  # Qwen
2399
3119
 
2400
3120
  - name: together/qwen-7b
@@ -2484,6 +3204,49 @@ model_deployments:
2484
3204
  client_spec:
2485
3205
  class_name: "helm.clients.together_client.TogetherChatClient"
2486
3206
 
3207
+ - name: together/qwen2.5-7b-instruct-turbo
3208
+ model_name: qwen/qwen2.5-7b-instruct-turbo
3209
+ tokenizer_name: qwen/qwen2.5-7b-instruct
3210
+ max_sequence_length: 128000
3211
+ client_spec:
3212
+ class_name: "helm.clients.together_client.TogetherChatClient"
3213
+
3214
+ - name: together/qwen2.5-72b-instruct-turbo
3215
+ model_name: qwen/qwen2.5-72b-instruct-turbo
3216
+ tokenizer_name: qwen/qwen2.5-7b-instruct
3217
+ max_sequence_length: 128000
3218
+ client_spec:
3219
+ class_name: "helm.clients.together_client.TogetherChatClient"
3220
+
3221
+ - name: huggingface/qwen2.5-7b-instruct-4bit
3222
+ model_name: qwen/qwen2.5-7b-instruct
3223
+ tokenizer_name: qwen/qwen2.5-7b-instruct
3224
+ max_sequence_length: 128000
3225
+ client_spec:
3226
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3227
+ args:
3228
+ pretrained_model_name_or_path: Qwen/Qwen2.5-7B-Instruct
3229
+ torch_dtype: "float16"
3230
+ quantization_config:
3231
+ load_in_4bit: true
3232
+ attn_implementation: "flash_attention_2"
3233
+
3234
+ - name: huggingface/qwen2.5-7b-instruct
3235
+ model_name: qwen/qwen2.5-7b-instruct
3236
+ tokenizer_name: qwen/qwen2.5-7b-instruct
3237
+ max_sequence_length: 128000
3238
+ client_spec:
3239
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3240
+ args:
3241
+ pretrained_model_name_or_path: Qwen/Qwen2.5-7B-Instruct
3242
+
3243
+ - name: together/qwq-32b-preview
3244
+ model_name: qwen/qwq-32b-preview
3245
+ tokenizer_name: qwen/qwq-32b-preview
3246
+ max_sequence_length: 32768
3247
+ client_spec:
3248
+ class_name: "helm.clients.together_client.TogetherChatClient"
3249
+
2487
3250
  - name: huggingface/qwen-vl
2488
3251
  model_name: qwen/qwen-vl
2489
3252
  tokenizer_name: qwen/qwen-vl
@@ -2498,6 +3261,34 @@ model_deployments:
2498
3261
  client_spec:
2499
3262
  class_name: "helm.clients.vision_language.qwen_vlm_client.QwenVLMClient"
2500
3263
 
3264
+ - name: huggingface/qwen2-vl-7b-instruct
3265
+ model_name: qwen/qwen2-vl-7b-instruct
3266
+ tokenizer_name: qwen/qwen-vl-chat
3267
+ max_sequence_length: 8191
3268
+ client_spec:
3269
+ class_name: "helm.clients.vision_language.qwen2_vlm_client.Qwen2VLMClient"
3270
+
3271
+ - name: huggingface/qwen2-vl-72b-instruct
3272
+ model_name: qwen/qwen2-vl-72b-instruct
3273
+ tokenizer_name: qwen/qwen-vl-chat
3274
+ max_sequence_length: 8191
3275
+ client_spec:
3276
+ class_name: "helm.clients.vision_language.qwen2_vlm_client.Qwen2VLMClient"
3277
+
3278
+ - name: huggingface/qwen-audio-chat
3279
+ model_name: qwen/qwen-audio-chat
3280
+ tokenizer_name: qwen/qwen-audio-chat
3281
+ max_sequence_length: 8191
3282
+ client_spec:
3283
+ class_name: "helm.clients.audio_language.qwen_audiolm_client.QwenAudioLMClient"
3284
+
3285
+ - name: huggingface/qwen2-audio-7b-instruct
3286
+ model_name: qwen/qwen2-audio-7b-instruct
3287
+ tokenizer_name: qwen/qwen2-audio-instruct
3288
+ max_sequence_length: 8191
3289
+ client_spec:
3290
+ class_name: "helm.clients.audio_language.qwen2_audiolm_client.Qwen2AudioLMClient"
3291
+
2501
3292
  # Reka
2502
3293
  - name: reka/reka-core
2503
3294
  model_name: reka/reka-core
@@ -2547,3 +3338,305 @@ model_deployments:
2547
3338
  max_sequence_length: 64000
2548
3339
  client_spec:
2549
3340
  class_name: "helm.clients.reka_client.RekaClient"
3341
+
3342
+ # Upstage
3343
+ - name: upstage/solar-pro-241126
3344
+ model_name: upstage/solar-pro-241126
3345
+ tokenizer_name: upstage/solar-pro-preview-instruct
3346
+ max_sequence_length: 32768
3347
+ client_spec:
3348
+ class_name: "helm.clients.upstage_client.UpstageChatClient"
3349
+
3350
+ # Diva Llama
3351
+ - name: huggingface/diva-llama
3352
+ model_name: stanford/diva-llama
3353
+ # TODO: Set the right tokenizer
3354
+ tokenizer_name: meta/llama-3-8b-instruct
3355
+ max_sequence_length: 8192
3356
+ client_spec:
3357
+ class_name: "helm.clients.audio_language.diva_llama_client.DivaLlamaClient"
3358
+
3359
+ # LLaMA-Omni
3360
+ - name: ictnlp/llama-3.1-8b-omni
3361
+ model_name: ictnlp/llama-3.1-8b-omni
3362
+ tokenizer_name: ictnlp/llama-3.1-8b-omni
3363
+ max_sequence_length: 8192
3364
+ client_spec:
3365
+ class_name: "helm.clients.audio_language.llama_omni_client.LlamaOmniAudioLMClient"
3366
+
3367
+ # IBM - Granite 3.0
3368
+ - name: huggingface/granite-3.0-2b-base
3369
+ model_name: ibm-granite/granite-3.0-2b-base
3370
+ tokenizer_name: ibm-granite/granite-3.0-2b-base
3371
+ max_sequence_length: 4096
3372
+ client_spec:
3373
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3374
+ args:
3375
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-2b-base
3376
+
3377
+ - name: huggingface/granite-3.0-2b-instruct
3378
+ model_name: ibm-granite/granite-3.0-2b-instruct
3379
+ tokenizer_name: ibm-granite/granite-3.0-2b-instruct
3380
+ max_sequence_length: 4096
3381
+ client_spec:
3382
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3383
+ args:
3384
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-2b-instruct
3385
+
3386
+ - name: huggingface/granite-3.0-8b-instruct
3387
+ model_name: ibm-granite/granite-3.0-8b-instruct
3388
+ tokenizer_name: ibm-granite/granite-3.0-8b-instruct
3389
+ max_sequence_length: 4096
3390
+ client_spec:
3391
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3392
+ args:
3393
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-8b-instruct
3394
+
3395
+ - name: huggingface/granite-3.0-8b-base
3396
+ model_name: ibm-granite/granite-3.0-8b-base
3397
+ tokenizer_name: ibm-granite/granite-3.0-8b-base
3398
+ max_sequence_length: 4096
3399
+ client_spec:
3400
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3401
+ args:
3402
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-8b-base
3403
+
3404
+ - name: huggingface/granite-3.0-3b-a800m-instruct
3405
+ model_name: ibm-granite/granite-3.0-3b-a800m-instruct
3406
+ tokenizer_name: ibm-granite/granite-3.0-3b-a800m-instruct
3407
+ max_sequence_length: 4096
3408
+ client_spec:
3409
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3410
+ args:
3411
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-3b-a800m-instruct
3412
+
3413
+ - name: huggingface/granite-3.0-3b-a800m-base
3414
+ model_name: ibm-granite/granite-3.0-3b-a800m-base
3415
+ tokenizer_name: ibm-granite/granite-3.0-3b-a800m-base
3416
+ max_sequence_length: 4096
3417
+ client_spec:
3418
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3419
+ args:
3420
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-3b-a800m-base
3421
+
3422
+ - name: huggingface/granite-3.0-1b-a400m-instruct
3423
+ model_name: ibm-granite/granite-3.0-1b-a400m-instruct
3424
+ tokenizer_name: ibm-granite/granite-3.0-1b-a400m-instruct
3425
+ max_sequence_length: 4096
3426
+ client_spec:
3427
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3428
+ args:
3429
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-instruct
3430
+
3431
+ - name: huggingface/granite-3.0-1b-a400m-base
3432
+ model_name: ibm-granite/granite-3.0-1b-a400m-base
3433
+ tokenizer_name: ibm-granite/granite-3.0-1b-a400m-base
3434
+ max_sequence_length: 4096
3435
+ client_spec:
3436
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3437
+ args:
3438
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base
3439
+
3440
+ - name: huggingface/sabia-7b
3441
+ model_name: maritaca-ai/sabia-7b
3442
+ tokenizer_name: maritaca-ai/sabia-7b
3443
+ max_sequence_length: 2048
3444
+ client_spec:
3445
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3446
+ args:
3447
+ pretrained_model_name_or_path: maritaca-ai/sabia-7b
3448
+
3449
+ # Granite-3.1-8b-base
3450
+ - name: huggingface/granite-3.1-8b-base
3451
+ model_name: ibm-granite/granite-3.1-8b-base
3452
+ tokenizer_name: ibm-granite/granite-3.1-8b-base
3453
+ max_sequence_length: 128000
3454
+ client_spec:
3455
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3456
+ args:
3457
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-8b-base
3458
+
3459
+ # Granite-3.1-8b-instruct
3460
+ - name: huggingface/granite-3.1-8b-instruct
3461
+ model_name: ibm-granite/granite-3.1-8b-instruct
3462
+ tokenizer_name: ibm-granite/granite-3.1-8b-instruct
3463
+ max_sequence_length: 128000
3464
+ client_spec:
3465
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3466
+ args:
3467
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-8b-instruct
3468
+
3469
+ # Granite-3.1-2b-instruct
3470
+ - name: huggingface/granite-3.1-2b-instruct
3471
+ model_name: ibm-granite/granite-3.1-2b-instruct
3472
+ tokenizer_name: ibm-granite/granite-3.1-2b-instruct
3473
+ max_sequence_length: 128000
3474
+ client_spec:
3475
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3476
+ args:
3477
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-2b-instruct
3478
+
3479
+ # Granite-3.1-2b-base
3480
+ - name: huggingface/granite-3.1-2b-base
3481
+ model_name: ibm-granite/granite-3.1-2b-base
3482
+ tokenizer_name: ibm-granite/granite-3.1-2b-base
3483
+ max_sequence_length: 128000
3484
+ client_spec:
3485
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3486
+ args:
3487
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-2b-base
3488
+
3489
+ # Granite-3.1-3b-a800m-instruct
3490
+ - name: huggingface/granite-3.1-3b-a800m-instruct
3491
+ model_name: ibm-granite/granite-3.1-3b-a800m-instruct
3492
+ tokenizer_name: ibm-granite/granite-3.1-3b-a800m-instruct
3493
+ max_sequence_length: 128000
3494
+ client_spec:
3495
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3496
+ args:
3497
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-3b-a800m-instruct
3498
+
3499
+ # Granite-3.1-3b-a800m-base
3500
+ - name: huggingface/granite-3.1-3b-a800m-base
3501
+ model_name: ibm-granite/granite-3.1-3b-a800m-base
3502
+ tokenizer_name: ibm-granite/granite-3.1-3b-a800m-base
3503
+ max_sequence_length: 128000
3504
+ client_spec:
3505
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3506
+ args:
3507
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-3b-a800m-base
3508
+
3509
+ # Granite-3.1-1b-a400m-instruct
3510
+ - name: huggingface/granite-3.1-1b-a400m-instruct
3511
+ model_name: ibm-granite/granite-3.1-1b-a400m-instruct
3512
+ tokenizer_name: ibm-granite/granite-3.1-1b-a400m-instruct
3513
+ max_sequence_length: 128000
3514
+ client_spec:
3515
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3516
+ args:
3517
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-1b-a400m-instruct
3518
+
3519
+ # Granite-3.1-1b-a400m-base
3520
+ - name: huggingface/granite-3.1-1b-a400m-base
3521
+ model_name: ibm-granite/granite-3.1-1b-a400m-base
3522
+ tokenizer_name: ibm-granite/granite-3.1-1b-a400m-base
3523
+ max_sequence_length: 128000
3524
+ client_spec:
3525
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3526
+ args:
3527
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-1b-a400m-base
3528
+
3529
+ # DeepSeek-R1-Distill-Llama-3.1-8b
3530
+ - name: huggingface/DeepSeek-R1-Distill-Llama-8B
3531
+ model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
3532
+ tokenizer_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
3533
+ max_sequence_length: 128000
3534
+ client_spec:
3535
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3536
+ args:
3537
+ pretrained_model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
3538
+
3539
+ # deepseek-ai/deepseek-coder-6.7b-instruct
3540
+ - name: huggingface/deepseek-coder-6.7b-instruct
3541
+ model_name: deepseek-ai/deepseek-coder-6.7b-instruct
3542
+ tokenizer_name: deepseek-ai/deepseek-coder-6.7b-instruct
3543
+ max_sequence_length: 128000
3544
+ client_spec:
3545
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3546
+ args:
3547
+ pretrained_model_name_or_path: deepseek-ai/deepseek-coder-6.7b-instruct
3548
+
3549
+ # IBM WatsonX
3550
+ - name: ibm/llama-3.3-70b-instruct
3551
+ model_name: meta/llama-3.3-70b-instruct
3552
+ tokenizer_name: meta/llama-3.3-70b-instruct
3553
+ max_sequence_length: 128000
3554
+ client_spec:
3555
+ class_name: "helm.clients.ibm_client.IbmChatClient"
3556
+ args:
3557
+ watsonx_model_name: meta-llama/llama-3-3-70b-instruct
3558
+ region: Dallas
3559
+
3560
+ - name: ibm/granite-3-2b-instruct
3561
+ model_name: ibm/granite-3.1-2b-instruct
3562
+ tokenizer_name: ibm-granite/granite-3.1-2b-instruct
3563
+ max_sequence_length: 128000
3564
+ client_spec:
3565
+ class_name: "helm.clients.ibm_client.IbmChatClient"
3566
+ args:
3567
+ watsonx_model_name: ibm/granite-3-2b-instruct
3568
+ region: Dallas
3569
+
3570
+
3571
+ #
3572
+ - name: ibm/granite-3-8b-instruct
3573
+ model_name: ibm/granite-3.1-8b-instruct
3574
+ tokenizer_name: ibm-granite/granite-3.1-8b-instruct
3575
+ max_sequence_length: 128000
3576
+ client_spec:
3577
+ class_name: "helm.clients.ibm_client.IbmChatClient"
3578
+ args:
3579
+ watsonx_model_name: ibm/granite-3-8b-instruct
3580
+ region: Dallas
3581
+ #
3582
+
3583
+ #
3584
+ - name: ibm/granite-13b-instruct-v2
3585
+ model_name: ibm/granite-13b-instruct-v2
3586
+ tokenizer_name: EleutherAI/gpt-neox-20b
3587
+ max_sequence_length: 128000
3588
+ client_spec:
3589
+ class_name: "helm.clients.ibm_client.IbmTextClient"
3590
+ args:
3591
+ watsonx_model_name: ibm/granite-13b-instruct-v2
3592
+ region: Dallas
3593
+ #
3594
+ - name: ibm/granite-20b-code-instruct-8k
3595
+ model_name: ibm/granite-20b-code-instruct-8k
3596
+ tokenizer_name: ibm-granite/granite-20b-code-instruct-8k
3597
+ max_sequence_length: 128000
3598
+ client_spec:
3599
+ class_name: "helm.clients.ibm_client.IbmChatClient"
3600
+ args:
3601
+ watsonx_model_name: ibm/granite-20b-code-instruct
3602
+ region: Dallas
3603
+ #
3604
+ - name: ibm/granite-34b-code-instruct
3605
+ model_name: ibm/granite-34b-code-instruct
3606
+ tokenizer_name: ibm-granite/granite-34b-code-instruct-8k
3607
+ max_sequence_length: 128000
3608
+ client_spec:
3609
+ class_name: "helm.clients.ibm_client.IbmChatClient"
3610
+ args:
3611
+ watsonx_model_name: ibm/granite-34b-code-instruct
3612
+ region: Dallas
3613
+ #
3614
+ - name: ibm/granite-3b-code-instruct
3615
+ model_name: ibm/granite-3b-code-instruct
3616
+ tokenizer_name: ibm-granite/granite-3b-code-instruct-128k
3617
+ max_sequence_length: 128000
3618
+ client_spec:
3619
+ class_name: "helm.clients.ibm_client.IbmTextClient"
3620
+ args:
3621
+ watsonx_model_name: ibm/granite-3b-code-instruct
3622
+ region: Dallas
3623
+ #
3624
+ - name: ibm/granite-8b-code-instruct
3625
+ model_name: ibm/granite-8b-code-instruct
3626
+ tokenizer_name: ibm-granite/granite-8b-code-instruct-128k
3627
+ max_sequence_length: 128000
3628
+ client_spec:
3629
+ class_name: "helm.clients.ibm_client.IbmTextClient"
3630
+ args:
3631
+ watsonx_model_name: ibm/granite-8b-code-instruct
3632
+ region: Dallas
3633
+
3634
+ - name: ibm/mixtral-8x7b-instruct-v0:1
3635
+ model_name: mistralai/mixtral-8x7b-instruct-v0:1
3636
+ tokenizer_name: huggingface/gpt2
3637
+ max_sequence_length: 4000
3638
+ client_spec:
3639
+ class_name: "helm.clients.ibm_client.IbmChatClient"
3640
+ args:
3641
+ watsonx_model_name: mistralai/mixtral-8x7b-instruct-v01
3642
+ region: Dallas