crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. See the registry advisory page for more details.

Files changed (606)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/call_center_annotator.py +22 -11
  36. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  37. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  38. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  39. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  40. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  41. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  42. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  43. helm/benchmark/annotation/live_qa_annotator.py +10 -5
  44. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  45. helm/benchmark/annotation/medalign_annotator.py +100 -0
  46. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  47. helm/benchmark/annotation/medication_qa_annotator.py +90 -61
  48. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  49. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  50. helm/benchmark/annotation/model_as_judge.py +281 -18
  51. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  52. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  54. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  55. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  56. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  57. helm/benchmark/annotation/spider_annotator.py +18 -0
  58. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  59. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  60. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  61. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  62. helm/benchmark/annotation/xstest_annotator.py +20 -30
  63. helm/benchmark/annotation_executor.py +35 -15
  64. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  65. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  66. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  67. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  68. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  69. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  70. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  71. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  72. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  73. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  74. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  75. helm/benchmark/augmentations/perturbation.py +1 -1
  76. helm/benchmark/augmentations/space_perturbation.py +2 -2
  77. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  78. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  79. helm/benchmark/augmentations/test_perturbation.py +16 -13
  80. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  81. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  82. helm/benchmark/data_preprocessor.py +2 -2
  83. helm/benchmark/huggingface_registration.py +2 -7
  84. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  85. helm/benchmark/metrics/basic_metrics.py +6 -6
  86. helm/benchmark/metrics/bbq_metrics.py +2 -2
  87. helm/benchmark/metrics/bias_metrics.py +12 -3
  88. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  89. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  90. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  91. helm/benchmark/metrics/classification_metrics.py +76 -12
  92. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  93. helm/benchmark/metrics/code_metrics.py +5 -5
  94. helm/benchmark/metrics/comet_metric.py +125 -0
  95. helm/benchmark/metrics/common_metric_specs.py +9 -2
  96. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  97. helm/benchmark/metrics/copyright_metrics.py +4 -4
  98. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  99. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  101. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  102. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  103. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  104. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  105. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  106. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  107. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  108. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  109. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  110. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  113. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  115. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  116. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  117. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  118. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  120. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  122. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  123. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  124. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  125. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  126. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  127. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  128. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  129. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  130. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  131. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  132. helm/benchmark/metrics/medalign_metrics.py +34 -0
  133. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  134. helm/benchmark/metrics/medec_metrics.py +101 -0
  135. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  136. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  137. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  138. helm/benchmark/metrics/metric.py +3 -3
  139. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  140. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  141. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  142. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  143. helm/benchmark/metrics/nltk_helper.py +32 -0
  144. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  145. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  146. helm/benchmark/metrics/output_processing_metric.py +60 -0
  147. helm/benchmark/metrics/output_processors.py +15 -0
  148. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  149. helm/benchmark/metrics/ranking_metrics.py +3 -3
  150. helm/benchmark/metrics/reference_metric.py +3 -3
  151. helm/benchmark/metrics/safety_metrics.py +39 -17
  152. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  153. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  154. helm/benchmark/metrics/spider_metrics.py +7 -0
  155. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  156. helm/benchmark/metrics/statistic.py +1 -1
  157. helm/benchmark/metrics/summac/model_summac.py +1 -1
  158. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  159. helm/benchmark/metrics/summarization_metrics.py +19 -9
  160. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  161. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  162. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  163. helm/benchmark/metrics/test_metric.py +1 -1
  164. helm/benchmark/metrics/test_statistic.py +2 -2
  165. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  167. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  169. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  170. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  171. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  172. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  173. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  174. helm/benchmark/metrics/unitxt_metrics.py +21 -4
  175. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  176. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  177. helm/benchmark/model_metadata_registry.py +16 -0
  178. helm/benchmark/presentation/create_plots.py +1 -1
  179. helm/benchmark/presentation/schema.py +3 -0
  180. helm/benchmark/presentation/summarize.py +119 -256
  181. helm/benchmark/presentation/test_summarize.py +145 -3
  182. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  183. helm/benchmark/reeval_run.py +203 -0
  184. helm/benchmark/reeval_runner.py +355 -0
  185. helm/benchmark/run.py +8 -17
  186. helm/benchmark/run_expander.py +105 -8
  187. helm/benchmark/run_spec_factory.py +12 -0
  188. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  189. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  190. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  191. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  192. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  193. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  194. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  195. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  196. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  197. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  198. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  199. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  200. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  201. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  202. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  203. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  204. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
  205. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  206. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  207. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  208. helm/benchmark/run_specs/vlm_run_specs.py +83 -5
  209. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  210. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  211. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  212. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  213. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  214. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  215. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  216. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  217. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  218. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  219. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  220. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  221. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  222. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  223. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  224. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  225. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  226. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  227. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  228. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  229. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  230. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  231. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  232. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  233. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  234. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  235. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  236. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  237. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  238. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  239. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  240. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  241. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  242. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  243. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  244. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  245. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  246. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  247. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  248. helm/benchmark/scenarios/bold_scenario.py +1 -1
  249. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  250. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  251. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  252. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  253. helm/benchmark/scenarios/clear_scenario.py +153 -0
  254. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  255. helm/benchmark/scenarios/code_scenario.py +17 -4
  256. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  257. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  258. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  259. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  260. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  261. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  262. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  263. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  264. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  265. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  266. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  267. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  268. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  269. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  270. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  271. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  272. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  273. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  274. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  275. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  276. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  277. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  278. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  279. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  280. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  281. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  282. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  283. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  284. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  285. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  286. helm/benchmark/scenarios/ice_scenario.py +8 -4
  287. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  288. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  289. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  290. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  291. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  292. helm/benchmark/scenarios/koala_scenario.py +1 -1
  293. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  294. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  295. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  296. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  297. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  298. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  299. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  300. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  301. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  302. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  303. helm/benchmark/scenarios/math_scenario.py +9 -1
  304. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  305. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  306. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  307. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  308. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  309. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  310. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  311. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  312. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  313. helm/benchmark/scenarios/medec_scenario.py +120 -0
  314. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  315. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  316. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  317. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  318. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  319. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  320. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  321. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  322. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  323. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  324. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  325. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  326. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  327. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  328. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  329. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  330. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  331. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  332. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  333. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  334. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  335. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  336. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  337. helm/benchmark/scenarios/quac_scenario.py +10 -1
  338. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  339. helm/benchmark/scenarios/raft_scenario.py +18 -3
  340. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  341. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  342. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  343. helm/benchmark/scenarios/scenario.py +9 -1
  344. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
  345. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  346. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  347. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  348. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  349. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  350. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  351. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  352. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  353. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  354. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  355. helm/benchmark/scenarios/spider_scenario.py +91 -0
  356. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  357. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  358. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  359. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  360. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  361. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  362. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  363. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  364. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  366. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  367. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  368. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  369. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  370. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  371. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  372. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  373. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  374. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  375. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  376. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  377. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  378. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  379. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  380. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  381. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  382. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  383. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  384. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  385. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  386. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  387. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  388. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  389. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  390. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  391. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  392. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  393. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  394. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  395. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  396. helm/benchmark/server.py +11 -0
  397. helm/benchmark/slurm_runner.py +1 -1
  398. helm/benchmark/static/schema_audio.yaml +752 -0
  399. helm/benchmark/static/schema_autobencher.yaml +150 -0
  400. helm/benchmark/static/schema_call_center.yaml +97 -60
  401. helm/benchmark/static/schema_capabilities.yaml +254 -0
  402. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  403. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  404. helm/benchmark/static/schema_enterprise.yaml +298 -0
  405. helm/benchmark/static/schema_finance.yaml +14 -12
  406. helm/benchmark/static/schema_heim.yaml +1389 -0
  407. helm/benchmark/static/schema_legal.yaml +566 -0
  408. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  409. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  410. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  411. helm/benchmark/static/schema_safety.yaml +42 -6
  412. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
  413. helm/benchmark/static/schema_social_audio.yaml +224 -0
  414. helm/benchmark/static/schema_sql.yaml +171 -0
  415. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
  416. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  417. helm/benchmark/static/schema_vhelm.yaml +151 -47
  418. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  419. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  420. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  421. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  422. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  423. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  424. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  425. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  426. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  427. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  428. helm/benchmark/static_build/config.js +1 -1
  429. helm/benchmark/static_build/index.html +5 -5
  430. helm/benchmark/window_services/default_window_service.py +1 -1
  431. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  432. helm/benchmark/window_services/ice_window_service.py +1 -1
  433. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  434. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  435. helm/benchmark/window_services/local_window_service.py +2 -2
  436. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  437. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  438. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  439. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  440. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  441. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  442. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  443. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  444. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  445. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  446. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  447. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  448. helm/benchmark/window_services/test_utils.py +1 -1
  449. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  450. helm/benchmark/window_services/tokenizer_service.py +0 -5
  451. helm/benchmark/window_services/yalm_window_service.py +1 -1
  452. helm/clients/ai21_client.py +3 -3
  453. helm/clients/aleph_alpha_client.py +1 -1
  454. helm/clients/audio_language/__init__.py +0 -0
  455. helm/clients/audio_language/diva_llama_client.py +118 -0
  456. helm/clients/audio_language/llama_omni_client.py +198 -0
  457. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  458. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  459. helm/clients/auto_client.py +4 -2
  460. helm/clients/azure_openai_client.py +55 -0
  461. helm/clients/bedrock_client.py +201 -7
  462. helm/clients/bedrock_utils.py +33 -0
  463. helm/clients/clip_scorers/clip_scorer.py +1 -1
  464. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  465. helm/clients/cohere_client.py +3 -3
  466. helm/clients/google_client.py +1 -1
  467. helm/clients/http_model_client.py +1 -1
  468. helm/clients/huggingface_client.py +10 -18
  469. helm/clients/ibm_client.py +267 -0
  470. helm/clients/image_generation/adobe_vision_client.py +1 -1
  471. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  472. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  473. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  474. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  475. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  476. helm/clients/image_generation/cogview2_client.py +1 -1
  477. helm/clients/image_generation/dalle2_client.py +1 -1
  478. helm/clients/image_generation/dalle3_client.py +2 -2
  479. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  480. helm/clients/image_generation/dalle_mini/data.py +1 -1
  481. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  482. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  483. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  484. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  485. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  486. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  487. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  488. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  489. helm/clients/image_generation/dalle_mini_client.py +1 -1
  490. helm/clients/image_generation/deep_floyd_client.py +1 -1
  491. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  492. helm/clients/image_generation/lexica_client.py +1 -1
  493. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  494. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  495. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  496. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  497. helm/clients/image_generation/mindalle_client.py +1 -1
  498. helm/clients/image_generation/together_image_generation_client.py +1 -1
  499. helm/clients/lit_gpt_client.py +2 -2
  500. helm/clients/mistral_client.py +62 -18
  501. helm/clients/nvidia_nim_client.py +0 -3
  502. helm/clients/openai_client.py +255 -21
  503. helm/clients/palmyra_client.py +2 -6
  504. helm/clients/reka_client.py +1 -1
  505. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  506. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  507. helm/clients/stanfordhealthcare_google_client.py +43 -0
  508. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  509. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  510. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  511. helm/clients/test_client.py +1 -1
  512. helm/clients/test_together_client.py +6 -1
  513. helm/clients/together_client.py +69 -7
  514. helm/clients/upstage_client.py +23 -0
  515. helm/clients/vertexai_client.py +39 -13
  516. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  517. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  518. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  519. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  520. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  521. helm/clients/vllm_client.py +4 -6
  522. helm/clients/yi_client.py +0 -3
  523. helm/common/audio_utils.py +111 -0
  524. helm/common/cache.py +8 -30
  525. helm/common/file_caches/local_file_cache.py +1 -1
  526. helm/common/file_caches/test_local_file_cache.py +1 -1
  527. helm/common/images_utils.py +2 -2
  528. helm/common/key_value_store.py +9 -9
  529. helm/common/media_object.py +2 -2
  530. helm/common/mongo_key_value_store.py +3 -3
  531. helm/common/multimodal_request_utils.py +26 -0
  532. helm/common/reeval_parameters.py +12 -0
  533. helm/common/request.py +6 -2
  534. helm/common/response_format.py +18 -0
  535. helm/common/test_cache.py +1 -48
  536. helm/common/test_media_object.py +1 -1
  537. helm/common/tokenization_request.py +0 -9
  538. helm/config/model_deployments.yaml +1258 -33
  539. helm/config/model_metadata.yaml +1110 -41
  540. helm/config/tokenizer_configs.yaml +403 -3
  541. helm/proxy/cli.py +2 -2
  542. helm/proxy/example_queries.py +1 -1
  543. helm/proxy/server.py +11 -13
  544. helm/proxy/services/remote_service.py +1 -7
  545. helm/proxy/services/server_service.py +6 -19
  546. helm/proxy/services/service.py +0 -6
  547. helm/proxy/services/test_remote_service.py +2 -2
  548. helm/proxy/services/test_service.py +1 -1
  549. helm/proxy/static/general.js +122 -0
  550. helm/proxy/static/help.html +99 -0
  551. helm/proxy/static/index.css +57 -0
  552. helm/proxy/static/index.html +40 -0
  553. helm/proxy/static/index.js +456 -0
  554. helm/proxy/static/info-icon.png +0 -0
  555. helm/proxy/test_retry.py +1 -1
  556. helm/proxy/token_counters/auto_token_counter.py +1 -1
  557. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  558. helm/tokenizers/caching_tokenizer.py +2 -30
  559. helm/tokenizers/http_model_tokenizer.py +1 -1
  560. helm/tokenizers/huggingface_tokenizer.py +2 -2
  561. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  562. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  563. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  564. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  565. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  566. helm/tokenizers/tokenizer.py +3 -1
  567. helm/tokenizers/yalm_tokenizer.py +3 -3
  568. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  569. crfm_helm-0.5.3.dist-info/METADATA +0 -355
  570. crfm_helm-0.5.3.dist-info/RECORD +0 -699
  571. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  572. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  573. helm/benchmark/data_overlap/light_scenario.py +0 -60
  574. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  575. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  576. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  577. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  578. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  579. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  580. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  581. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  582. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  583. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  584. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  585. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  586. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  587. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  588. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  589. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  590. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  591. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  592. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  593. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  594. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  595. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  596. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  597. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  598. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  599. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  600. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  601. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  602. helm/tokenizers/anthropic_tokenizer.py +0 -52
  603. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  604. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  605. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
  606. /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
@@ -16,6 +16,161 @@ model_deployments:
16
16
  client_spec:
17
17
  class_name: "helm.clients.simple_client.SimpleClient"
18
18
 
19
+ # Stanford Health Care
20
+ # Placed earlier in the file to make them non-default
21
+ - name: stanfordhealthcare/claude-3-5-sonnet-20241022
22
+ model_name: anthropic/claude-3-5-sonnet-20241022
23
+ tokenizer_name: anthropic/claude
24
+ max_sequence_length: 200000
25
+ client_spec:
26
+ class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
27
+ args:
28
+ model: anthropic.claude-3-5-sonnet-20241022-v2:0
29
+ deployment: Claude35Sonnetv2/awssig4fa
30
+
31
+ - name: stanfordhealthcare/claude-3-7-sonnet-20250219
32
+ model_name: anthropic/claude-3-7-sonnet-20250219
33
+ tokenizer_name: anthropic/claude
34
+ max_sequence_length: 200000
35
+ client_spec:
36
+ class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
37
+ args:
38
+ model: arn:aws:bedrock:us-west-2:679683451337:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0
39
+ deployment: awssig4claude37/aswsig4claude37
40
+
41
+ - name: stanfordhealthcare/gemini-1.5-pro-001
42
+ model_name: google/gemini-1.5-pro-001
43
+ tokenizer_name: google/gemma-2b
44
+ max_sequence_length: 1000000
45
+ client_spec:
46
+ class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
47
+ args:
48
+ deployment: gcpgemini/apim-gcp-oauth-fa
49
+
50
+ - name: stanfordhealthcare/gemini-2.0-flash-001
51
+ model_name: google/gemini-2.0-flash-001
52
+ tokenizer_name: google/gemma-2b
53
+ max_sequence_length: 1000000
54
+ client_spec:
55
+ class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
56
+ args:
57
+ deployment: gcp-gem20flash-fa/apim-gcp-gem20flash-fa
58
+
59
+ - name: stanfordhealthcare/gpt-4o-mini-2024-07-18
60
+ model_name: openai/gpt-4o-mini-2024-07-18
61
+ tokenizer_name: openai/o200k_base
62
+ max_sequence_length: 128000
63
+ client_spec:
64
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
65
+ args:
66
+ openai_model_name: gpt-4o-mini
67
+ api_version: 2023-05-15
68
+
69
+ - name: stanfordhealthcare/gpt-4o-2024-05-13
70
+ model_name: openai/gpt-4o-2024-05-13
71
+ tokenizer_name: openai/o200k_base
72
+ max_sequence_length: 128000
73
+ client_spec:
74
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
75
+ args:
76
+ openai_model_name: gpt-4o
77
+ api_version: 2023-05-15
78
+
79
+ - name: stanfordhealthcare/gpt-4-0613
80
+ model_name: openai/gpt-4-0613
81
+ tokenizer_name: openai/o200k_base
82
+ max_sequence_length: 8192
83
+ client_spec:
84
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
85
+ args:
86
+ openai_model_name: gpt-4
87
+ api_version: 2023-05-15
88
+
89
+ - name: stanfordhealthcare/gpt-4-turbo-2024-04-09
90
+ model_name: openai/gpt-4-turbo-2024-04-09
91
+ tokenizer_name: openai/cl100k_base
92
+ max_sequence_length: 128000
93
+ client_spec:
94
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
95
+ args:
96
+ openai_model_name: gpt-4-turbo
97
+ api_version: 2023-05-15
98
+
99
+ - name: stanfordhealthcare/o3-mini-2025-01-31
100
+ model_name: openai/o3-mini-2025-01-31
101
+ tokenizer_name: openai/cl100k_base
102
+ max_sequence_length: 200000
103
+ client_spec:
104
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
105
+ args:
106
+ openai_model_name: o3-mini
107
+ api_version: 2024-12-01-preview
108
+ base_url: "{endpoint}/openai-eastus2"
109
+
110
+ - name: stanfordhealthcare/o1-2024-12-17
111
+ model_name: openai/o1-2024-12-17
112
+ tokenizer_name: openai/cl100k_base
113
+ max_sequence_length: 128000
114
+ client_spec:
115
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
116
+ args:
117
+ openai_model_name: o1
118
+ api_version: 2024-12-01-preview
119
+ base_url: "{endpoint}/openai-eastus2"
120
+
121
+ - name: stanfordhealthcare/deepseek-r1
122
+ model_name: deepseek-ai/deepseek-r1
123
+ tokenizer_name: deepseek-ai/deepseek-r1
124
+ max_sequence_length: 128000
125
+ client_spec:
126
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
127
+ args:
128
+ openai_model_name: deepseek-chat
129
+ output_processor: helm.benchmark.metrics.output_processors.remove_deepseek_r1_thinking
130
+ base_url: "{endpoint}/deepseekr1/v1"
131
+
132
+ - name: stanfordhealthcare/llama-3.3-70b-instruct
133
+ model_name: meta/llama-3.3-70b-instruct
134
+ tokenizer_name: meta/llama-3.3-70b-instruct
135
+ max_sequence_length: 128000
136
+ client_spec:
137
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
138
+ args:
139
+ base_url: "{endpoint}/llama3370b/v1"
140
+
141
+ - name: stanfordhealthcare/phi-3.5-mini-instruct
142
+ model_name: microsoft/phi-3.5-mini-instruct
143
+ tokenizer_name: microsoft/phi-3.5-mini-instruct
144
+ max_sequence_length: 131072
145
+ client_spec:
146
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
147
+ args:
148
+ base_url: "{endpoint}/phi35mi/v1"
149
+
150
+ - name: stanfordhealthcare_shc/gpt-4o-2024-05-13
151
+ model_name: openai/gpt-4o-2024-05-13
152
+ tokenizer_name: openai/o200k_base
153
+ max_sequence_length: 128000
154
+ client_spec:
155
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
156
+ deployment: gpt-4o
157
+
158
+ - name: stanfordhealthcare_shc/gpt-4o-mini-2024-07-18
159
+ model_name: openai/gpt-4o-mini-2024-07-18
160
+ tokenizer_name: openai/o200k_base
161
+ max_sequence_length: 128000
162
+ client_spec:
163
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
164
+ deployment: gpt-4o-mini
165
+
166
+ - name: stanfordhealthcare_shc/gpt-4-turbo-2024-04-09
167
+ model_name: openai/gpt-4-turbo-2024-04-09
168
+ tokenizer_name: openai/cl100k_base
169
+ max_sequence_length: 128000
170
+ client_spec:
171
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
172
+ deployment: gpt-4-turbo-2024-04-09
173
+
19
174
  # Adobe
20
175
  - name: adobe/giga-gan
21
176
  model_name: adobe/giga-gan
@@ -104,7 +259,31 @@ model_deployments:
104
259
  class_name: "helm.benchmark.window_services.image_generation.clip_window_service.CLIPWindowService"
105
260
 
106
261
 
107
- # Amazon
262
+ # Amazon nova models
263
+
264
+ - name: amazon/nova-pro-v1:0
265
+ model_name: amazon/nova-pro-v1:0
266
+ tokenizer_name: huggingface/gpt2
267
+ max_sequence_length: 300000
268
+ client_spec:
269
+ class_name: "helm.clients.bedrock_client.BedrockNovaClient"
270
+
271
+ - name: amazon/nova-lite-v1:0
272
+ model_name: amazon/nova-lite-v1:0
273
+ tokenizer_name: huggingface/gpt2
274
+ max_sequence_length: 300000
275
+ client_spec:
276
+ class_name: "helm.clients.bedrock_client.BedrockNovaClient"
277
+
278
+ - name: amazon/nova-micro-v1:0
279
+ model_name: amazon/nova-micro-v1:0
280
+ tokenizer_name: huggingface/gpt2
281
+ max_sequence_length: 128000
282
+ client_spec:
283
+ class_name: "helm.clients.bedrock_client.BedrockNovaClient"
284
+
285
+ # Titan on Amazon Bedrock
286
+
108
287
  - name: amazon/titan-text-lite-v1
109
288
  model_name: amazon/titan-text-lite-v1
110
289
  tokenizer_name: huggingface/gpt2
@@ -112,20 +291,87 @@ model_deployments:
112
291
  client_spec:
113
292
  class_name: "helm.clients.bedrock_client.BedrockTitanClient"
114
293
 
115
- - name: amazon/titan-tg1-large
116
- model_name: amazon/titan-tg1-large
294
+ - name: amazon/titan-text-express-v1
295
+ model_name: amazon/titan-text-express-v1
117
296
  tokenizer_name: huggingface/gpt2
118
297
  max_sequence_length: 8000
119
298
  client_spec:
120
299
  class_name: "helm.clients.bedrock_client.BedrockTitanClient"
300
+
301
+ # Mistral on Amazon Bedrock
121
302
 
122
- - name: amazon/titan-text-express-v1
123
- model_name: amazon/titan-text-express-v1
303
+ - name: amazon/mistral-7b-instruct-v0:2
304
+ model_name: mistralai/amazon-mistral-7b-instruct-v0:2
124
305
  tokenizer_name: huggingface/gpt2
125
306
  max_sequence_length: 8000
126
307
  client_spec:
127
- class_name: "helm.clients.bedrock_client.BedrockTitanClient"
308
+ class_name: "helm.clients.bedrock_client.BedrockMistralClient"
309
+
310
+ - name: amazon/mixtral-8x7b-instruct-v0:1
311
+ model_name: mistralai/amazon-mixtral-8x7b-instruct-v0:1
312
+ tokenizer_name: huggingface/gpt2
313
+ max_sequence_length: 4000
314
+ client_spec:
315
+ class_name: "helm.clients.bedrock_client.BedrockMistralClient"
316
+
317
+ - name: amazon/mistral-large-2402-v1:0
318
+ model_name: mistralai/amazon-mistral-large-2402-v1:0
319
+ tokenizer_name: huggingface/gpt2
320
+ max_sequence_length: 8000
321
+ client_spec:
322
+ class_name: "helm.clients.bedrock_client.BedrockMistralClient"
323
+
324
+ - name: amazon/mistral-small-2402-v1:0
325
+ model_name: mistralai/amazon-mistral-small-2402-v1:0
326
+ tokenizer_name: huggingface/gpt2
327
+ max_sequence_length: 8000
328
+ client_spec:
329
+ class_name: "helm.clients.bedrock_client.BedrockMistralClient"
128
330
 
331
+ - name: amazon/mistral-large-2407-v1:0
332
+ model_name: mistralai/amazon-mistral-large-2407-v1:0
333
+ tokenizer_name: huggingface/gpt2
334
+ max_sequence_length: 8000
335
+ client_spec:
336
+ class_name: "helm.clients.bedrock_client.BedrockMistralClient"
337
+
338
+ # Llama 3 on Amazon Bedrock
339
+
340
+ - name: amazon/llama3-8b-instruct-v1:0
341
+ model_name: meta/amazon-llama3-8b-instruct-v1:0
342
+ tokenizer_name: huggingface/gpt2
343
+ max_sequence_length: 2000
344
+ client_spec:
345
+ class_name: "helm.clients.bedrock_client.BedrockLlamaClient"
346
+
347
+ - name: amazon/llama3-70b-instruct-v1:0
348
+ model_name: meta/amazon-llama3-70b-instruct-v1:0
349
+ tokenizer_name: huggingface/gpt2
350
+ max_sequence_length: 2000
351
+ client_spec:
352
+ class_name: "helm.clients.bedrock_client.BedrockLlamaClient"
353
+
354
+ - name: amazon/llama3-1-405b-instruct-v1:0
355
+ model_name: meta/amazon-llama3-1-405b-instruct-v1:0
356
+ tokenizer_name: huggingface/gpt2
357
+ max_sequence_length: 2000
358
+ client_spec:
359
+ class_name: "helm.clients.bedrock_client.BedrockLlamaClient"
360
+
361
+ - name: amazon/llama3-1-70b-instruct-v1:0
362
+ model_name: meta/amazon-llama3-1-70b-instruct-v1:0
363
+ tokenizer_name: huggingface/gpt2
364
+ max_sequence_length: 2000
365
+ client_spec:
366
+ class_name: "helm.clients.bedrock_client.BedrockLlamaClient"
367
+
368
+
369
+ - name: amazon/llama3-1-8b-instruct-v1:0
370
+ model_name: meta/amazon-llama3-1-8b-instruct-v1:0
371
+ tokenizer_name: huggingface/gpt2
372
+ max_sequence_length: 2000
373
+ client_spec:
374
+ class_name: "helm.clients.bedrock_client.BedrockLlamaClient"
129
375
 
130
376
  # Anthropic
131
377
  - name: anthropic/claude-v1.3
@@ -189,6 +435,13 @@ model_deployments:
189
435
  client_spec:
190
436
  class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
191
437
 
438
+ - name: anthropic/claude-3-5-haiku-20241022
439
+ model_name: anthropic/claude-3-5-haiku-20241022
440
+ tokenizer_name: anthropic/claude
441
+ max_sequence_length: 200000
442
+ client_spec:
443
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
444
+
192
445
  - name: anthropic/claude-3-5-sonnet-20240620
193
446
  model_name: anthropic/claude-3-5-sonnet-20240620
194
447
  tokenizer_name: anthropic/claude
@@ -196,6 +449,20 @@ model_deployments:
196
449
  client_spec:
197
450
  class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
198
451
 
452
+ - name: anthropic/claude-3-5-sonnet-20241022
453
+ model_name: anthropic/claude-3-5-sonnet-20241022
454
+ tokenizer_name: anthropic/claude
455
+ max_sequence_length: 200000
456
+ client_spec:
457
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
458
+
459
+ - name: anthropic/claude-3-7-sonnet-20250219
460
+ model_name: anthropic/claude-3-7-sonnet-20250219
461
+ tokenizer_name: anthropic/claude
462
+ max_sequence_length: 200000
463
+ client_spec:
464
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
465
+
199
466
  - name: anthropic/stanford-online-all-v4-s3
200
467
  deprecated: true # Closed model, not accessible via API
201
468
  model_name: anthropic/stanford-online-all-v4-s3
@@ -307,6 +574,35 @@ model_deployments:
307
574
  client_spec:
308
575
  class_name: "helm.clients.together_client.TogetherChatClient"
309
576
 
577
+ - name: together/deepseek-v3
578
+ model_name: deepseek-ai/deepseek-v3
579
+ tokenizer_name: deepseek-ai/deepseek-v3
580
+ max_sequence_length: 16384
581
+ client_spec:
582
+ class_name: "helm.clients.together_client.TogetherChatClient"
583
+ args:
584
+ disable_logprobs: True
585
+
586
+ - name: together/deepseek-r1
587
+ model_name: deepseek-ai/deepseek-r1
588
+ tokenizer_name: deepseek-ai/deepseek-r1
589
+ max_sequence_length: 32768
590
+ client_spec:
591
+ class_name: "helm.clients.together_client.TogetherChatClient"
592
+ args:
593
+ disable_logprobs: True
594
+
595
+ - name: together/deepseek-r1-hide-reasoning
596
+ model_name: deepseek-ai/deepseek-r1-hide-reasoning
597
+ tokenizer_name: deepseek-ai/deepseek-r1
598
+ max_sequence_length: 32768
599
+ client_spec:
600
+ class_name: "helm.clients.together_client.TogetherChatClient"
601
+ args:
602
+ together_model: deepseek-ai/deepseek-r1
603
+ disable_logprobs: True
604
+ output_processor: helm.benchmark.metrics.output_processors.remove_deepseek_r1_thinking
605
+
310
606
  # Gooseai
311
607
 
312
608
  # TODO: Migrate these models to use OpenAIClient
@@ -422,6 +718,7 @@ model_deployments:
422
718
  client_spec:
423
719
  class_name: "helm.clients.vertexai_client.VertexAIChatClient"
424
720
  args:
721
+ vertexai_model: gemini-1.5-pro-001
425
722
  safety_settings_preset: default
426
723
 
427
724
  - name: google/gemini-1.5-pro-001-safety-block-none
@@ -432,6 +729,7 @@ model_deployments:
432
729
  client_spec:
433
730
  class_name: "helm.clients.vertexai_client.VertexAIChatClient"
434
731
  args:
732
+ vertexai_model: gemini-1.5-pro-001
435
733
  safety_settings_preset: block_none
436
734
 
437
735
  - name: google/gemini-1.5-flash-001-safety-default
@@ -442,6 +740,7 @@ model_deployments:
442
740
  client_spec:
443
741
  class_name: "helm.clients.vertexai_client.VertexAIChatClient"
444
742
  args:
743
+ vertexai_model: gemini-1.5-flash-001
445
744
  safety_settings_preset: default
446
745
 
447
746
  - name: google/gemini-1.5-flash-001-safety-block-none
@@ -452,8 +751,99 @@ model_deployments:
452
751
  client_spec:
453
752
  class_name: "helm.clients.vertexai_client.VertexAIChatClient"
454
753
  args:
754
+ vertexai_model: gemini-1.5-flash-001
455
755
  safety_settings_preset: block_none
456
756
 
757
+ - name: google/gemini-1.5-pro-002
758
+ model_name: google/gemini-1.5-pro-002
759
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
760
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
761
+ # TODO: Max output tokens: 8192
762
+ client_spec:
763
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
764
+
765
+ - name: google/gemini-1.5-flash-002
766
+ model_name: google/gemini-1.5-flash-002
767
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
768
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
769
+ # TODO: Max output tokens: 8192
770
+ client_spec:
771
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
772
+
773
+ - name: google/gemini-2.0-pro-exp-02-05
774
+ model_name: google/gemini-2.0-pro-exp-02-05
775
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
776
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
777
+ # TODO: Max output tokens: 8192
778
+ client_spec:
779
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
780
+
781
+ - name: google/gemini-2.0-flash-exp
782
+ model_name: google/gemini-2.0-flash-exp
783
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
784
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
785
+ # TODO: Max output tokens: 8192
786
+ client_spec:
787
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
788
+
789
+ - name: google/gemini-2.0-flash-001
790
+ model_name: google/gemini-2.0-flash-001
791
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
792
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
793
+ # TODO: Max output tokens: 8192
794
+ client_spec:
795
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
796
+
797
+ - name: google/gemini-2.0-flash-lite-preview-02-05
798
+ model_name: google/gemini-2.0-flash-lite-preview-02-05
799
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
800
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
801
+ # TODO: Max output tokens: 8192
802
+ client_spec:
803
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
804
+
805
+ - name: google/gemini-2.0-flash-thinking-exp-01-21
806
+ model_name: google/gemini-2.0-flash-thinking-exp-01-21
807
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
808
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
809
+ client_spec:
810
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
811
+
812
+ - name: google/gemini-1.5-flash-8b-001
813
+ model_name: google/gemini-1.5-flash-8b-001
814
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
815
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
816
+ # TODO: Max output tokens: 8192
817
+ client_spec:
818
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
819
+
820
+ - name: google/llama-3.1-8b-instruct
821
+ model_name: meta/llama-3.1-8b-instruct
822
+ tokenizer_name: meta/llama-3.1-8b-instruct
823
+ max_sequence_length: 128000
824
+ client_spec:
825
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
826
+ args:
827
+ vertexai_model: publishers/meta/models/llama-3.1-8b-instruct-maas
828
+
829
+ - name: google/llama-3.1-70b-instruct
830
+ model_name: meta/llama-3.1-70b-instruct
831
+ tokenizer_name: meta/llama-3.1-8b-instruct
832
+ max_sequence_length: 128000
833
+ client_spec:
834
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
835
+ args:
836
+ vertexai_model: publishers/meta/models/llama-3.1-70b-instruct-maas
837
+
838
+ - name: google/llama-3.1-405b-instruct
839
+ model_name: meta/llama-3.1-405b-instruct
840
+ tokenizer_name: meta/llama-3.1-8b-instruct
841
+ max_sequence_length: 128000
842
+ client_spec:
843
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
844
+ args:
845
+ vertexai_model: publishers/meta/models/llama-3.1-405b-instruct-maas
846
+
457
847
  ## Gemma
458
848
  - name: together/gemma-2b
459
849
  model_name: google/gemma-2b
@@ -626,6 +1016,86 @@ model_deployments:
626
1016
  args:
627
1017
  trust_remote_code: true
628
1018
 
1019
+ - name: huggingface/llama3-8b-cpt-sea-lionv2-base
1020
+ model_name: aisingapore/llama3-8b-cpt-sea-lionv2-base
1021
+ tokenizer_name: meta/llama-3-8b-instruct
1022
+ max_sequence_length: 8192
1023
+ client_spec:
1024
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1025
+ args:
1026
+ device_map: auto
1027
+ torch_dtype: torch.bfloat16
1028
+
1029
+ - name: huggingface/llama3-8b-cpt-sea-lionv2.1-instruct
1030
+ model_name: aisingapore/llama3-8b-cpt-sea-lionv2.1-instruct
1031
+ tokenizer_name: meta/llama-3-8b-instruct
1032
+ max_sequence_length: 8192
1033
+ client_spec:
1034
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1035
+ args:
1036
+ device_map: auto
1037
+ torch_dtype: torch.bfloat16
1038
+
1039
+ - name: huggingface/gemma2-9b-cpt-sea-lionv3-base
1040
+ model_name: aisingapore/gemma2-9b-cpt-sea-lionv3-base
1041
+ tokenizer_name: google/gemma-2-9b
1042
+ max_sequence_length: 8192
1043
+ client_spec:
1044
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1045
+ args:
1046
+ device_map: auto
1047
+ torch_dtype: torch.bfloat16
1048
+
1049
+ - name: huggingface/gemma2-9b-cpt-sea-lionv3-instruct
1050
+ model_name: aisingapore/gemma2-9b-cpt-sea-lionv3-instruct
1051
+ tokenizer_name: google/gemma-2-9b
1052
+ max_sequence_length: 8192
1053
+ client_spec:
1054
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1055
+ args:
1056
+ device_map: auto
1057
+ torch_dtype: torch.bfloat16
1058
+
1059
+ - name: huggingface/llama3.1-8b-cpt-sea-lionv3-base
1060
+ model_name: aisingapore/llama3.1-8b-cpt-sea-lionv3-base
1061
+ tokenizer_name: meta/llama-3.1-8b
1062
+ max_sequence_length: 128000
1063
+ client_spec:
1064
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1065
+ args:
1066
+ device_map: auto
1067
+ torch_dtype: torch.bfloat16
1068
+
1069
+ - name: huggingface/llama3.1-8b-cpt-sea-lionv3-instruct
1070
+ model_name: aisingapore/llama3.1-8b-cpt-sea-lionv3-instruct
1071
+ tokenizer_name: meta/llama-3.1-8b
1072
+ max_sequence_length: 128000
1073
+ client_spec:
1074
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1075
+ args:
1076
+ device_map: auto
1077
+ torch_dtype: torch.bfloat16
1078
+
1079
+ - name: huggingface/llama3.1-70b-cpt-sea-lionv3-base
1080
+ model_name: aisingapore/llama3.1-70b-cpt-sea-lionv3-base
1081
+ tokenizer_name: meta/llama-3.1-8b
1082
+ max_sequence_length: 128000
1083
+ client_spec:
1084
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1085
+ args:
1086
+ device_map: auto
1087
+ torch_dtype: torch.bfloat16
1088
+
1089
+ - name: huggingface/llama3.1-70b-cpt-sea-lionv3-instruct
1090
+ model_name: aisingapore/llama3.1-70b-cpt-sea-lionv3-instruct
1091
+ tokenizer_name: meta/llama-3.1-8b
1092
+ max_sequence_length: 128000
1093
+ client_spec:
1094
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1095
+ args:
1096
+ device_map: auto
1097
+ torch_dtype: torch.bfloat16
1098
+
629
1099
  ## Bigcode
630
1100
  - name: huggingface/santacoder
631
1101
  model_name: bigcode/santacoder
@@ -783,6 +1253,24 @@ model_deployments:
783
1253
  class_name: "helm.clients.huggingface_client.HuggingFaceClient"
784
1254
 
785
1255
  ## Meta
1256
+ - name: huggingface/llama-3.1-8b-instruct
1257
+ model_name: meta/llama-3.1-8b-instruct
1258
+ tokenizer_name: meta/llama-3.1-8b-instruct
1259
+ max_sequence_length: 131072
1260
+ client_spec:
1261
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1262
+ args:
1263
+ pretrained_model_name_or_path: meta-llama/Llama-3.1-8B-Instruct
1264
+
1265
+ - name: huggingface/llama-3.2-1b-instruct
1266
+ model_name: meta/llama-3.2-1b-instruct
1267
+ tokenizer_name: meta/llama-3.2-1b-instruct
1268
+ max_sequence_length: 131072
1269
+ client_spec:
1270
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1271
+ args:
1272
+ pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
1273
+
786
1274
  - name: huggingface/opt-175b
787
1275
  model_name: meta/opt-175b
788
1276
  tokenizer_name: facebook/opt-66b
@@ -861,7 +1349,22 @@ model_deployments:
861
1349
  max_sequence_length: 2048
862
1350
  client_spec:
863
1351
  class_name: "helm.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient"
1352
+
1353
+ ## NECTEC
1354
+ - name: huggingface/Pathumma-llm-text-1.0.0
1355
+ model_name: nectec/Pathumma-llm-text-1.0.0
1356
+ tokenizer_name: nectec/Pathumma-llm-text-1.0.0
1357
+ max_sequence_length: 8192
1358
+ client_spec:
1359
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
864
1360
 
1361
+ - name: huggingface/OpenThaiLLM-Prebuilt-7B
1362
+ model_name: nectec/OpenThaiLLM-Prebuilt-7B
1363
+ tokenizer_name: nectec/OpenThaiLLM-Prebuilt-7B
1364
+ max_sequence_length: 4096
1365
+ client_spec:
1366
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1367
+
865
1368
  ## KAIST AI
866
1369
  - name: huggingface/prometheus-vision-13b-v1.0-hf
867
1370
  model_name: kaistai/prometheus-vision-13b-v1.0-hf
@@ -909,7 +1412,37 @@ model_deployments:
909
1412
  args:
910
1413
  device_map: auto
911
1414
  torch_dtype: auto
912
-
1415
+
1416
+ - name: huggingface/phi-3.5-mini-instruct-4bit
1417
+ model_name: microsoft/phi-3.5-mini-instruct
1418
+ tokenizer_name: microsoft/phi-3.5-mini-instruct
1419
+ max_sequence_length: 131072
1420
+ client_spec:
1421
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1422
+ args:
1423
+ device_map: auto
1424
+ torch_dtype: "float16"
1425
+ quantization_config:
1426
+ load_in_4bit: true
1427
+ attn_implementation: "flash_attention_2"
1428
+
1429
+ - name: huggingface/phi-3.5-mini-instruct
1430
+ model_name: microsoft/phi-3.5-mini-instruct
1431
+ tokenizer_name: microsoft/phi-3.5-mini-instruct
1432
+ max_sequence_length: 131072
1433
+ client_spec:
1434
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1435
+
1436
+ - name: huggingface/phi-3.5-moe-instruct
1437
+ model_name: microsoft/phi-3.5-moe-instruct
1438
+ tokenizer_name: microsoft/phi-3.5-mini-instruct
1439
+ max_sequence_length: 131072
1440
+ client_spec:
1441
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1442
+ args:
1443
+ device_map: auto
1444
+ torch_dtype: auto
1445
+
913
1446
  ## Mistral AI
914
1447
  - name: huggingface/bakLlava-v1-hf
915
1448
  model_name: mistralai/bakLlava-v1-hf
@@ -1152,6 +1685,17 @@ model_deployments:
1152
1685
  client_spec:
1153
1686
  class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1154
1687
 
1688
+ # Upstage
1689
+ - name: huggingface/solar-pro-preview-instruct
1690
+ model_name: upstage/solar-pro-preview-instruct
1691
+ tokenizer_name: upstage/solar-pro-preview-instruct
1692
+ max_sequence_length: 4096
1693
+ client_spec:
1694
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1695
+ args:
1696
+ torch_dtype: auto
1697
+ trust_remote_code: true
1698
+
1155
1699
  ## Text-to-Image Diffusion Models
1156
1700
 
1157
1701
  - name: huggingface/dreamlike-diffusion-v1-0
@@ -1386,14 +1930,20 @@ model_deployments:
1386
1930
  precision: bf16-true
1387
1931
 
1388
1932
  # Mistral AI
1389
- - name: mistralai/mistral-tiny
1390
- model_name: mistralai/mistral-7b-v0.1
1391
- tokenizer_name: mistralai/Mistral-7B-v0.1
1392
- max_sequence_length: 32000
1933
+
1934
+ - name: mistralai/ministral-3b-2410
1935
+ model_name: mistralai/ministral-3b-2410
1936
+ tokenizer_name: mistralai/Ministral-8B-Instruct-2410
1937
+ max_sequence_length: 128000
1938
+ client_spec:
1939
+ class_name: "helm.clients.mistral_client.MistralAIClient"
1940
+
1941
+ - name: mistralai/ministral-8b-2410
1942
+ model_name: mistralai/ministral-8b-2410
1943
+ tokenizer_name: mistralai/Ministral-8B-Instruct-2410
1944
+ max_sequence_length: 128000
1393
1945
  client_spec:
1394
1946
  class_name: "helm.clients.mistral_client.MistralAIClient"
1395
- args:
1396
- mistral_model: "mistral-tiny"
1397
1947
 
1398
1948
  - name: mistralai/mistral-small-2402
1399
1949
  model_name: mistralai/mistral-small-2402
@@ -1402,6 +1952,27 @@ model_deployments:
1402
1952
  client_spec:
1403
1953
  class_name: "helm.clients.mistral_client.MistralAIClient"
1404
1954
 
1955
+ - name: mistralai/mistral-small-2409
1956
+ model_name: mistralai/mistral-small-2409
1957
+ tokenizer_name: mistralai/Mistral-7B-v0.1
1958
+ max_sequence_length: 32000
1959
+ client_spec:
1960
+ class_name: "helm.clients.mistral_client.MistralAIClient"
1961
+
1962
+ - name: mistralai/mistral-small-2501
1963
+ model_name: mistralai/mistral-small-2501
1964
+ tokenizer_name: mistralai/Mistral-Small-24B-Instruct-2501
1965
+ max_sequence_length: 32000
1966
+ client_spec:
1967
+ class_name: "helm.clients.mistral_client.MistralAIClient"
1968
+
1969
+ - name: mistralai/mistral-small-2503
1970
+ model_name: mistralai/mistral-small-2503
1971
+ tokenizer_name: mistralai/Mistral-Small-24B-Instruct-2501
1972
+ max_sequence_length: 128000
1973
+ client_spec:
1974
+ class_name: "helm.clients.mistral_client.MistralAIClient"
1975
+
1405
1976
  - name: mistralai/mistral-medium-2312
1406
1977
  model_name: mistralai/mistral-medium-2312
1407
1978
  tokenizer_name: mistralai/Mistral-7B-v0.1
@@ -1423,6 +1994,13 @@ model_deployments:
1423
1994
  client_spec:
1424
1995
  class_name: "helm.clients.mistral_client.MistralAIClient"
1425
1996
 
1997
+ - name: mistralai/mistral-large-2411
1998
+ model_name: mistralai/mistral-large-2411
1999
+ tokenizer_name: mistralai/Mistral-Large-Instruct-2411
2000
+ max_sequence_length: 128000
2001
+ client_spec:
2002
+ class_name: "helm.clients.mistral_client.MistralAIClient"
2003
+
1426
2004
  - name: mistralai/open-mistral-nemo-2407
1427
2005
  model_name: mistralai/open-mistral-nemo-2407
1428
2006
  tokenizer_name: mistralai/Mistral-Nemo-Base-2407
@@ -1430,6 +2008,21 @@ model_deployments:
1430
2008
  client_spec:
1431
2009
  class_name: "helm.clients.mistral_client.MistralAIClient"
1432
2010
 
2011
+ - name: mistralai/pixtral-12b-2409
2012
+ model_name: mistralai/pixtral-12b-2409
2013
+ tokenizer_name: mistralai/Mistral-7B-v0.1
2014
+ max_sequence_length: 128000
2015
+ client_spec:
2016
+ class_name: "helm.clients.mistral_client.MistralAIClient"
2017
+
2018
+ - name: mistralai/pixtral-large-2411
2019
+ model_name: mistralai/pixtral-large-2411
2020
+ tokenizer_name: mistralai/Mistral-Large-Instruct-2407
2021
+ max_sequence_length: 128000
2022
+ client_spec:
2023
+ class_name: "helm.clients.mistral_client.MistralAIClient"
2024
+
2025
+
1433
2026
  # Neurips
1434
2027
  - name: neurips/local
1435
2028
  model_name: neurips/local
@@ -1464,7 +2057,7 @@ model_deployments:
1464
2057
  # to provide a margin of error.
1465
2058
  max_sequence_length: 16000
1466
2059
  client_spec:
1467
- class_name: "helm.clients.openai_client.OpenAIClient"
2060
+ class_name: "helm.clients.openai_client.OpenAILegacyCompletionsClient"
1468
2061
 
1469
2062
  - name: openai/babbage-002
1470
2063
  model_name: openai/babbage-002
@@ -1473,7 +2066,7 @@ model_deployments:
1473
2066
  # to provide a margin of error.
1474
2067
  max_sequence_length: 16000
1475
2068
  client_spec:
1476
- class_name: "helm.clients.openai_client.OpenAIClient"
2069
+ class_name: "helm.clients.openai_client.OpenAILegacyCompletionsClient"
1477
2070
 
1478
2071
  ## GPT 3.5 Turbo Models
1479
2072
  # ChatGPT: https://openai.com/blog/chatgpt
@@ -1484,7 +2077,7 @@ model_deployments:
1484
2077
  max_sequence_length: 4096
1485
2078
  max_request_length: 4097
1486
2079
  client_spec:
1487
- class_name: "helm.clients.openai_client.OpenAIClient"
2080
+ class_name: "helm.clients.openai_client.OpenAILegacyCompletionsClient"
1488
2081
 
1489
2082
  # The claimed sequence length is 4096, but as of 2023-03-07, the empirical usable
1490
2083
  # sequence length is smaller at 4087 with one user input message and one assistant
@@ -1616,30 +2209,142 @@ model_deployments:
1616
2209
  client_spec:
1617
2210
  class_name: "helm.clients.openai_client.OpenAIClient"
1618
2211
 
1619
- - name: openai/gpt-4o-mini-2024-07-18
1620
- model_name: openai/gpt-4o-mini-2024-07-18
1621
- tokenizer_name: openai/o200k_base
1622
- max_sequence_length: 128000
2212
+ - name: openai/gpt-4o-2024-11-20
2213
+ model_name: openai/gpt-4o-2024-11-20
2214
+ tokenizer_name: openai/o200k_base
2215
+ max_sequence_length: 128000
2216
+ client_spec:
2217
+ class_name: "helm.clients.openai_client.OpenAIClient"
2218
+
2219
+ - name: openai/gpt-4o-mini-2024-07-18
2220
+ model_name: openai/gpt-4o-mini-2024-07-18
2221
+ tokenizer_name: openai/o200k_base
2222
+ max_sequence_length: 128000
2223
+ client_spec:
2224
+ class_name: "helm.clients.openai_client.OpenAIClient"
2225
+
2226
+ - name: openai/whisper-1_gpt-4o-2024-11-20
2227
+ model_name: openai/whisper-1_gpt-4o-2024-11-20
2228
+ tokenizer_name: openai/o200k_base
2229
+ max_sequence_length: 128000
2230
+ client_spec:
2231
+ class_name: "helm.clients.openai_client.OpenAITranscriptionThenCompletionClient"
2232
+
2233
+ - name: openai/gpt-4o-audio-preview-2024-10-01
2234
+ model_name: openai/gpt-4o-audio-preview-2024-10-01
2235
+ tokenizer_name: openai/o200k_base
2236
+ max_sequence_length: 128000
2237
+ client_spec:
2238
+ class_name: "helm.clients.openai_client.OpenAIClient"
2239
+
2240
+ - name: openai/gpt-4o-audio-preview-2024-12-17
2241
+ model_name: openai/gpt-4o-audio-preview-2024-12-17
2242
+ tokenizer_name: openai/o200k_base
2243
+ max_sequence_length: 128000
2244
+ client_spec:
2245
+ class_name: "helm.clients.openai_client.OpenAIClient"
2246
+
2247
+ - name: openai/gpt-4o-mini-audio-preview-2024-12-17
2248
+ model_name: openai/gpt-4o-mini-audio-preview-2024-12-17
2249
+ tokenizer_name: openai/o200k_base
2250
+ max_sequence_length: 128000
2251
+ client_spec:
2252
+ class_name: "helm.clients.openai_client.OpenAIClient"
2253
+
2254
+ - name: openai/gpt-4-vision-preview
2255
+ model_name: openai/gpt-4-vision-preview
2256
+ tokenizer_name: openai/cl100k_base
2257
+ max_sequence_length: 128000 # According to https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo
2258
+ max_request_length: 128001
2259
+ max_sequence_and_generated_tokens_length: 132096
2260
+ client_spec:
2261
+ class_name: "helm.clients.openai_client.OpenAIClient"
2262
+
2263
+ - name: openai/gpt-4-1106-vision-preview
2264
+ model_name: openai/gpt-4-1106-vision-preview
2265
+ tokenizer_name: openai/cl100k_base
2266
+ max_sequence_length: 128000 # According to https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo
2267
+ max_request_length: 128001
2268
+ max_sequence_and_generated_tokens_length: 132096
2269
+ client_spec:
2270
+ class_name: "helm.clients.openai_client.OpenAIClient"
2271
+
2272
+ ## GPT-4.5
2273
+ - name: openai/gpt-4.5-preview-2025-02-27
2274
+ model_name: openai/gpt-4.5-preview-2025-02-27
2275
+ tokenizer_name: openai/o200k_base
2276
+ max_sequence_length: 128000
2277
+ client_spec:
2278
+ class_name: "helm.clients.openai_client.OpenAIClient"
2279
+
2280
+ ## o1 Models
2281
+ - name: openai/o1-2024-12-17
2282
+ model_name: openai/o1-2024-12-17
2283
+ tokenizer_name: openai/cl100k_base
2284
+ max_sequence_length: 128000
2285
+ client_spec:
2286
+ class_name: "helm.clients.openai_client.OpenAIClient"
2287
+
2288
+ - name: openai/o1-2024-12-17-low-reasoning-effort
2289
+ model_name: openai/o1-2024-12-17-low-reasoning-effort
2290
+ tokenizer_name: openai/cl100k_base
2291
+ max_sequence_length: 128000
2292
+ client_spec:
2293
+ class_name: "helm.clients.openai_client.OpenAIClient"
2294
+ args:
2295
+ openai_model_name: o1-2024-12-17
2296
+ reasoning_effort: low
2297
+
2298
+ - name: openai/o1-2024-12-17-high-reasoning-effort
2299
+ model_name: openai/o1-2024-12-17-high-reasoning-effort
2300
+ tokenizer_name: openai/cl100k_base
2301
+ max_sequence_length: 128000
2302
+ client_spec:
2303
+ class_name: "helm.clients.openai_client.OpenAIClient"
2304
+ args:
2305
+ openai_model_name: o1-2024-12-17
2306
+ reasoning_effort: high
2307
+
2308
+ - name: openai/o1-preview-2024-09-12
2309
+ model_name: openai/o1-preview-2024-09-12
2310
+ tokenizer_name: openai/cl100k_base
2311
+ max_sequence_length: 128000
2312
+ client_spec:
2313
+ class_name: "helm.clients.openai_client.OpenAIClient"
2314
+
2315
+ - name: openai/o1-mini-2024-09-12
2316
+ model_name: openai/o1-mini-2024-09-12
2317
+ tokenizer_name: openai/cl100k_base
2318
+ max_sequence_length: 128000
2319
+ client_spec:
2320
+ class_name: "helm.clients.openai_client.OpenAIClient"
2321
+
2322
+ - name: openai/o3-mini-2025-01-31
2323
+ model_name: openai/o3-mini-2025-01-31
2324
+ tokenizer_name: openai/cl100k_base
2325
+ max_sequence_length: 200000
1623
2326
  client_spec:
1624
2327
  class_name: "helm.clients.openai_client.OpenAIClient"
1625
2328
 
1626
- - name: openai/gpt-4-vision-preview
1627
- model_name: openai/gpt-4-vision-preview
2329
+ - name: openai/o3-mini-2025-01-31-low-reasoning-effort
2330
+ model_name: openai/o3-mini-2025-01-31-low-reasoning-effort
1628
2331
  tokenizer_name: openai/cl100k_base
1629
- max_sequence_length: 128000 # According to https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo
1630
- max_request_length: 128001
1631
- max_sequence_and_generated_tokens_length: 132096
2332
+ max_sequence_length: 200000
1632
2333
  client_spec:
1633
2334
  class_name: "helm.clients.openai_client.OpenAIClient"
2335
+ args:
2336
+ openai_model_name: o3-mini-2025-01-31
2337
+ reasoning_effort: low
1634
2338
 
1635
- - name: openai/gpt-4-1106-vision-preview
1636
- model_name: openai/gpt-4-1106-vision-preview
2339
+ - name: openai/o3-mini-2025-01-31-high-reasoning-effort
2340
+ model_name: openai/o3-mini-2025-01-31-high-reasoning-effort
1637
2341
  tokenizer_name: openai/cl100k_base
1638
- max_sequence_length: 128000 # According to https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo
1639
- max_request_length: 128001
1640
- max_sequence_and_generated_tokens_length: 132096
2342
+ max_sequence_length: 200000
1641
2343
  client_spec:
1642
2344
  class_name: "helm.clients.openai_client.OpenAIClient"
2345
+ args:
2346
+ openai_model_name: o3-mini-2025-01-31
2347
+ reasoning_effort: high
1643
2348
 
1644
2349
  ## Text Similarity Models
1645
2350
  # OpenAI similarity embedding models: https://beta.openai.com/docs/guides/embeddings
@@ -1831,7 +2536,25 @@ model_deployments:
1831
2536
  client_spec:
1832
2537
  class_name: "helm.clients.together_client.TogetherClient"
1833
2538
  args:
1834
- together_model: meta-llama/Meta-Llama-3-8B
2539
+ together_model: meta-llama/Llama-3-8b-hf
2540
+
2541
+ - name: together/llama-3-8b-instruct-turbo
2542
+ model_name: meta/llama-3-8b-instruct-turbo
2543
+ tokenizer_name: meta/llama-3-8b
2544
+ max_sequence_length: 8191
2545
+ client_spec:
2546
+ class_name: "helm.clients.together_client.TogetherClient"
2547
+ args:
2548
+ together_model: meta-llama/Meta-Llama-3-8B-Instruct-Turbo
2549
+
2550
+ - name: together/llama-3-8b-instruct-lite
2551
+ model_name: meta/llama-3-8b-instruct-lite
2552
+ tokenizer_name: meta/llama-3-8b
2553
+ max_sequence_length: 8191
2554
+ client_spec:
2555
+ class_name: "helm.clients.together_client.TogetherClient"
2556
+ args:
2557
+ together_model: meta-llama/Meta-Llama-3-8B-Instruct-Lite
1835
2558
 
1836
2559
  - name: together/llama-3-70b
1837
2560
  model_name: meta/llama-3-70b
@@ -1842,6 +2565,24 @@ model_deployments:
1842
2565
  args:
1843
2566
  together_model: meta-llama/Meta-Llama-3-70B
1844
2567
 
2568
+ - name: together/llama-3-70b-instruct-turbo
2569
+ model_name: meta/llama-3-70b-instruct-turbo
2570
+ tokenizer_name: meta/llama-3-8b
2571
+ max_sequence_length: 8191
2572
+ client_spec:
2573
+ class_name: "helm.clients.together_client.TogetherClient"
2574
+ args:
2575
+ together_model: meta-llama/Meta-Llama-3-70B-Instruct-Turbo
2576
+
2577
+ - name: together/llama-3-70b-instruct-lite
2578
+ model_name: meta/llama-3-70b-instruct-lite
2579
+ tokenizer_name: meta/llama-3-8b
2580
+ max_sequence_length: 8191
2581
+ client_spec:
2582
+ class_name: "helm.clients.together_client.TogetherClient"
2583
+ args:
2584
+ together_model: meta-llama/Meta-Llama-3-70B-Instruct-Lite
2585
+
1845
2586
  - name: together/llama-3.1-8b-instruct-turbo
1846
2587
  model_name: meta/llama-3.1-8b-instruct-turbo
1847
2588
  tokenizer_name: meta/llama-3.1-8b
@@ -1871,7 +2612,7 @@ model_deployments:
1871
2612
 
1872
2613
  - name: together/llama-3-8b-chat
1873
2614
  model_name: meta/llama-3-8b-chat
1874
- tokenizer_name: meta/llama-3-8b
2615
+ tokenizer_name: meta/llama-3-8b-instruct
1875
2616
  max_sequence_length: 8182
1876
2617
  client_spec:
1877
2618
  class_name: "helm.clients.together_client.TogetherChatClient"
@@ -1880,13 +2621,49 @@ model_deployments:
1880
2621
 
1881
2622
  - name: together/llama-3-70b-chat
1882
2623
  model_name: meta/llama-3-70b-chat
1883
- tokenizer_name: meta/llama-3-8b
2624
+ tokenizer_name: meta/llama-3-8b-instruct
1884
2625
  max_sequence_length: 8182
1885
2626
  client_spec:
1886
2627
  class_name: "helm.clients.together_client.TogetherChatClient"
1887
2628
  args:
1888
2629
  together_model: meta-llama/Llama-3-70b-chat-hf
1889
2630
 
2631
+ - name: together/llama-3.2-3b-instruct-turbo
2632
+ model_name: meta/llama-3.2-3b-instruct-turbo
2633
+ tokenizer_name: meta/llama-3.2-3b-instruct
2634
+ max_sequence_length: 128000
2635
+ client_spec:
2636
+ class_name: "helm.clients.together_client.TogetherChatClient"
2637
+ args:
2638
+ together_model: meta-llama/Llama-3.2-3B-Instruct-Turbo
2639
+
2640
+ - name: together/llama-3.2-11b-vision-instruct-turbo
2641
+ model_name: meta/llama-3.2-11b-vision-instruct-turbo
2642
+ tokenizer_name: meta/llama-3.2-11b-vision-instruct
2643
+ max_sequence_length: 128000
2644
+ client_spec:
2645
+ class_name: "helm.clients.together_client.TogetherChatClient"
2646
+ args:
2647
+ together_model: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo
2648
+
2649
+ - name: together/llama-3.2-90b-vision-instruct-turbo
2650
+ model_name: meta/llama-3.2-90b-vision-instruct-turbo
2651
+ tokenizer_name: meta/llama-3.2-11b-vision-instruct
2652
+ max_sequence_length: 128000
2653
+ client_spec:
2654
+ class_name: "helm.clients.together_client.TogetherChatClient"
2655
+ args:
2656
+ together_model: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo
2657
+
2658
+ - name: together/llama-3.3-70b-instruct-turbo
2659
+ model_name: meta/llama-3.3-70b-instruct-turbo
2660
+ tokenizer_name: meta/llama-3.3-70b-instruct
2661
+ max_sequence_length: 128000
2662
+ client_spec:
2663
+ class_name: "helm.clients.together_client.TogetherChatClient"
2664
+ args:
2665
+ together_model: meta-llama/Llama-3.3-70B-Instruct-Turbo
2666
+
1890
2667
  - name: together/llama-guard-7b
1891
2668
  model_name: meta/llama-guard-7b
1892
2669
  tokenizer_name: meta-llama/Llama-2-7b-hf
@@ -1914,6 +2691,16 @@ model_deployments:
1914
2691
  args:
1915
2692
  together_model: meta-llama/Meta-Llama-Guard-3-8B
1916
2693
 
2694
+ # NVIDIA
2695
+ - name: together/llama-3.1-nemotron-70b-instruct
2696
+ model_name: nvidia/llama-3.1-nemotron-70b-instruct
2697
+ tokenizer_name: nvidia/llama-3.1-nemotron-70b-instruct
2698
+ max_sequence_length: 32768
2699
+ client_spec:
2700
+ class_name: "helm.clients.together_client.TogetherClient"
2701
+ args:
2702
+ together_model: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF
2703
+
1917
2704
  # 01.AI
1918
2705
  - name: together/yi-6b
1919
2706
  model_name: 01-ai/yi-6b
@@ -2021,12 +2808,22 @@ model_deployments:
2021
2808
  client_spec:
2022
2809
  class_name: "helm.clients.together_client.TogetherChatClient"
2023
2810
 
2811
+ - name: huggingface/mistral-7b-instruct-v0.3
2812
+ model_name: mistralai/mistral-7b-instruct-v0.3-hf
2813
+ tokenizer_name: mistralai/Mistral-7B-Instruct-v0.3
2814
+ max_sequence_length: 32000
2815
+ client_spec:
2816
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
2817
+ args:
2818
+ pretrained_model_name_or_path: mistralai/Mistral-7B-Instruct-v0.3
2819
+
2024
2820
  - name: together/mistral-7b-instruct-v0.3
2025
2821
  model_name: mistralai/mistral-7b-instruct-v0.3
2026
2822
  tokenizer_name: mistralai/Mistral-7B-Instruct-v0.3
2027
2823
  max_sequence_length: 32000
2028
2824
  client_spec:
2029
2825
  class_name: "helm.clients.together_client.TogetherChatClient"
2826
+
2030
2827
 
2031
2828
  - name: together/mixtral-8x7b-32kseqlen
2032
2829
  model_name: mistralai/mixtral-8x7b-32kseqlen
@@ -2262,6 +3059,61 @@ model_deployments:
2262
3059
  client_spec:
2263
3060
  class_name: "helm.clients.vision_language.palmyra_vision_client.PalmyraVisionClient"
2264
3061
 
3062
+ - name: writer/palmyra-x-004
3063
+ model_name: writer/palmyra-x-004
3064
+ # Actual tokenizer is Llama 2, but it cannot be used in HELM due to this issue:
3065
+ # https://github.com/stanford-crfm/helm/issues/2467
3066
+ # Work around by using Llama 3 tokenizer for now.
3067
+ tokenizer_name: meta/llama-3-8b
3068
+ max_sequence_length: 8192
3069
+ client_spec:
3070
+ class_name: "helm.clients.palmyra_client.PalmyraChatClient"
3071
+
3072
+ - name: writer/palmyra-med-32k
3073
+ model_name: writer/palmyra-med-32k
3074
+ # Palmyra-Med uses the "<|end_of_text|>" as the end of text token, which is used by meta/llama-3-8b,
3075
+ # rather than "<|eot_id|>", which is used by meta/llama-3-8b-instruct
3076
+ tokenizer_name: meta/llama-3-8b
3077
+ max_sequence_length: 32000
3078
+ client_spec:
3079
+ class_name: "helm.clients.palmyra_client.PalmyraChatClient"
3080
+
3081
+ - name: writer/palmyra-med
3082
+ model_name: writer/palmyra-med
3083
+ # Palmyra-Med uses the "<|end_of_text|>" as the end of text token, which is used by meta/llama-3-8b,
3084
+ # rather than "<|eot_id|>", which is used by meta/llama-3-8b-instruct
3085
+ tokenizer_name: meta/llama-3-8b
3086
+ max_sequence_length: 4096
3087
+ client_spec:
3088
+ class_name: "helm.clients.palmyra_client.PalmyraChatClient"
3089
+
3090
+ - name: writer/palmyra-fin-32k
3091
+ model_name: writer/palmyra-fin-32k
3092
+ tokenizer_name: meta/llama-3-8b-instruct
3093
+ max_sequence_length: 32000
3094
+ client_spec:
3095
+ class_name: "helm.clients.palmyra_client.PalmyraChatClient"
3096
+
3097
+ - name: writer/palmyra-fin
3098
+ model_name: writer/palmyra-fin
3099
+ tokenizer_name: meta/llama-3-8b-instruct
3100
+ max_sequence_length: 128000
3101
+ client_spec:
3102
+ class_name: "helm.clients.palmyra_client.PalmyraChatClient"
3103
+
3104
+
3105
+ # xAI
3106
+
3107
+ - name: xai/grok-beta
3108
+ model_name: xai/grok-beta
3109
+ # No public information on tokenizer, so just pick an arbitrary one.
3110
+ # It shouldn't matter since the context is long.
3111
+ tokenizer_name: openai/o200k_base
3112
+ max_sequence_length: 128000
3113
+ client_spec:
3114
+ class_name: "helm.clients.openai_client.OpenAIClient"
3115
+ args:
3116
+ base_url: https://api.x.ai/v1
2265
3117
 
2266
3118
  # Qwen
2267
3119
 
@@ -2352,6 +3204,49 @@ model_deployments:
2352
3204
  client_spec:
2353
3205
  class_name: "helm.clients.together_client.TogetherChatClient"
2354
3206
 
3207
+ - name: together/qwen2.5-7b-instruct-turbo
3208
+ model_name: qwen/qwen2.5-7b-instruct-turbo
3209
+ tokenizer_name: qwen/qwen2.5-7b-instruct
3210
+ max_sequence_length: 128000
3211
+ client_spec:
3212
+ class_name: "helm.clients.together_client.TogetherChatClient"
3213
+
3214
+ - name: together/qwen2.5-72b-instruct-turbo
3215
+ model_name: qwen/qwen2.5-72b-instruct-turbo
3216
+ tokenizer_name: qwen/qwen2.5-7b-instruct
3217
+ max_sequence_length: 128000
3218
+ client_spec:
3219
+ class_name: "helm.clients.together_client.TogetherChatClient"
3220
+
3221
+ - name: huggingface/qwen2.5-7b-instruct-4bit
3222
+ model_name: qwen/qwen2.5-7b-instruct
3223
+ tokenizer_name: qwen/qwen2.5-7b-instruct
3224
+ max_sequence_length: 128000
3225
+ client_spec:
3226
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3227
+ args:
3228
+ pretrained_model_name_or_path: Qwen/Qwen2.5-7B-Instruct
3229
+ torch_dtype: "float16"
3230
+ quantization_config:
3231
+ load_in_4bit: true
3232
+ attn_implementation: "flash_attention_2"
3233
+
3234
+ - name: huggingface/qwen2.5-7b-instruct
3235
+ model_name: qwen/qwen2.5-7b-instruct
3236
+ tokenizer_name: qwen/qwen2.5-7b-instruct
3237
+ max_sequence_length: 128000
3238
+ client_spec:
3239
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3240
+ args:
3241
+ pretrained_model_name_or_path: Qwen/Qwen2.5-7B-Instruct
3242
+
3243
+ - name: together/qwq-32b-preview
3244
+ model_name: qwen/qwq-32b-preview
3245
+ tokenizer_name: qwen/qwq-32b-preview
3246
+ max_sequence_length: 32768
3247
+ client_spec:
3248
+ class_name: "helm.clients.together_client.TogetherChatClient"
3249
+
2355
3250
  - name: huggingface/qwen-vl
2356
3251
  model_name: qwen/qwen-vl
2357
3252
  tokenizer_name: qwen/qwen-vl
@@ -2366,6 +3261,34 @@ model_deployments:
2366
3261
  client_spec:
2367
3262
  class_name: "helm.clients.vision_language.qwen_vlm_client.QwenVLMClient"
2368
3263
 
3264
+ - name: huggingface/qwen2-vl-7b-instruct
3265
+ model_name: qwen/qwen2-vl-7b-instruct
3266
+ tokenizer_name: qwen/qwen-vl-chat
3267
+ max_sequence_length: 8191
3268
+ client_spec:
3269
+ class_name: "helm.clients.vision_language.qwen2_vlm_client.Qwen2VLMClient"
3270
+
3271
+ - name: huggingface/qwen2-vl-72b-instruct
3272
+ model_name: qwen/qwen2-vl-72b-instruct
3273
+ tokenizer_name: qwen/qwen-vl-chat
3274
+ max_sequence_length: 8191
3275
+ client_spec:
3276
+ class_name: "helm.clients.vision_language.qwen2_vlm_client.Qwen2VLMClient"
3277
+
3278
+ - name: huggingface/qwen-audio-chat
3279
+ model_name: qwen/qwen-audio-chat
3280
+ tokenizer_name: qwen/qwen-audio-chat
3281
+ max_sequence_length: 8191
3282
+ client_spec:
3283
+ class_name: "helm.clients.audio_language.qwen_audiolm_client.QwenAudioLMClient"
3284
+
3285
+ - name: huggingface/qwen2-audio-7b-instruct
3286
+ model_name: qwen/qwen2-audio-7b-instruct
3287
+ tokenizer_name: qwen/qwen2-audio-instruct
3288
+ max_sequence_length: 8191
3289
+ client_spec:
3290
+ class_name: "helm.clients.audio_language.qwen2_audiolm_client.Qwen2AudioLMClient"
3291
+
2369
3292
  # Reka
2370
3293
  - name: reka/reka-core
2371
3294
  model_name: reka/reka-core
@@ -2415,3 +3338,305 @@ model_deployments:
2415
3338
  max_sequence_length: 64000
2416
3339
  client_spec:
2417
3340
  class_name: "helm.clients.reka_client.RekaClient"
3341
+
3342
+ # Upstage
3343
+ - name: upstage/solar-pro-241126
3344
+ model_name: upstage/solar-pro-241126
3345
+ tokenizer_name: upstage/solar-pro-preview-instruct
3346
+ max_sequence_length: 32768
3347
+ client_spec:
3348
+ class_name: "helm.clients.upstage_client.UpstageChatClient"
3349
+
3350
+ # Diva Llama
3351
+ - name: huggingface/diva-llama
3352
+ model_name: stanford/diva-llama
3353
+ # TODO: Set the right tokenizer
3354
+ tokenizer_name: meta/llama-3-8b-instruct
3355
+ max_sequence_length: 8192
3356
+ client_spec:
3357
+ class_name: "helm.clients.audio_language.diva_llama_client.DivaLlamaClient"
3358
+
3359
+ # LLaMA-Omni
3360
+ - name: ictnlp/llama-3.1-8b-omni
3361
+ model_name: ictnlp/llama-3.1-8b-omni
3362
+ tokenizer_name: ictnlp/llama-3.1-8b-omni
3363
+ max_sequence_length: 8192
3364
+ client_spec:
3365
+ class_name: "helm.clients.audio_language.llama_omni_client.LlamaOmniAudioLMClient"
3366
+
3367
+ # IBM - Granite 3.0
3368
+ - name: huggingface/granite-3.0-2b-base
3369
+ model_name: ibm-granite/granite-3.0-2b-base
3370
+ tokenizer_name: ibm-granite/granite-3.0-2b-base
3371
+ max_sequence_length: 4096
3372
+ client_spec:
3373
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3374
+ args:
3375
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-2b-base
3376
+
3377
+ - name: huggingface/granite-3.0-2b-instruct
3378
+ model_name: ibm-granite/granite-3.0-2b-instruct
3379
+ tokenizer_name: ibm-granite/granite-3.0-2b-instruct
3380
+ max_sequence_length: 4096
3381
+ client_spec:
3382
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3383
+ args:
3384
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-2b-instruct
3385
+
3386
+ - name: huggingface/granite-3.0-8b-instruct
3387
+ model_name: ibm-granite/granite-3.0-8b-instruct
3388
+ tokenizer_name: ibm-granite/granite-3.0-8b-instruct
3389
+ max_sequence_length: 4096
3390
+ client_spec:
3391
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3392
+ args:
3393
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-8b-instruct
3394
+
3395
+ - name: huggingface/granite-3.0-8b-base
3396
+ model_name: ibm-granite/granite-3.0-8b-base
3397
+ tokenizer_name: ibm-granite/granite-3.0-8b-base
3398
+ max_sequence_length: 4096
3399
+ client_spec:
3400
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3401
+ args:
3402
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-8b-base
3403
+
3404
+ - name: huggingface/granite-3.0-3b-a800m-instruct
3405
+ model_name: ibm-granite/granite-3.0-3b-a800m-instruct
3406
+ tokenizer_name: ibm-granite/granite-3.0-3b-a800m-instruct
3407
+ max_sequence_length: 4096
3408
+ client_spec:
3409
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3410
+ args:
3411
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-3b-a800m-instruct
3412
+
3413
+ - name: huggingface/granite-3.0-3b-a800m-base
3414
+ model_name: ibm-granite/granite-3.0-3b-a800m-base
3415
+ tokenizer_name: ibm-granite/granite-3.0-3b-a800m-base
3416
+ max_sequence_length: 4096
3417
+ client_spec:
3418
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3419
+ args:
3420
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-3b-a800m-base
3421
+
3422
+ - name: huggingface/granite-3.0-1b-a400m-instruct
3423
+ model_name: ibm-granite/granite-3.0-1b-a400m-instruct
3424
+ tokenizer_name: ibm-granite/granite-3.0-1b-a400m-instruct
3425
+ max_sequence_length: 4096
3426
+ client_spec:
3427
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3428
+ args:
3429
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-instruct
3430
+
3431
+ - name: huggingface/granite-3.0-1b-a400m-base
3432
+ model_name: ibm-granite/granite-3.0-1b-a400m-base
3433
+ tokenizer_name: ibm-granite/granite-3.0-1b-a400m-base
3434
+ max_sequence_length: 4096
3435
+ client_spec:
3436
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3437
+ args:
3438
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base
3439
+
3440
+ - name: huggingface/sabia-7b
3441
+ model_name: maritaca-ai/sabia-7b
3442
+ tokenizer_name: maritaca-ai/sabia-7b
3443
+ max_sequence_length: 2048
3444
+ client_spec:
3445
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3446
+ args:
3447
+ pretrained_model_name_or_path: maritaca-ai/sabia-7b
3448
+
3449
+ # Granite-3.1-8b-base
3450
+ - name: huggingface/granite-3.1-8b-base
3451
+ model_name: ibm-granite/granite-3.1-8b-base
3452
+ tokenizer_name: ibm-granite/granite-3.1-8b-base
3453
+ max_sequence_length: 128000
3454
+ client_spec:
3455
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3456
+ args:
3457
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-8b-base
3458
+
3459
+ # Granite-3.1-8b-instruct
3460
+ - name: huggingface/granite-3.1-8b-instruct
3461
+ model_name: ibm-granite/granite-3.1-8b-instruct
3462
+ tokenizer_name: ibm-granite/granite-3.1-8b-instruct
3463
+ max_sequence_length: 128000
3464
+ client_spec:
3465
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3466
+ args:
3467
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-8b-instruct
3468
+
3469
+ # Granite-3.1-2b-instruct
3470
+ - name: huggingface/granite-3.1-2b-instruct
3471
+ model_name: ibm-granite/granite-3.1-2b-instruct
3472
+ tokenizer_name: ibm-granite/granite-3.1-2b-instruct
3473
+ max_sequence_length: 128000
3474
+ client_spec:
3475
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3476
+ args:
3477
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-2b-instruct
3478
+
3479
+ # Granite-3.1-2b-base
3480
+ - name: huggingface/granite-3.1-2b-base
3481
+ model_name: ibm-granite/granite-3.1-2b-base
3482
+ tokenizer_name: ibm-granite/granite-3.1-2b-base
3483
+ max_sequence_length: 128000
3484
+ client_spec:
3485
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3486
+ args:
3487
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-2b-base
3488
+
3489
+ # Granite-3.1-3b-a800m-instruct
3490
+ - name: huggingface/granite-3.1-3b-a800m-instruct
3491
+ model_name: ibm-granite/granite-3.1-3b-a800m-instruct
3492
+ tokenizer_name: ibm-granite/granite-3.1-3b-a800m-instruct
3493
+ max_sequence_length: 128000
3494
+ client_spec:
3495
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3496
+ args:
3497
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-3b-a800m-instruct
3498
+
3499
+ # Granite-3.1-3b-a800m-base
3500
+ - name: huggingface/granite-3.1-3b-a800m-base
3501
+ model_name: ibm-granite/granite-3.1-3b-a800m-base
3502
+ tokenizer_name: ibm-granite/granite-3.1-3b-a800m-base
3503
+ max_sequence_length: 128000
3504
+ client_spec:
3505
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3506
+ args:
3507
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-3b-a800m-base
3508
+
3509
+ # Granite-3.1-1b-a400m-instruct
3510
+ - name: huggingface/granite-3.1-1b-a400m-instruct
3511
+ model_name: ibm-granite/granite-3.1-1b-a400m-instruct
3512
+ tokenizer_name: ibm-granite/granite-3.1-1b-a400m-instruct
3513
+ max_sequence_length: 128000
3514
+ client_spec:
3515
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3516
+ args:
3517
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-1b-a400m-instruct
3518
+
3519
+ # Granite-3.1-1b-a400m-base
3520
+ - name: huggingface/granite-3.1-1b-a400m-base
3521
+ model_name: ibm-granite/granite-3.1-1b-a400m-base
3522
+ tokenizer_name: ibm-granite/granite-3.1-1b-a400m-base
3523
+ max_sequence_length: 128000
3524
+ client_spec:
3525
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3526
+ args:
3527
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-1b-a400m-base
3528
+
3529
+ # DeepSeek-R1-Distill-Llama-8B
3530
+ - name: huggingface/DeepSeek-R1-Distill-Llama-8B
3531
+ model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
3532
+ tokenizer_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
3533
+ max_sequence_length: 128000
3534
+ client_spec:
3535
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3536
+ args:
3537
+ pretrained_model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
3538
+
3539
+ # deepseek-ai/deepseek-coder-6.7b-instruct
3540
+ - name: huggingface/deepseek-coder-6.7b-instruct
3541
+ model_name: deepseek-ai/deepseek-coder-6.7b-instruct
3542
+ tokenizer_name: deepseek-ai/deepseek-coder-6.7b-instruct
3543
+ max_sequence_length: 16384
3544
+ client_spec:
3545
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3546
+ args:
3547
+ pretrained_model_name_or_path: deepseek-ai/deepseek-coder-6.7b-instruct
3548
+
3549
+ # IBM WatsonX
3550
+ - name: ibm/llama-3.3-70b-instruct
3551
+ model_name: meta/llama-3.3-70b-instruct
3552
+ tokenizer_name: meta/llama-3.3-70b-instruct
3553
+ max_sequence_length: 128000
3554
+ client_spec:
3555
+ class_name: "helm.clients.ibm_client.IbmChatClient"
3556
+ args:
3557
+ watsonx_model_name: meta-llama/llama-3-3-70b-instruct
3558
+ region: Dallas
3559
+
3560
+ - name: ibm/granite-3-2b-instruct
3561
+ model_name: ibm/granite-3.1-2b-instruct
3562
+ tokenizer_name: ibm-granite/granite-3.1-2b-instruct
3563
+ max_sequence_length: 128000
3564
+ client_spec:
3565
+ class_name: "helm.clients.ibm_client.IbmChatClient"
3566
+ args:
3567
+ watsonx_model_name: ibm/granite-3-2b-instruct
3568
+ region: Dallas
3569
+
3570
+
3571
+ # Granite-3-8b-instruct
3572
+ - name: ibm/granite-3-8b-instruct
3573
+ model_name: ibm/granite-3.1-8b-instruct
3574
+ tokenizer_name: ibm-granite/granite-3.1-8b-instruct
3575
+ max_sequence_length: 128000
3576
+ client_spec:
3577
+ class_name: "helm.clients.ibm_client.IbmChatClient"
3578
+ args:
3579
+ watsonx_model_name: ibm/granite-3-8b-instruct
3580
+ region: Dallas
3581
+ #
3582
+
3583
+ # Granite-13b-instruct-v2
3584
+ - name: ibm/granite-13b-instruct-v2
3585
+ model_name: ibm/granite-13b-instruct-v2
3586
+ tokenizer_name: EleutherAI/gpt-neox-20b
3587
+ max_sequence_length: 8192
3588
+ client_spec:
3589
+ class_name: "helm.clients.ibm_client.IbmTextClient"
3590
+ args:
3591
+ watsonx_model_name: ibm/granite-13b-instruct-v2
3592
+ region: Dallas
3593
+ # Granite-20b-code-instruct-8k
3594
+ - name: ibm/granite-20b-code-instruct-8k
3595
+ model_name: ibm/granite-20b-code-instruct-8k
3596
+ tokenizer_name: ibm-granite/granite-20b-code-instruct-8k
3597
+ max_sequence_length: 8192
3598
+ client_spec:
3599
+ class_name: "helm.clients.ibm_client.IbmChatClient"
3600
+ args:
3601
+ watsonx_model_name: ibm/granite-20b-code-instruct
3602
+ region: Dallas
3603
+ # Granite-34b-code-instruct
3604
+ - name: ibm/granite-34b-code-instruct
3605
+ model_name: ibm/granite-34b-code-instruct
3606
+ tokenizer_name: ibm-granite/granite-34b-code-instruct-8k
3607
+ max_sequence_length: 8192
3608
+ client_spec:
3609
+ class_name: "helm.clients.ibm_client.IbmChatClient"
3610
+ args:
3611
+ watsonx_model_name: ibm/granite-34b-code-instruct
3612
+ region: Dallas
3613
+ # Granite-3b-code-instruct
3614
+ - name: ibm/granite-3b-code-instruct
3615
+ model_name: ibm/granite-3b-code-instruct
3616
+ tokenizer_name: ibm-granite/granite-3b-code-instruct-128k
3617
+ max_sequence_length: 128000
3618
+ client_spec:
3619
+ class_name: "helm.clients.ibm_client.IbmTextClient"
3620
+ args:
3621
+ watsonx_model_name: ibm/granite-3b-code-instruct
3622
+ region: Dallas
3623
+ # Granite-8b-code-instruct
3624
+ - name: ibm/granite-8b-code-instruct
3625
+ model_name: ibm/granite-8b-code-instruct
3626
+ tokenizer_name: ibm-granite/granite-8b-code-instruct-128k
3627
+ max_sequence_length: 128000
3628
+ client_spec:
3629
+ class_name: "helm.clients.ibm_client.IbmTextClient"
3630
+ args:
3631
+ watsonx_model_name: ibm/granite-8b-code-instruct
3632
+ region: Dallas
3633
+
3634
+ - name: ibm/mixtral-8x7b-instruct-v0:1
3635
+ model_name: mistralai/mixtral-8x7b-instruct-v0:1
3636
+ tokenizer_name: huggingface/gpt2
3637
+ max_sequence_length: 4000
3638
+ client_spec:
3639
+ class_name: "helm.clients.ibm_client.IbmChatClient"
3640
+ args:
3641
+ watsonx_model_name: mistralai/mixtral-8x7b-instruct-v01
3642
+ region: Dallas