crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic; see the package registry's advisory page for more details.

Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
@@ -104,7 +104,39 @@ model_deployments:
104
104
  class_name: "helm.benchmark.window_services.image_generation.clip_window_service.CLIPWindowService"
105
105
 
106
106
 
107
- # Amazon
107
 + # Amazon Nova models
108
+ - name: amazon/nova-premier-v1:0
109
+ model_name: amazon/nova-premier-v1:0
110
+ tokenizer_name: huggingface/gpt2
111
+ max_sequence_length: 1000000
112
+ client_spec:
113
+ class_name: "helm.clients.bedrock_client.BedrockNovaClient"
114
+ args:
115
+ bedrock_model_id: us.amazon.nova-premier-v1:0
116
+
117
+ - name: amazon/nova-pro-v1:0
118
+ model_name: amazon/nova-pro-v1:0
119
+ tokenizer_name: huggingface/gpt2
120
+ max_sequence_length: 300000
121
+ client_spec:
122
+ class_name: "helm.clients.bedrock_client.BedrockNovaClient"
123
+
124
+ - name: amazon/nova-lite-v1:0
125
+ model_name: amazon/nova-lite-v1:0
126
+ tokenizer_name: huggingface/gpt2
127
+ max_sequence_length: 300000
128
+ client_spec:
129
+ class_name: "helm.clients.bedrock_client.BedrockNovaClient"
130
+
131
+ - name: amazon/nova-micro-v1:0
132
+ model_name: amazon/nova-micro-v1:0
133
+ tokenizer_name: huggingface/gpt2
134
+ max_sequence_length: 128000
135
+ client_spec:
136
+ class_name: "helm.clients.bedrock_client.BedrockNovaClient"
137
+
138
+ # Titan on Amazon Bedrock
139
+
108
140
  - name: amazon/titan-text-lite-v1
109
141
  model_name: amazon/titan-text-lite-v1
110
142
  tokenizer_name: huggingface/gpt2
@@ -112,20 +144,87 @@ model_deployments:
112
144
  client_spec:
113
145
  class_name: "helm.clients.bedrock_client.BedrockTitanClient"
114
146
 
115
- - name: amazon/titan-tg1-large
116
- model_name: amazon/titan-tg1-large
147
+ - name: amazon/titan-text-express-v1
148
+ model_name: amazon/titan-text-express-v1
117
149
  tokenizer_name: huggingface/gpt2
118
150
  max_sequence_length: 8000
119
151
  client_spec:
120
152
  class_name: "helm.clients.bedrock_client.BedrockTitanClient"
153
+
154
+ # Mistral on Amazon Bedrock
121
155
 
122
- - name: amazon/titan-text-express-v1
123
- model_name: amazon/titan-text-express-v1
156
+ - name: amazon/mistral-7b-instruct-v0:2
157
+ model_name: mistralai/amazon-mistral-7b-instruct-v0:2
124
158
  tokenizer_name: huggingface/gpt2
125
159
  max_sequence_length: 8000
126
160
  client_spec:
127
- class_name: "helm.clients.bedrock_client.BedrockTitanClient"
161
+ class_name: "helm.clients.bedrock_client.BedrockMistralClient"
162
+
163
+ - name: amazon/mixtral-8x7b-instruct-v0:1
164
+ model_name: mistralai/amazon-mixtral-8x7b-instruct-v0:1
165
+ tokenizer_name: huggingface/gpt2
166
+ max_sequence_length: 4000
167
+ client_spec:
168
+ class_name: "helm.clients.bedrock_client.BedrockMistralClient"
169
+
170
+ - name: amazon/mistral-large-2402-v1:0
171
+ model_name: mistralai/amazon-mistral-large-2402-v1:0
172
+ tokenizer_name: huggingface/gpt2
173
+ max_sequence_length: 8000
174
+ client_spec:
175
+ class_name: "helm.clients.bedrock_client.BedrockMistralClient"
176
+
177
+ - name: amazon/mistral-small-2402-v1:0
178
+ model_name: mistralai/amazon-mistral-small-2402-v1:0
179
+ tokenizer_name: huggingface/gpt2
180
+ max_sequence_length: 8000
181
+ client_spec:
182
+ class_name: "helm.clients.bedrock_client.BedrockMistralClient"
183
+
184
+ - name: amazon/mistral-large-2407-v1:0
185
+ model_name: mistralai/amazon-mistral-large-2407-v1:0
186
+ tokenizer_name: huggingface/gpt2
187
+ max_sequence_length: 8000
188
+ client_spec:
189
+ class_name: "helm.clients.bedrock_client.BedrockMistralClient"
190
+
191
+ # Llama 3 on Amazon Bedrock
192
+
193
+ - name: amazon/llama3-8b-instruct-v1:0
194
+ model_name: meta/amazon-llama3-8b-instruct-v1:0
195
+ tokenizer_name: huggingface/gpt2
196
+ max_sequence_length: 2000
197
+ client_spec:
198
+ class_name: "helm.clients.bedrock_client.BedrockLlamaClient"
199
+
200
+ - name: amazon/llama3-70b-instruct-v1:0
201
+ model_name: meta/amazon-llama3-70b-instruct-v1:0
202
+ tokenizer_name: huggingface/gpt2
203
+ max_sequence_length: 2000
204
+ client_spec:
205
+ class_name: "helm.clients.bedrock_client.BedrockLlamaClient"
206
+
207
+ - name: amazon/llama3-1-405b-instruct-v1:0
208
+ model_name: meta/amazon-llama3-1-405b-instruct-v1:0
209
+ tokenizer_name: huggingface/gpt2
210
+ max_sequence_length: 2000
211
+ client_spec:
212
+ class_name: "helm.clients.bedrock_client.BedrockLlamaClient"
213
+
214
+ - name: amazon/llama3-1-70b-instruct-v1:0
215
+ model_name: meta/amazon-llama3-1-70b-instruct-v1:0
216
+ tokenizer_name: huggingface/gpt2
217
+ max_sequence_length: 2000
218
+ client_spec:
219
+ class_name: "helm.clients.bedrock_client.BedrockLlamaClient"
128
220
 
221
+
222
+ - name: amazon/llama3-1-8b-instruct-v1:0
223
+ model_name: meta/amazon-llama3-1-8b-instruct-v1:0
224
+ tokenizer_name: huggingface/gpt2
225
+ max_sequence_length: 2000
226
+ client_spec:
227
+ class_name: "helm.clients.bedrock_client.BedrockLlamaClient"
129
228
 
130
229
  # Anthropic
131
230
  - name: anthropic/claude-v1.3
@@ -189,6 +288,13 @@ model_deployments:
189
288
  client_spec:
190
289
  class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
191
290
 
291
+ - name: anthropic/claude-3-5-haiku-20241022
292
+ model_name: anthropic/claude-3-5-haiku-20241022
293
+ tokenizer_name: anthropic/claude
294
+ max_sequence_length: 200000
295
+ client_spec:
296
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
297
+
192
298
  - name: anthropic/claude-3-5-sonnet-20240620
193
299
  model_name: anthropic/claude-3-5-sonnet-20240620
194
300
  tokenizer_name: anthropic/claude
@@ -196,6 +302,67 @@ model_deployments:
196
302
  client_spec:
197
303
  class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
198
304
 
305
+ - name: anthropic/claude-3-5-sonnet-20241022
306
+ model_name: anthropic/claude-3-5-sonnet-20241022
307
+ tokenizer_name: anthropic/claude
308
+ max_sequence_length: 200000
309
+ client_spec:
310
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
311
+
312
+ - name: anthropic/claude-3-7-sonnet-20250219
313
+ model_name: anthropic/claude-3-7-sonnet-20250219
314
+ tokenizer_name: anthropic/claude
315
+ max_sequence_length: 200000
316
+ client_spec:
317
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
318
+
319
+ - name: anthropic/claude-3-7-sonnet-20250219-thinking-10k
320
+ model_name: anthropic/claude-3-7-sonnet-20250219-thinking-10k
321
+ tokenizer_name: anthropic/claude
322
+ max_sequence_length: 200000
323
+ client_spec:
324
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
325
+ args:
326
+ anthropic_model_name: claude-3-7-sonnet-20250219
327
+ thinking_budget_tokens: 10000
328
+ stream: true
329
+
330
+ - name: anthropic/claude-sonnet-4-20250514
331
+ model_name: anthropic/claude-sonnet-4-20250514
332
+ tokenizer_name: anthropic/claude
333
+ max_sequence_length: 200000
334
+ client_spec:
335
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
336
+
337
+ - name: anthropic/claude-sonnet-4-20250514-thinking-10k
338
+ model_name: anthropic/claude-sonnet-4-20250514-thinking-10k
339
+ tokenizer_name: anthropic/claude
340
+ max_sequence_length: 200000
341
+ client_spec:
342
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
343
+ args:
344
+ anthropic_model_name: claude-sonnet-4-20250514
345
+ thinking_budget_tokens: 10000
346
+ stream: true
347
+
348
+ - name: anthropic/claude-opus-4-20250514
349
+ model_name: anthropic/claude-opus-4-20250514
350
+ tokenizer_name: anthropic/claude
351
+ max_sequence_length: 200000
352
+ client_spec:
353
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
354
+
355
+ - name: anthropic/claude-opus-4-20250514-thinking-10k
356
+ model_name: anthropic/claude-opus-4-20250514-thinking-10k
357
+ tokenizer_name: anthropic/claude
358
+ max_sequence_length: 200000
359
+ client_spec:
360
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
361
+ args:
362
+ anthropic_model_name: claude-opus-4-20250514
363
+ thinking_budget_tokens: 10000
364
+ stream: true
365
+
199
366
  - name: anthropic/stanford-online-all-v4-s3
200
367
  deprecated: true # Closed model, not accessible via API
201
368
  model_name: anthropic/stanford-online-all-v4-s3
@@ -307,6 +474,26 @@ model_deployments:
307
474
  client_spec:
308
475
  class_name: "helm.clients.together_client.TogetherChatClient"
309
476
 
477
+ - name: together/deepseek-v3
478
+ model_name: deepseek-ai/deepseek-v3
479
+ tokenizer_name: deepseek-ai/deepseek-v3
480
+ max_sequence_length: 16384
481
+ client_spec:
482
+ class_name: "helm.clients.together_client.TogetherChatClient"
483
+ args:
484
+ disable_logprobs: True
485
+
486
+ - name: together/deepseek-r1-0528
487
+ model_name: deepseek-ai/deepseek-r1-0528
488
+ tokenizer_name: deepseek-ai/deepseek-r1
489
+ max_sequence_length: 32768
490
+ client_spec:
491
+ class_name: "helm.clients.together_client.TogetherChatClient"
492
+ args:
493
+ together_model: deepseek-ai/deepseek-r1
494
+ parse_thinking: true
495
+ disable_logprobs: True
496
+
310
497
  # Gooseai
311
498
 
312
499
  # TODO: Migrate these models to use OpenAIClient
@@ -422,6 +609,7 @@ model_deployments:
422
609
  client_spec:
423
610
  class_name: "helm.clients.vertexai_client.VertexAIChatClient"
424
611
  args:
612
+ vertexai_model: gemini-1.5-pro-001
425
613
  safety_settings_preset: default
426
614
 
427
615
  - name: google/gemini-1.5-pro-001-safety-block-none
@@ -432,6 +620,7 @@ model_deployments:
432
620
  client_spec:
433
621
  class_name: "helm.clients.vertexai_client.VertexAIChatClient"
434
622
  args:
623
+ vertexai_model: gemini-1.5-pro-001
435
624
  safety_settings_preset: block_none
436
625
 
437
626
  - name: google/gemini-1.5-flash-001-safety-default
@@ -442,6 +631,7 @@ model_deployments:
442
631
  client_spec:
443
632
  class_name: "helm.clients.vertexai_client.VertexAIChatClient"
444
633
  args:
634
+ vertexai_model: gemini-1.5-flash-001
445
635
  safety_settings_preset: default
446
636
 
447
637
  - name: google/gemini-1.5-flash-001-safety-block-none
@@ -452,8 +642,176 @@ model_deployments:
452
642
  client_spec:
453
643
  class_name: "helm.clients.vertexai_client.VertexAIChatClient"
454
644
  args:
645
+ vertexai_model: gemini-1.5-flash-001
455
646
  safety_settings_preset: block_none
456
647
 
648
+ - name: google/gemini-1.5-pro-002
649
+ model_name: google/gemini-1.5-pro-002
650
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
651
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
652
+ # TODO: Max output tokens: 8192
653
+ client_spec:
654
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
655
+
656
+ - name: google/gemini-1.5-flash-002
657
+ model_name: google/gemini-1.5-flash-002
658
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
659
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
660
+ # TODO: Max output tokens: 8192
661
+ client_spec:
662
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
663
+
664
+ - name: google/gemini-2.0-pro-exp-02-05
665
+ model_name: google/gemini-2.0-pro-exp-02-05
666
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
667
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
668
+ # TODO: Max output tokens: 8192
669
+ client_spec:
670
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
671
+
672
+ - name: google/gemini-2.0-flash-exp
673
+ model_name: google/gemini-2.0-flash-exp
674
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
675
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
676
+ # TODO: Max output tokens: 8192
677
+ client_spec:
678
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
679
+
680
+ - name: google/gemini-2.0-flash-001
681
+ model_name: google/gemini-2.0-flash-001
682
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
683
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
684
+ # TODO: Max output tokens: 8192
685
+ client_spec:
686
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
687
+
688
+ - name: google/gemini-2.0-flash-lite-preview-02-05
689
+ model_name: google/gemini-2.0-flash-lite-preview-02-05
690
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
691
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
692
+ # TODO: Max output tokens: 8192
693
+ client_spec:
694
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
695
+
696
+ - name: google/gemini-2.0-flash-lite-001
697
+ model_name: google/gemini-2.0-flash-lite-001
698
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
699
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
700
+ # TODO: Max output tokens: 8192
701
+ client_spec:
702
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
703
+
704
+ - name: google/gemini-2.0-flash-thinking-exp-01-21
705
+ model_name: google/gemini-2.0-flash-thinking-exp-01-21
706
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
707
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
708
+ client_spec:
709
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
710
+
711
+ - name: google/gemini-2.5-flash-lite-preview-06-17
712
+ model_name: google/gemini-2.5-flash-lite-preview-06-17
713
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
714
+ max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash
715
+ # TODO: Max output tokens: 65536
716
+ client_spec:
717
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
718
+ args:
719
+ # Only the global location is supported. See:
720
+ # - https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash-lite
721
+ # - https://cloud.google.com/vertex-ai/generative-ai/docs/learn/locations#global-endpoint
722
+ location: global
723
+
724
+ - name: google/gemini-2.5-flash-preview-04-17
725
+ model_name: google/gemini-2.5-flash-preview-04-17
726
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
727
+ max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash
728
+ # TODO: Max output tokens: 65536
729
+ client_spec:
730
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
731
+
732
+ - name: google/gemini-2.5-flash-preview-05-20
733
+ model_name: google/gemini-2.5-flash-preview-05-20
734
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
735
+ max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash
736
+ # TODO: Max output tokens: 65536
737
+ client_spec:
738
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
739
+
740
+ - name: google/gemini-2.5-flash
741
+ model_name: google/gemini-2.5-flash
742
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
743
+ max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash
744
+ # TODO: Max output tokens: 65536
745
+ client_spec:
746
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
747
+
748
+ - name: google/gemini-2.5-pro-exp-03-25
749
+ model_name: google/gemini-2.5-pro-exp-03-25
750
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
751
+ max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-pro
752
+ # TODO: Max output tokens: 65536
753
+ client_spec:
754
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
755
+
756
+ - name: google/gemini-2.5-pro-preview-03-25
757
+ model_name: google/gemini-2.5-pro-preview-03-25
758
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
759
+ max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-pro
760
+ # TODO: Max output tokens: 65536
761
+ client_spec:
762
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
763
+
764
+ - name: google/gemini-2.5-pro-preview-05-06
765
+ model_name: google/gemini-2.5-pro-preview-05-06
766
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
767
+ max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-pro
768
+ # TODO: Max output tokens: 65536
769
+ client_spec:
770
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
771
+
772
+ - name: google/gemini-2.5-pro
773
+ model_name: google/gemini-2.5-pro
774
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
775
+ max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-pro
776
+ # TODO: Max output tokens: 65536
777
+ client_spec:
778
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
779
+
780
+ - name: google/gemini-1.5-flash-8b-001
781
+ model_name: google/gemini-1.5-flash-8b-001
782
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
783
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
784
+ # TODO: Max output tokens: 8192
785
+ client_spec:
786
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
787
+
788
+ - name: google/llama-3.1-8b-instruct
789
+ model_name: meta/llama-3.1-8b-instruct
790
+ tokenizer_name: meta/llama-3.1-8b-instruct
791
+ max_sequence_length: 128000
792
+ client_spec:
793
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
794
+ args:
795
+ vertexai_model: publishers/meta/models/llama-3.1-8b-instruct-maas
796
+
797
+ - name: google/llama-3.1-70b-instruct
798
+ model_name: meta/llama-3.1-70b-instruct
799
+ tokenizer_name: meta/llama-3.1-8b-instruct
800
+ max_sequence_length: 128000
801
+ client_spec:
802
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
803
+ args:
804
+ vertexai_model: publishers/meta/models/llama-3.1-70b-instruct-maas
805
+
806
+ - name: google/llama-3.1-405b-instruct
807
+ model_name: meta/llama-3.1-405b-instruct
808
+ tokenizer_name: meta/llama-3.1-8b-instruct
809
+ max_sequence_length: 128000
810
+ client_spec:
811
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
812
+ args:
813
+ vertexai_model: publishers/meta/models/llama-3.1-405b-instruct-maas
814
+
457
815
  ## Gemma
458
816
  - name: together/gemma-2b
459
817
  model_name: google/gemma-2b
@@ -646,6 +1004,66 @@ model_deployments:
646
1004
  device_map: auto
647
1005
  torch_dtype: torch.bfloat16
648
1006
 
1007
+ - name: huggingface/gemma2-9b-cpt-sea-lionv3-base
1008
+ model_name: aisingapore/gemma2-9b-cpt-sea-lionv3-base
1009
+ tokenizer_name: google/gemma-2-9b
1010
+ max_sequence_length: 8192
1011
+ client_spec:
1012
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1013
+ args:
1014
+ device_map: auto
1015
+ torch_dtype: torch.bfloat16
1016
+
1017
+ - name: huggingface/gemma2-9b-cpt-sea-lionv3-instruct
1018
+ model_name: aisingapore/gemma2-9b-cpt-sea-lionv3-instruct
1019
+ tokenizer_name: google/gemma-2-9b
1020
+ max_sequence_length: 8192
1021
+ client_spec:
1022
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1023
+ args:
1024
+ device_map: auto
1025
+ torch_dtype: torch.bfloat16
1026
+
1027
+ - name: huggingface/llama3.1-8b-cpt-sea-lionv3-base
1028
+ model_name: aisingapore/llama3.1-8b-cpt-sea-lionv3-base
1029
+ tokenizer_name: meta/llama-3.1-8b
1030
+ max_sequence_length: 128000
1031
+ client_spec:
1032
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1033
+ args:
1034
+ device_map: auto
1035
+ torch_dtype: torch.bfloat16
1036
+
1037
+ - name: huggingface/llama3.1-8b-cpt-sea-lionv3-instruct
1038
+ model_name: aisingapore/llama3.1-8b-cpt-sea-lionv3-instruct
1039
+ tokenizer_name: meta/llama-3.1-8b
1040
+ max_sequence_length: 128000
1041
+ client_spec:
1042
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1043
+ args:
1044
+ device_map: auto
1045
+ torch_dtype: torch.bfloat16
1046
+
1047
+ - name: huggingface/llama3.1-70b-cpt-sea-lionv3-base
1048
+ model_name: aisingapore/llama3.1-70b-cpt-sea-lionv3-base
1049
+ tokenizer_name: meta/llama-3.1-8b
1050
+ max_sequence_length: 128000
1051
+ client_spec:
1052
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1053
+ args:
1054
+ device_map: auto
1055
+ torch_dtype: torch.bfloat16
1056
+
1057
+ - name: huggingface/llama3.1-70b-cpt-sea-lionv3-instruct
1058
+ model_name: aisingapore/llama3.1-70b-cpt-sea-lionv3-instruct
1059
+ tokenizer_name: meta/llama-3.1-8b
1060
+ max_sequence_length: 128000
1061
+ client_spec:
1062
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1063
+ args:
1064
+ device_map: auto
1065
+ torch_dtype: torch.bfloat16
1066
+
649
1067
  ## Bigcode
650
1068
  - name: huggingface/santacoder
651
1069
  model_name: bigcode/santacoder
@@ -803,8 +1221,26 @@ model_deployments:
803
1221
  class_name: "helm.clients.huggingface_client.HuggingFaceClient"
804
1222
 
805
1223
  ## Meta
806
- - name: huggingface/opt-175b
807
- model_name: meta/opt-175b
1224
+ - name: huggingface/llama-3.1-8b-instruct
1225
+ model_name: meta/llama-3.1-8b-instruct
1226
+ tokenizer_name: meta/llama-3.1-8b-instruct
1227
+ max_sequence_length: 131072
1228
+ client_spec:
1229
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1230
+ args:
1231
+ pretrained_model_name_or_path: meta-llama/Llama-3.1-8B-Instruct
1232
+
1233
+ - name: huggingface/llama-3.2-1b-instruct
1234
+ model_name: meta/llama-3.2-1b-instruct
1235
+ tokenizer_name: meta/llama-3.2-1b-instruct
1236
+ max_sequence_length: 131072
1237
+ client_spec:
1238
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1239
+ args:
1240
+ pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
1241
+
1242
+ - name: huggingface/opt-175b
1243
+ model_name: meta/opt-175b
808
1244
  tokenizer_name: facebook/opt-66b
809
1245
  max_sequence_length: 2048
810
1246
  client_spec:
@@ -881,7 +1317,24 @@ model_deployments:
881
1317
  max_sequence_length: 2048
882
1318
  client_spec:
883
1319
  class_name: "helm.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient"
1320
+
1321
+ ## NECTEC
1322
+ - name: huggingface/Pathumma-llm-text-1.0.0
1323
+ model_name: nectec/Pathumma-llm-text-1.0.0
1324
+ tokenizer_name: nectec/Pathumma-llm-text-1.0.0
1325
+ max_sequence_length: 8192
1326
+ client_spec:
1327
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
884
1328
 
1329
+ - name: huggingface/OpenThaiLLM-Prebuilt-7B
1330
+ model_name: nectec/OpenThaiLLM-Prebuilt-7B
1331
+ tokenizer_name: nectec/OpenThaiLLM-Prebuilt-7B
1332
+ max_sequence_length: 4096
1333
+ client_spec:
1334
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1335
+ args:
1336
+ apply_chat_template: false
1337
+
885
1338
  ## KAIST AI
886
1339
  - name: huggingface/prometheus-vision-13b-v1.0-hf
887
1340
  model_name: kaistai/prometheus-vision-13b-v1.0-hf
@@ -902,6 +1355,23 @@ model_deployments:
902
1355
  tokenizer_name: "anas-awadalla-2/mpt-7b"
903
1356
  cross_attn_every_n_layers: 4
904
1357
 
1358
+ ## Marin Community
1359
+ - name: huggingface/marin-8b-instruct
1360
+ model_name: marin-community/marin-8b-instruct
1361
+ tokenizer_name: marin-community/marin-8b-instruct
1362
+ max_sequence_length: 4096
1363
+ client_spec:
1364
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1365
+ args:
1366
+ device_map: auto
1367
+
1368
+ - name: together/marin-8b-instruct
1369
+ model_name: marin-community/marin-8b-instruct
1370
+ tokenizer_name: marin-community/marin-8b-instruct
1371
+ max_sequence_length: 4096
1372
+ client_spec:
1373
+ class_name: "helm.clients.together_client.TogetherClient"
1374
+
905
1375
  ## Microsoft
906
1376
  - name: together/phi-2
907
1377
  model_name: microsoft/phi-2
@@ -929,7 +1399,37 @@ model_deployments:
929
1399
  args:
930
1400
  device_map: auto
931
1401
  torch_dtype: auto
932
-
1402
+
1403
+ - name: huggingface/phi-3.5-mini-instruct-4bit
1404
+ model_name: microsoft/phi-3.5-mini-instruct
1405
+ tokenizer_name: microsoft/phi-3.5-mini-instruct
1406
+ max_sequence_length: 131072
1407
+ client_spec:
1408
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1409
+ args:
1410
+ device_map: auto
1411
+ torch_dtype: "float16"
1412
+ quantization_config:
1413
+ load_in_4bit: true
1414
+ attn_implementation: "flash_attention_2"
1415
+
1416
+ - name: huggingface/phi-3.5-mini-instruct
1417
+ model_name: microsoft/phi-3.5-mini-instruct
1418
+ tokenizer_name: microsoft/phi-3.5-mini-instruct
1419
+ max_sequence_length: 131072
1420
+ client_spec:
1421
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1422
+
1423
+ - name: huggingface/phi-3.5-moe-instruct
1424
+ model_name: microsoft/phi-3.5-moe-instruct
1425
+ tokenizer_name: microsoft/phi-3.5-mini-instruct
1426
+ max_sequence_length: 131072
1427
+ client_spec:
1428
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1429
+ args:
1430
+ device_map: auto
1431
+ torch_dtype: auto
1432
+
933
1433
  ## Mistral AI
934
1434
  - name: huggingface/bakLlava-v1-hf
935
1435
  model_name: mistralai/bakLlava-v1-hf
@@ -1017,6 +1517,8 @@ model_deployments:
1017
1517
  max_sequence_length: 32768
1018
1518
  client_spec:
1019
1519
  class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1520
+ args:
1521
+ apply_chat_template: false
1020
1522
 
1021
1523
  - name: huggingface/sailor-7b-chat
1022
1524
  model_name: sail/sailor-7b-chat
@@ -1033,6 +1535,7 @@ model_deployments:
1033
1535
  class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1034
1536
  args:
1035
1537
  device_map: auto
1538
+ apply_chat_template: false
1036
1539
 
1037
1540
  - name: huggingface/sailor-14b-chat
1038
1541
  model_name: sail/sailor-14b-chat
@@ -1172,6 +1675,17 @@ model_deployments:
1172
1675
  client_spec:
1173
1676
  class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1174
1677
 
1678
+ # Upstage
1679
+ - name: huggingface/solar-pro-preview-instruct
1680
+ model_name: upstage/solar-pro-preview-instruct
1681
+ tokenizer_name: upstage/solar-pro-preview-instruct
1682
+ max_sequence_length: 4096
1683
+ client_spec:
1684
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1685
+ args:
1686
+ torch_dtype: auto
1687
+ trust_remote_code: true
1688
+
1175
1689
  ## Text-to-Image Diffusion Models
1176
1690
 
1177
1691
  - name: huggingface/dreamlike-diffusion-v1-0
@@ -1406,14 +1920,20 @@ model_deployments:
1406
1920
  precision: bf16-true
1407
1921
 
1408
1922
  # Mistral AI
1409
- - name: mistralai/mistral-tiny
1410
- model_name: mistralai/mistral-7b-v0.1
1411
- tokenizer_name: mistralai/Mistral-7B-v0.1
1412
- max_sequence_length: 32000
1923
+
1924
+ - name: mistralai/ministral-3b-2410
1925
+ model_name: mistralai/ministral-3b-2410
1926
+ tokenizer_name: mistralai/Ministral-8B-Instruct-2410
1927
+ max_sequence_length: 128000
1928
+ client_spec:
1929
+ class_name: "helm.clients.mistral_client.MistralAIClient"
1930
+
1931
+ - name: mistralai/ministral-8b-2410
1932
+ model_name: mistralai/ministral-8b-2410
1933
+ tokenizer_name: mistralai/Ministral-8B-Instruct-2410
1934
+ max_sequence_length: 128000
1413
1935
  client_spec:
1414
1936
  class_name: "helm.clients.mistral_client.MistralAIClient"
1415
- args:
1416
- mistral_model: "mistral-tiny"
1417
1937
 
1418
1938
  - name: mistralai/mistral-small-2402
1419
1939
  model_name: mistralai/mistral-small-2402
@@ -1422,6 +1942,27 @@ model_deployments:
1422
1942
  client_spec:
1423
1943
  class_name: "helm.clients.mistral_client.MistralAIClient"
1424
1944
 
1945
+ - name: mistralai/mistral-small-2409
1946
+ model_name: mistralai/mistral-small-2409
1947
+ tokenizer_name: mistralai/Mistral-7B-v0.1
1948
+ max_sequence_length: 32000
1949
+ client_spec:
1950
+ class_name: "helm.clients.mistral_client.MistralAIClient"
1951
+
1952
+ - name: mistralai/mistral-small-2501
1953
+ model_name: mistralai/mistral-small-2501
1954
+ tokenizer_name: mistralai/Mistral-Small-24B-Instruct-2501
1955
+ max_sequence_length: 32000
1956
+ client_spec:
1957
+ class_name: "helm.clients.mistral_client.MistralAIClient"
1958
+
1959
+ - name: mistralai/mistral-small-2503
1960
+ model_name: mistralai/mistral-small-2503
1961
+ tokenizer_name: mistralai/Mistral-Small-24B-Instruct-2501
1962
+ max_sequence_length: 128000
1963
+ client_spec:
1964
+ class_name: "helm.clients.mistral_client.MistralAIClient"
1965
+
1425
1966
  - name: mistralai/mistral-medium-2312
1426
1967
  model_name: mistralai/mistral-medium-2312
1427
1968
  tokenizer_name: mistralai/Mistral-7B-v0.1
@@ -1429,6 +1970,13 @@ model_deployments:
1429
1970
  client_spec:
1430
1971
  class_name: "helm.clients.mistral_client.MistralAIClient"
1431
1972
 
1973
+ - name: mistralai/mistral-medium-2505
1974
+ model_name: mistralai/mistral-medium-2505
1975
+ tokenizer_name: mistralai/Mistral-7B-v0.1
1976
+ max_sequence_length: 128000
1977
+ client_spec:
1978
+ class_name: "helm.clients.mistral_client.MistralAIClient"
1979
+
1432
1980
  - name: mistralai/mistral-large-2402
1433
1981
  model_name: mistralai/mistral-large-2402
1434
1982
  tokenizer_name: mistralai/Mistral-7B-v0.1
@@ -1443,6 +1991,13 @@ model_deployments:
1443
1991
  client_spec:
1444
1992
  class_name: "helm.clients.mistral_client.MistralAIClient"
1445
1993
 
1994
+ - name: mistralai/mistral-large-2411
1995
+ model_name: mistralai/mistral-large-2411
1996
+ tokenizer_name: mistralai/Mistral-Large-Instruct-2411
1997
+ max_sequence_length: 128000
1998
+ client_spec:
1999
+ class_name: "helm.clients.mistral_client.MistralAIClient"
2000
+
1446
2001
  - name: mistralai/open-mistral-nemo-2407
1447
2002
  model_name: mistralai/open-mistral-nemo-2407
1448
2003
  tokenizer_name: mistralai/Mistral-Nemo-Base-2407
@@ -1450,6 +2005,21 @@ model_deployments:
1450
2005
  client_spec:
1451
2006
  class_name: "helm.clients.mistral_client.MistralAIClient"
1452
2007
 
2008
+ - name: mistralai/pixtral-12b-2409
2009
+ model_name: mistralai/pixtral-12b-2409
2010
+ tokenizer_name: mistralai/Mistral-7B-v0.1
2011
+ max_sequence_length: 128000
2012
+ client_spec:
2013
+ class_name: "helm.clients.mistral_client.MistralAIClient"
2014
+
2015
+ - name: mistralai/pixtral-large-2411
2016
+ model_name: mistralai/pixtral-large-2411
2017
+ tokenizer_name: mistralai/Mistral-Large-Instruct-2407
2018
+ max_sequence_length: 128000
2019
+ client_spec:
2020
+ class_name: "helm.clients.mistral_client.MistralAIClient"
2021
+
2022
+
1453
2023
  # Neurips
1454
2024
  - name: neurips/local
1455
2025
  model_name: neurips/local
@@ -1484,7 +2054,7 @@ model_deployments:
1484
2054
  # to provide a margin of error.
1485
2055
  max_sequence_length: 16000
1486
2056
  client_spec:
1487
- class_name: "helm.clients.openai_client.OpenAIClient"
2057
+ class_name: "helm.clients.openai_client.OpenAILegacyCompletionsClient"
1488
2058
 
1489
2059
  - name: openai/babbage-002
1490
2060
  model_name: openai/babbage-002
@@ -1493,7 +2063,7 @@ model_deployments:
1493
2063
  # to provide a margin of error.
1494
2064
  max_sequence_length: 16000
1495
2065
  client_spec:
1496
- class_name: "helm.clients.openai_client.OpenAIClient"
2066
+ class_name: "helm.clients.openai_client.OpenAILegacyCompletionsClient"
1497
2067
 
1498
2068
  ## GPT 3.5 Turbo Models
1499
2069
  # ChatGPT: https://openai.com/blog/chatgpt
@@ -1504,7 +2074,7 @@ model_deployments:
1504
2074
  max_sequence_length: 4096
1505
2075
  max_request_length: 4097
1506
2076
  client_spec:
1507
- class_name: "helm.clients.openai_client.OpenAIClient"
2077
+ class_name: "helm.clients.openai_client.OpenAILegacyCompletionsClient"
1508
2078
 
1509
2079
  # The claimed sequence length is 4096, but as of 2023-03-07, the empirical usable
1510
2080
  # sequence length is smaller at 4087 with one user input message and one assistant
@@ -1636,6 +2206,13 @@ model_deployments:
1636
2206
  client_spec:
1637
2207
  class_name: "helm.clients.openai_client.OpenAIClient"
1638
2208
 
2209
+ - name: openai/gpt-4o-2024-11-20
2210
+ model_name: openai/gpt-4o-2024-11-20
2211
+ tokenizer_name: openai/o200k_base
2212
+ max_sequence_length: 128000
2213
+ client_spec:
2214
+ class_name: "helm.clients.openai_client.OpenAIClient"
2215
+
1639
2216
  - name: openai/gpt-4o-mini-2024-07-18
1640
2217
  model_name: openai/gpt-4o-mini-2024-07-18
1641
2218
  tokenizer_name: openai/o200k_base
@@ -1643,6 +2220,69 @@ model_deployments:
1643
2220
  client_spec:
1644
2221
  class_name: "helm.clients.openai_client.OpenAIClient"
1645
2222
 
2223
+ - name: openai/gpt-4.1-2025-04-14
2224
+ model_name: openai/gpt-4.1-2025-04-14
2225
+ tokenizer_name: openai/o200k_base
2226
+ max_sequence_length: 1047576
2227
+ client_spec:
2228
+ class_name: "helm.clients.openai_client.OpenAIClient"
2229
+
2230
+ - name: openai/gpt-4.1-mini-2025-04-14
2231
+ model_name: openai/gpt-4.1-mini-2025-04-14
2232
+ tokenizer_name: openai/o200k_base
2233
+ max_sequence_length: 1047576
2234
+ client_spec:
2235
+ class_name: "helm.clients.openai_client.OpenAIClient"
2236
+
2237
+ - name: openai/gpt-4.1-nano-2025-04-14
2238
+ model_name: openai/gpt-4.1-nano-2025-04-14
2239
+ tokenizer_name: openai/o200k_base
2240
+ max_sequence_length: 1047576
2241
+ client_spec:
2242
+ class_name: "helm.clients.openai_client.OpenAIClient"
2243
+
2244
+ - name: openai/whisper-1_gpt-4o-2024-11-20
2245
+ model_name: openai/whisper-1_gpt-4o-2024-11-20
2246
+ tokenizer_name: openai/o200k_base
2247
+ max_sequence_length: 128000
2248
+ client_spec:
2249
+ class_name: "helm.clients.openai_client.OpenAITranscriptionThenCompletionClient"
2250
+
2251
+ - name: openai/gpt-4o-transcribe_gpt-4o-2024-11-20
2252
+ model_name: openai/gpt-4o-transcribe_gpt-4o-2024-11-20
2253
+ tokenizer_name: openai/o200k_base
2254
+ max_sequence_length: 128000
2255
+ client_spec:
2256
+ class_name: "helm.clients.openai_client.OpenAITranscriptionThenCompletionClient"
2257
+
2258
+ - name: openai/gpt-4o-mini-transcribe_gpt-4o-2024-11-20
2259
+ model_name: openai/gpt-4o-mini-transcribe_gpt-4o-2024-11-20
2260
+ tokenizer_name: openai/o200k_base
2261
+ max_sequence_length: 128000
2262
+ client_spec:
2263
+ class_name: "helm.clients.openai_client.OpenAITranscriptionThenCompletionClient"
2264
+
2265
+ - name: openai/gpt-4o-audio-preview-2024-10-01
2266
+ model_name: openai/gpt-4o-audio-preview-2024-10-01
2267
+ tokenizer_name: openai/o200k_base
2268
+ max_sequence_length: 128000
2269
+ client_spec:
2270
+ class_name: "helm.clients.openai_client.OpenAIClient"
2271
+
2272
+ - name: openai/gpt-4o-audio-preview-2024-12-17
2273
+ model_name: openai/gpt-4o-audio-preview-2024-12-17
2274
+ tokenizer_name: openai/o200k_base
2275
+ max_sequence_length: 128000
2276
+ client_spec:
2277
+ class_name: "helm.clients.openai_client.OpenAIClient"
2278
+
2279
+ - name: openai/gpt-4o-mini-audio-preview-2024-12-17
2280
+ model_name: openai/gpt-4o-mini-audio-preview-2024-12-17
2281
+ tokenizer_name: openai/o200k_base
2282
+ max_sequence_length: 128000
2283
+ client_spec:
2284
+ class_name: "helm.clients.openai_client.OpenAIClient"
2285
+
1646
2286
  - name: openai/gpt-4-vision-preview
1647
2287
  model_name: openai/gpt-4-vision-preview
1648
2288
  tokenizer_name: openai/cl100k_base
@@ -1661,7 +2301,69 @@ model_deployments:
1661
2301
  client_spec:
1662
2302
  class_name: "helm.clients.openai_client.OpenAIClient"
1663
2303
 
2304
+ ## GPT-4.5
2305
+ - name: openai/gpt-4.5-preview-2025-02-27
2306
+ model_name: openai/gpt-4.5-preview-2025-02-27
2307
+ tokenizer_name: openai/o200k_base
2308
+ max_sequence_length: 128000
2309
+ client_spec:
2310
+ class_name: "helm.clients.openai_client.OpenAIClient"
2311
+
1664
2312
  ## o1 Models
2313
+ - name: openai/o1-pro-2025-03-19
2314
+ model_name: openai/o1-pro-2025-03-19
2315
+ tokenizer_name: openai/cl100k_base
2316
+ max_sequence_length: 128000
2317
+ client_spec:
2318
+ class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
2319
+
2320
+ - name: openai/o1-pro-2025-03-19-low-reasoning-effort
2321
+ model_name: openai/o1-pro-2025-03-19-low-reasoning-effort
2322
+ tokenizer_name: openai/cl100k_base
2323
+ max_sequence_length: 128000
2324
+ client_spec:
2325
+ class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
2326
+ args:
2327
+ openai_model_name: o1-pro-2025-03-19
2328
+ reasoning_effort: low
2329
+
2330
+ - name: openai/o1-pro-2025-03-19-high-reasoning-effort
2331
+ model_name: openai/o1-pro-2025-03-19-high-reasoning-effort
2332
+ tokenizer_name: openai/cl100k_base
2333
+ max_sequence_length: 128000
2334
+ client_spec:
2335
+ class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
2336
+ args:
2337
+ openai_model_name: o1-pro-2025-03-19
2338
+ reasoning_effort: high
2339
+
2340
+ - name: openai/o1-2024-12-17
2341
+ model_name: openai/o1-2024-12-17
2342
+ tokenizer_name: openai/cl100k_base
2343
+ max_sequence_length: 128000
2344
+ client_spec:
2345
+ class_name: "helm.clients.openai_client.OpenAIClient"
2346
+
2347
+ - name: openai/o1-2024-12-17-low-reasoning-effort
2348
+ model_name: openai/o1-2024-12-17-low-reasoning-effort
2349
+ tokenizer_name: openai/cl100k_base
2350
+ max_sequence_length: 128000
2351
+ client_spec:
2352
+ class_name: "helm.clients.openai_client.OpenAIClient"
2353
+ args:
2354
+ openai_model_name: o1-2024-12-17
2355
+ reasoning_effort: low
2356
+
2357
+ - name: openai/o1-2024-12-17-high-reasoning-effort
2358
+ model_name: openai/o1-2024-12-17-high-reasoning-effort
2359
+ tokenizer_name: openai/cl100k_base
2360
+ max_sequence_length: 128000
2361
+ client_spec:
2362
+ class_name: "helm.clients.openai_client.OpenAIClient"
2363
+ args:
2364
+ openai_model_name: o1-2024-12-17
2365
+ reasoning_effort: high
2366
+
1665
2367
  - name: openai/o1-preview-2024-09-12
1666
2368
  model_name: openai/o1-preview-2024-09-12
1667
2369
  tokenizer_name: openai/cl100k_base
@@ -1676,6 +2378,100 @@ model_deployments:
1676
2378
  client_spec:
1677
2379
  class_name: "helm.clients.openai_client.OpenAIClient"
1678
2380
 
2381
+ - name: openai/o3-mini-2025-01-31
2382
+ model_name: openai/o3-mini-2025-01-31
2383
+ tokenizer_name: openai/cl100k_base
2384
+ max_sequence_length: 200000
2385
+ client_spec:
2386
+ class_name: "helm.clients.openai_client.OpenAIClient"
2387
+
2388
+ - name: openai/o3-mini-2025-01-31-low-reasoning-effort
2389
+ model_name: openai/o3-mini-2025-01-31-low-reasoning-effort
2390
+ tokenizer_name: openai/cl100k_base
2391
+ max_sequence_length: 200000
2392
+ client_spec:
2393
+ class_name: "helm.clients.openai_client.OpenAIClient"
2394
+ args:
2395
+ openai_model_name: o3-mini-2025-01-31
2396
+ reasoning_effort: low
2397
+
2398
+ - name: openai/o3-mini-2025-01-31-high-reasoning-effort
2399
+ model_name: openai/o3-mini-2025-01-31-high-reasoning-effort
2400
+ tokenizer_name: openai/cl100k_base
2401
+ max_sequence_length: 200000
2402
+ client_spec:
2403
+ class_name: "helm.clients.openai_client.OpenAIClient"
2404
+ args:
2405
+ openai_model_name: o3-mini-2025-01-31
2406
+ reasoning_effort: high
2407
+
2408
+ - name: openai/o3-2025-04-16
2409
+ model_name: openai/o3-2025-04-16
2410
+ tokenizer_name: openai/cl100k_base
2411
+ # Source: https://platform.openai.com/docs/models/o3
2412
+ max_sequence_length: 200000
2413
+ # TODO: max_output_tokens: 100000
2414
+ client_spec:
2415
+ class_name: "helm.clients.openai_client.OpenAIClient"
2416
+
2417
+ - name: openai/o3-2025-04-16-low-reasoning-effort
2418
+ model_name: openai/o3-2025-04-16-low-reasoning-effort
2419
+ tokenizer_name: openai/cl100k_base
2420
+ # Source: https://platform.openai.com/docs/models/o3
2421
+ max_sequence_length: 200000
2422
+ # TODO: max_output_tokens: 100000
2423
+ client_spec:
2424
+ class_name: "helm.clients.openai_client.OpenAIClient"
2425
+ args:
2426
+ openai_model_name: o3-2025-04-16
2427
+ reasoning_effort: low
2428
+
2429
+ - name: openai/o3-2025-04-16-high-reasoning-effort
2430
+ model_name: openai/o3-2025-04-16-high-reasoning-effort
2431
+ tokenizer_name: openai/cl100k_base
2432
+ # Source: https://platform.openai.com/docs/models/o3
2433
+ max_sequence_length: 200000
2434
+ # TODO: max_output_tokens: 100000
2435
+ client_spec:
2436
+ class_name: "helm.clients.openai_client.OpenAIClient"
2437
+ args:
2438
+ openai_model_name: o3-2025-04-16
2439
+ reasoning_effort: high
2440
+
2441
+ - name: openai/o4-mini-2025-04-16
2442
+ model_name: openai/o4-mini-2025-04-16
2443
+ tokenizer_name: openai/cl100k_base
2444
+ # Source: https://platform.openai.com/docs/models/o4-mini
2445
+ max_sequence_length: 200000
2446
+ # TODO: max_output_tokens: 100000
2447
+ client_spec:
2448
+ class_name: "helm.clients.openai_client.OpenAIClient"
2449
+
2450
+ - name: openai/o4-mini-2025-04-16-low-reasoning-effort
2451
+ model_name: openai/o4-mini-2025-04-16-low-reasoning-effort
2452
+ tokenizer_name: openai/cl100k_base
2453
+ # Source: https://platform.openai.com/docs/models/o4-mini
2454
+ max_sequence_length: 200000
2455
+ # TODO: max_output_tokens: 100000
2456
+ client_spec:
2457
+ class_name: "helm.clients.openai_client.OpenAIClient"
2458
+ args:
2459
+ openai_model_name: o4-mini-2025-04-16
2460
+ reasoning_effort: low
2461
+
2462
+
2463
+ - name: openai/o4-mini-2025-04-16-high-reasoning-effort
2464
+ model_name: openai/o4-mini-2025-04-16-high-reasoning-effort
2465
+ tokenizer_name: openai/cl100k_base
2466
+ # Source: https://platform.openai.com/docs/models/o4-mini
2467
+ max_sequence_length: 200000
2468
+ # TODO: max_output_tokens: 100000
2469
+ client_spec:
2470
+ class_name: "helm.clients.openai_client.OpenAIClient"
2471
+ args:
2472
+ openai_model_name: o4-mini-2025-04-16
2473
+ reasoning_effort: high
2474
+
1679
2475
  ## Text Similarity Models
1680
2476
  # OpenAI similarity embedding models: https://beta.openai.com/docs/guides/embeddings
1681
2477
  # The number of parameters is guessed based on the number of parameters of the
@@ -1940,6 +2736,24 @@ model_deployments:
1940
2736
  args:
1941
2737
  together_model: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo
1942
2738
 
2739
+ - name: together/llama-4-scout-17b-16e-instruct
2740
+ model_name: meta/llama-4-scout-17b-16e-instruct
2741
+ tokenizer_name: meta/llama-4-scout-17b-16e-instruct
2742
+ max_sequence_length: 327680
2743
+ client_spec:
2744
+ class_name: "helm.clients.together_client.TogetherChatClient"
2745
+ args:
2746
+ together_model: meta-llama/Llama-4-Scout-17B-16E-Instruct
2747
+
2748
+ - name: together/llama-4-maverick-17b-128e-instruct-fp8
2749
+ model_name: meta/llama-4-maverick-17b-128e-instruct-fp8
2750
+ tokenizer_name: meta/llama-4-scout-17b-16e-instruct
2751
+ max_sequence_length: 524288
2752
+ client_spec:
2753
+ class_name: "helm.clients.together_client.TogetherChatClient"
2754
+ args:
2755
+ together_model: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
2756
+
1943
2757
  - name: together/llama-3-8b-chat
1944
2758
  model_name: meta/llama-3-8b-chat
1945
2759
  tokenizer_name: meta/llama-3-8b-instruct
@@ -1985,6 +2799,15 @@ model_deployments:
1985
2799
  args:
1986
2800
  together_model: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo
1987
2801
 
2802
+ - name: together/llama-3.3-70b-instruct-turbo
2803
+ model_name: meta/llama-3.3-70b-instruct-turbo
2804
+ tokenizer_name: meta/llama-3.3-70b-instruct
2805
+ max_sequence_length: 128000
2806
+ client_spec:
2807
+ class_name: "helm.clients.together_client.TogetherChatClient"
2808
+ args:
2809
+ together_model: meta-llama/Llama-3.3-70B-Instruct-Turbo
2810
+
1988
2811
  - name: together/llama-guard-7b
1989
2812
  model_name: meta/llama-guard-7b
1990
2813
  tokenizer_name: meta-llama/Llama-2-7b-hf
@@ -2012,6 +2835,16 @@ model_deployments:
2012
2835
  args:
2013
2836
  together_model: meta-llama/Meta-Llama-Guard-3-8B
2014
2837
 
2838
+ # NVIDIA
2839
+ - name: together/llama-3.1-nemotron-70b-instruct
2840
+ model_name: nvidia/llama-3.1-nemotron-70b-instruct
2841
+ tokenizer_name: nvidia/llama-3.1-nemotron-70b-instruct
2842
+ max_sequence_length: 32768
2843
+ client_spec:
2844
+ class_name: "helm.clients.together_client.TogetherClient"
2845
+ args:
2846
+ together_model: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF
2847
+
2015
2848
  # 01.AI
2016
2849
  - name: together/yi-6b
2017
2850
  model_name: 01-ai/yi-6b
@@ -2095,13 +2928,49 @@ model_deployments:
2095
2928
  args:
2096
2929
  pretrained_model_name_or_path: allenai/OLMo-1.7-7B-hf
2097
2930
 
2098
- ## MistralAI
2099
- - name: together/mistral-7b-v0.1
2100
- model_name: mistralai/mistral-7b-v0.1
2101
- tokenizer_name: mistralai/Mistral-7B-v0.1
2102
- max_sequence_length: 4095 # Subtract 1 token to work around a off-by-one bug in Together's input validation token counting (#2080)
2931
+ - name: huggingface/olmo-2-1124-7b-instruct
2932
+ model_name: allenai/olmo-2-1124-7b-instruct
2933
+ tokenizer_name: allenai/olmo-2-1124-7b-instruct
2934
+ max_sequence_length: 4096
2103
2935
  client_spec:
2104
- class_name: "helm.clients.together_client.TogetherClient"
2936
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
2937
+ args:
2938
+ device_map: auto
2939
+
2940
+ - name: huggingface/olmo-2-1124-13b-instruct
2941
+ model_name: allenai/olmo-2-1124-13b-instruct
2942
+ tokenizer_name: allenai/olmo-2-1124-7b-instruct
2943
+ max_sequence_length: 4096
2944
+ client_spec:
2945
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
2946
+ args:
2947
+ device_map: auto
2948
+
2949
+ - name: huggingface/olmo-2-0325-32b-instruct
2950
+ model_name: allenai/olmo-2-0325-32b-instruct
2951
+ tokenizer_name: allenai/olmo-2-0325-32b-instruct
2952
+ max_sequence_length: 4096
2953
+ client_spec:
2954
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
2955
+ args:
2956
+ device_map: auto
2957
+
2958
+ - name: huggingface/olmoe-1b-7b-0125-instruct
2959
+ model_name: allenai/olmoe-1b-7b-0125-instruct
2960
+ tokenizer_name: allenai/olmoe-1b-7b-0125-instruct
2961
+ max_sequence_length: 4096
2962
+ client_spec:
2963
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
2964
+ args:
2965
+ device_map: auto
2966
+
2967
+ ## MistralAI
2968
+ - name: together/mistral-7b-v0.1
2969
+ model_name: mistralai/mistral-7b-v0.1
2970
+ tokenizer_name: mistralai/Mistral-7B-v0.1
2971
+ max_sequence_length: 4095 # Subtract 1 token to work around a off-by-one bug in Together's input validation token counting (#2080)
2972
+ client_spec:
2973
+ class_name: "helm.clients.together_client.TogetherClient"
2105
2974
  args:
2106
2975
  together_model: mistralai/Mistral-7B-v0.1
2107
2976
 
@@ -2119,12 +2988,22 @@ model_deployments:
2119
2988
  client_spec:
2120
2989
  class_name: "helm.clients.together_client.TogetherChatClient"
2121
2990
 
2991
+ - name: huggingface/mistral-7b-instruct-v0.3
2992
+ model_name: mistralai/mistral-7b-instruct-v0.3-hf
2993
+ tokenizer_name: mistralai/Mistral-7B-Instruct-v0.3
2994
+ max_sequence_length: 32000
2995
+ client_spec:
2996
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
2997
+ args:
2998
+ pretrained_model_name_or_path: mistralai/Mistral-7B-Instruct-v0.3
2999
+
2122
3000
  - name: together/mistral-7b-instruct-v0.3
2123
3001
  model_name: mistralai/mistral-7b-instruct-v0.3
2124
3002
  tokenizer_name: mistralai/Mistral-7B-Instruct-v0.3
2125
3003
  max_sequence_length: 32000
2126
3004
  client_spec:
2127
3005
  class_name: "helm.clients.together_client.TogetherChatClient"
3006
+
2128
3007
 
2129
3008
  - name: together/mixtral-8x7b-32kseqlen
2130
3009
  model_name: mistralai/mixtral-8x7b-32kseqlen
@@ -2370,6 +3249,14 @@ model_deployments:
2370
3249
  client_spec:
2371
3250
  class_name: "helm.clients.palmyra_client.PalmyraChatClient"
2372
3251
 
3252
+ - name: writer/palmyra-x5
3253
+ model_name: writer/palmyra-x5
3254
+ # See tokenizer comment for writer/palmyra-x-004
3255
+ tokenizer_name: meta/llama-3-8b
3256
+ max_sequence_length: 1000000
3257
+ client_spec:
3258
+ class_name: "helm.clients.writer_client.WriterClient"
3259
+
2373
3260
  - name: writer/palmyra-med-32k
2374
3261
  model_name: writer/palmyra-med-32k
2375
3262
  # Palmyra-Med uses the "<|end_of_text|>" as the end of text token, which is used by meta/llama-3-8b,
@@ -2381,12 +3268,10 @@ model_deployments:
2381
3268
 
2382
3269
  - name: writer/palmyra-med
2383
3270
  model_name: writer/palmyra-med
2384
- # Palmyra-Med uses the "<|end_of_text|>" as the end of text token, which is used by meta/llama-3-8b,
2385
- # rather than "<|eot_id|>", which is used by meta/llama-3-8b-instruct
2386
3271
  tokenizer_name: meta/llama-3-8b
2387
- max_sequence_length: 4096
3272
+ max_sequence_length: 32000
2388
3273
  client_spec:
2389
- class_name: "helm.clients.palmyra_client.PalmyraChatClient"
3274
+ class_name: "helm.clients.writer_client.WriterClient"
2390
3275
 
2391
3276
  - name: writer/palmyra-fin-32k
2392
3277
  model_name: writer/palmyra-fin-32k
@@ -2395,6 +3280,34 @@ model_deployments:
2395
3280
  client_spec:
2396
3281
  class_name: "helm.clients.palmyra_client.PalmyraChatClient"
2397
3282
 
3283
+ - name: writer/palmyra-fin
3284
+ model_name: writer/palmyra-fin
3285
+ tokenizer_name: meta/llama-3-8b-instruct
3286
+ max_sequence_length: 128000
3287
+ client_spec:
3288
+ class_name: "helm.clients.palmyra_client.PalmyraChatClient"
3289
+
3290
+
3291
+ # xAI
3292
+
3293
+ - name: xai/grok-3-beta
3294
+ model_name: xai/grok-3-beta
3295
+ tokenizer_name: xai/grok-3-beta
3296
+ max_sequence_length: 131072
3297
+ client_spec:
3298
+ class_name: "helm.clients.grok_client.GrokChatClient"
3299
+ window_service_spec:
3300
+ class_name: "helm.benchmark.window_services.no_decoding_window_service.NoDecodingWindowService"
3301
+
3302
+ - name: xai/grok-3-mini-beta
3303
+ model_name: xai/grok-3-mini-beta
3304
+ tokenizer_name: xai/grok-3-mini-beta
3305
+ max_sequence_length: 131072
3306
+ client_spec:
3307
+ class_name: "helm.clients.grok_client.GrokChatClient"
3308
+ window_service_spec:
3309
+ class_name: "helm.benchmark.window_services.no_decoding_window_service.NoDecodingWindowService"
3310
+
2398
3311
  # Qwen
2399
3312
 
2400
3313
  - name: together/qwen-7b
@@ -2484,6 +3397,112 @@ model_deployments:
2484
3397
  client_spec:
2485
3398
  class_name: "helm.clients.together_client.TogetherChatClient"
2486
3399
 
3400
+ - name: together/qwen2.5-7b-instruct-turbo
3401
+ model_name: qwen/qwen2.5-7b-instruct-turbo
3402
+ tokenizer_name: qwen/qwen2.5-7b-instruct
3403
+ max_sequence_length: 128000
3404
+ client_spec:
3405
+ class_name: "helm.clients.together_client.TogetherChatClient"
3406
+
3407
+ - name: together/qwen2.5-72b-instruct-turbo
3408
+ model_name: qwen/qwen2.5-72b-instruct-turbo
3409
+ tokenizer_name: qwen/qwen2.5-7b-instruct
3410
+ max_sequence_length: 128000
3411
+ client_spec:
3412
+ class_name: "helm.clients.together_client.TogetherChatClient"
3413
+
3414
+ - name: together/qwen3-235b-a22b-fp8-tput
3415
+ model_name: qwen/qwen3-235b-a22b-fp8-tput
3416
+ tokenizer_name: qwen/qwen3-235b-a22b
3417
+ max_sequence_length: 40960
3418
+ client_spec:
3419
+ class_name: "helm.clients.together_client.TogetherChatClient"
3420
+ args:
3421
+ parse_thinking: true
3422
+
3423
+ - name: huggingface/qwen2.5-7b-instruct-4bit
3424
+ model_name: qwen/qwen2.5-7b-instruct
3425
+ tokenizer_name: qwen/qwen2.5-7b-instruct
3426
+ max_sequence_length: 128000
3427
+ client_spec:
3428
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3429
+ args:
3430
+ pretrained_model_name_or_path: Qwen/Qwen2.5-7B-Instruct
3431
+ torch_dtype: "float16"
3432
+ quantization_config:
3433
+ load_in_4bit: true
3434
+ attn_implementation: "flash_attention_2"
3435
+
3436
+ - name: huggingface/qwen2.5-7b-instruct
3437
+ model_name: qwen/qwen2.5-7b-instruct
3438
+ tokenizer_name: qwen/qwen2.5-7b-instruct
3439
+ max_sequence_length: 128000
3440
+ client_spec:
3441
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3442
+ args:
3443
+ pretrained_model_name_or_path: Qwen/Qwen2.5-7B-Instruct
3444
+
3445
+ - name: huggingface/smollm2-135m
3446
+ model_name: huggingface/smollm2-135m
3447
+ tokenizer_name: huggingface/smollm2-135m
3448
+ max_sequence_length: 8192
3449
+ client_spec:
3450
+ class_name: "helm.clients.huggingface_pipeline_client.HuggingFacePipelineClient"
3451
+ args:
3452
+ pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M
3453
+
3454
+ - name: huggingface/smollm2-360m
3455
+ model_name: huggingface/smollm2-360m
3456
+ tokenizer_name: huggingface/smollm2-135m
3457
+ max_sequence_length: 8192
3458
+ client_spec:
3459
+ class_name: "helm.clients.huggingface_pipeline_client.HuggingFacePipelineClient"
3460
+ args:
3461
+ pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-360M
3462
+
3463
+ - name: huggingface/smollm2-1.7b
3464
+ model_name: huggingface/smollm2-1.7b
3465
+ tokenizer_name: huggingface/smollm2-135m
3466
+ max_sequence_length: 8192
3467
+ client_spec:
3468
+ class_name: "helm.clients.huggingface_pipeline_client.HuggingFacePipelineClient"
3469
+ args:
3470
+ pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-1.7B
3471
+
3472
+ - name: huggingface/smollm2-135m-instruct
3473
+ model_name: huggingface/smollm2-135m-instruct
3474
+ tokenizer_name: huggingface/smollm2-135m-instruct
3475
+ max_sequence_length: 8192
3476
+ client_spec:
3477
+ class_name: "helm.clients.huggingface_pipeline_client.HuggingFacePipelineClient"
3478
+ args:
3479
+ pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct
3480
+
3481
+ - name: huggingface/smollm2-360m-instruct
3482
+ model_name: huggingface/smollm2-360m-instruct
3483
+ tokenizer_name: huggingface/smollm2-135m-instruct
3484
+ max_sequence_length: 8192
3485
+ client_spec:
3486
+ class_name: "helm.clients.huggingface_pipeline_client.HuggingFacePipelineClient"
3487
+ args:
3488
+ pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-360M-Instruct
3489
+
3490
+ - name: huggingface/smollm2-1.7b-instruct
3491
+ model_name: huggingface/smollm2-1.7b-instruct
3492
+ tokenizer_name: huggingface/smollm2-135m-instruct
3493
+ max_sequence_length: 8192
3494
+ client_spec:
3495
+ class_name: "helm.clients.huggingface_pipeline_client.HuggingFacePipelineClient"
3496
+ args:
3497
+ pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-1.7B-Instruct
3498
+
3499
+ - name: together/qwq-32b-preview
3500
+ model_name: qwen/qwq-32b-preview
3501
+ tokenizer_name: qwen/qwq-32b-preview
3502
+ max_sequence_length: 32768
3503
+ client_spec:
3504
+ class_name: "helm.clients.together_client.TogetherChatClient"
3505
+
2487
3506
  - name: huggingface/qwen-vl
2488
3507
  model_name: qwen/qwen-vl
2489
3508
  tokenizer_name: qwen/qwen-vl
@@ -2498,6 +3517,69 @@ model_deployments:
2498
3517
  client_spec:
2499
3518
  class_name: "helm.clients.vision_language.qwen_vlm_client.QwenVLMClient"
2500
3519
 
3520
+ - name: huggingface/qwen2-vl-7b-instruct
3521
+ model_name: qwen/qwen2-vl-7b-instruct
3522
+ tokenizer_name: qwen/qwen-vl-chat
3523
+ max_sequence_length: 8191
3524
+ client_spec:
3525
+ class_name: "helm.clients.vision_language.qwen2_vlm_client.Qwen2VLMClient"
3526
+
3527
+ - name: huggingface/qwen2-vl-72b-instruct
3528
+ model_name: qwen/qwen2-vl-72b-instruct
3529
+ tokenizer_name: qwen/qwen-vl-chat
3530
+ max_sequence_length: 8191
3531
+ client_spec:
3532
+ class_name: "helm.clients.vision_language.qwen2_vlm_client.Qwen2VLMClient"
3533
+
3534
+ - name: huggingface/qwen2.5-vl-3b-instruct
3535
+ model_name: qwen/qwen2.5-vl-3b-instruct
3536
+ tokenizer_name: qwen/qwen-vl-chat
3537
+ max_sequence_length: 8191
3538
+ client_spec:
3539
+ class_name: "helm.clients.vision_language.qwen2_vlm_client.Qwen2VLMClient"
3540
+
3541
+ - name: huggingface/qwen2.5-vl-7b-instruct
3542
+ model_name: qwen/qwen2.5-vl-7b-instruct
3543
+ tokenizer_name: qwen/qwen-vl-chat
3544
+ max_sequence_length: 8191
3545
+ client_spec:
3546
+ class_name: "helm.clients.vision_language.qwen2_vlm_client.Qwen2VLMClient"
3547
+
3548
+ - name: huggingface/qwen2.5-vl-32b-instruct
3549
+ model_name: qwen/qwen2.5-vl-32b-instruct
3550
+ tokenizer_name: qwen/qwen-vl-chat
3551
+ max_sequence_length: 8191
3552
+ client_spec:
3553
+ class_name: "helm.clients.vision_language.qwen2_vlm_client.Qwen2VLMClient"
3554
+
3555
+ - name: huggingface/qwen2.5-vl-72b-instruct
3556
+ model_name: qwen/qwen2.5-vl-72b-instruct
3557
+ tokenizer_name: qwen/qwen-vl-chat
3558
+ max_sequence_length: 8191
3559
+ client_spec:
3560
+ class_name: "helm.clients.vision_language.qwen2_vlm_client.Qwen2VLMClient"
3561
+
3562
+ - name: huggingface/qwen-audio-chat
3563
+ model_name: qwen/qwen-audio-chat
3564
+ tokenizer_name: qwen/qwen-audio-chat
3565
+ max_sequence_length: 8191
3566
+ client_spec:
3567
+ class_name: "helm.clients.audio_language.qwen_audiolm_client.QwenAudioLMClient"
3568
+
3569
+ - name: huggingface/qwen2-audio-7b-instruct
3570
+ model_name: qwen/qwen2-audio-7b-instruct
3571
+ tokenizer_name: qwen/qwen2-audio-instruct
3572
+ max_sequence_length: 8191
3573
+ client_spec:
3574
+ class_name: "helm.clients.audio_language.qwen2_audiolm_client.Qwen2AudioLMClient"
3575
+
3576
+ - name: huggingface/qwen2.5-omni-7b
3577
+ model_name: qwen/qwen2.5-omni-7b
3578
+ tokenizer_name: qwen/qwen2.5-omni-7b
3579
+ max_sequence_length: 8191
3580
+ client_spec:
3581
+ class_name: "helm.clients.audio_language.qwen2_5_omni_client.Qwen2_5OmniAudioLMClient"
3582
+
2501
3583
  # Reka
2502
3584
  - name: reka/reka-core
2503
3585
  model_name: reka/reka-core
@@ -2547,3 +3629,685 @@ model_deployments:
2547
3629
  max_sequence_length: 64000
2548
3630
  client_spec:
2549
3631
  class_name: "helm.clients.reka_client.RekaClient"
3632
+
3633
+ # Upstage
3634
+ - name: upstage/solar-pro-241126
3635
+ model_name: upstage/solar-pro-241126
3636
+ tokenizer_name: upstage/solar-pro-preview-instruct
3637
+ max_sequence_length: 32768
3638
+ client_spec:
3639
+ class_name: "helm.clients.upstage_client.UpstageChatClient"
3640
+
3641
+ # Diva Llama
3642
+ - name: huggingface/diva-llama
3643
+ model_name: stanford/diva-llama
3644
+ # TODO: Set the right tokenizer
3645
+ tokenizer_name: meta/llama-3-8b-instruct
3646
+ max_sequence_length: 8192
3647
+ client_spec:
3648
+ class_name: "helm.clients.audio_language.diva_llama_client.DivaLlamaClient"
3649
+
3650
+ # LLaMA-Omni
3651
+ - name: ictnlp/llama-3.1-8b-omni
3652
+ model_name: ictnlp/llama-3.1-8b-omni
3653
+ tokenizer_name: ictnlp/llama-3.1-8b-omni
3654
+ max_sequence_length: 8192
3655
+ client_spec:
3656
+ class_name: "helm.clients.audio_language.llama_omni_client.LlamaOmniAudioLMClient"
3657
+
3658
+ # IBM - Granite 3.0
3659
+ - name: huggingface/granite-3.0-2b-base
3660
+ model_name: ibm-granite/granite-3.0-2b-base
3661
+ tokenizer_name: ibm-granite/granite-3.0-2b-base
3662
+ max_sequence_length: 4096
3663
+ client_spec:
3664
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3665
+ args:
3666
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-2b-base
3667
+
3668
+ - name: huggingface/granite-3.0-2b-instruct
3669
+ model_name: ibm-granite/granite-3.0-2b-instruct
3670
+ tokenizer_name: ibm-granite/granite-3.0-2b-instruct
3671
+ max_sequence_length: 4096
3672
+ client_spec:
3673
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3674
+ args:
3675
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-2b-instruct
3676
+
3677
+ - name: huggingface/granite-3.0-8b-instruct
3678
+ model_name: ibm-granite/granite-3.0-8b-instruct
3679
+ tokenizer_name: ibm-granite/granite-3.0-8b-instruct
3680
+ max_sequence_length: 4096
3681
+ client_spec:
3682
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3683
+ args:
3684
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-8b-instruct
3685
+
3686
+ - name: huggingface/granite-3.0-8b-base
3687
+ model_name: ibm-granite/granite-3.0-8b-base
3688
+ tokenizer_name: ibm-granite/granite-3.0-8b-base
3689
+ max_sequence_length: 4096
3690
+ client_spec:
3691
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3692
+ args:
3693
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-8b-base
3694
+
3695
+ - name: huggingface/granite-3.0-3b-a800m-instruct
3696
+ model_name: ibm-granite/granite-3.0-3b-a800m-instruct
3697
+ tokenizer_name: ibm-granite/granite-3.0-3b-a800m-instruct
3698
+ max_sequence_length: 4096
3699
+ client_spec:
3700
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3701
+ args:
3702
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-3b-a800m-instruct
3703
+
3704
+ - name: huggingface/granite-3.0-3b-a800m-base
3705
+ model_name: ibm-granite/granite-3.0-3b-a800m-base
3706
+ tokenizer_name: ibm-granite/granite-3.0-3b-a800m-base
3707
+ max_sequence_length: 4096
3708
+ client_spec:
3709
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3710
+ args:
3711
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-3b-a800m-base
3712
+
3713
+ - name: huggingface/granite-3.0-1b-a400m-instruct
3714
+ model_name: ibm-granite/granite-3.0-1b-a400m-instruct
3715
+ tokenizer_name: ibm-granite/granite-3.0-1b-a400m-instruct
3716
+ max_sequence_length: 4096
3717
+ client_spec:
3718
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3719
+ args:
3720
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-instruct
3721
+
3722
+ - name: huggingface/granite-3.0-1b-a400m-base
3723
+ model_name: ibm-granite/granite-3.0-1b-a400m-base
3724
+ tokenizer_name: ibm-granite/granite-3.0-1b-a400m-base
3725
+ max_sequence_length: 4096
3726
+ client_spec:
3727
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3728
+ args:
3729
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base
3730
+
3731
+ - name: huggingface/sabia-7b
3732
+ model_name: maritaca-ai/sabia-7b
3733
+ tokenizer_name: maritaca-ai/sabia-7b
3734
+ max_sequence_length: 2048
3735
+ client_spec:
3736
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3737
+ args:
3738
+ pretrained_model_name_or_path: maritaca-ai/sabia-7b
3739
+
3740
+ # Granite-3.1-8b-base
3741
+ - name: huggingface/granite-3.1-8b-base
3742
+ model_name: ibm-granite/granite-3.1-8b-base
3743
+ tokenizer_name: ibm-granite/granite-3.1-8b-base
3744
+ max_sequence_length: 128000
3745
+ client_spec:
3746
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3747
+ args:
3748
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-8b-base
3749
+
3750
+ # Granite-3.1-8b-instruct
3751
+ - name: huggingface/granite-3.1-8b-instruct
3752
+ model_name: ibm-granite/granite-3.1-8b-instruct
3753
+ tokenizer_name: ibm-granite/granite-3.1-8b-instruct
3754
+ max_sequence_length: 128000
3755
+ client_spec:
3756
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3757
+ args:
3758
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-8b-instruct
3759
+
3760
+ # Granite-3.1-2b-instruct
3761
+ - name: huggingface/granite-3.1-2b-instruct
3762
+ model_name: ibm-granite/granite-3.1-2b-instruct
3763
+ tokenizer_name: ibm-granite/granite-3.1-2b-instruct
3764
+ max_sequence_length: 128000
3765
+ client_spec:
3766
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3767
+ args:
3768
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-2b-instruct
3769
+
3770
+ # Granite-3.1-2b-base
3771
+ - name: huggingface/granite-3.1-2b-base
3772
+ model_name: ibm-granite/granite-3.1-2b-base
3773
+ tokenizer_name: ibm-granite/granite-3.1-2b-base
3774
+ max_sequence_length: 128000
3775
+ client_spec:
3776
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3777
+ args:
3778
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-2b-base
3779
+
3780
+ # Granite-3.1-3b-a800m-instruct
3781
+ - name: huggingface/granite-3.1-3b-a800m-instruct
3782
+ model_name: ibm-granite/granite-3.1-3b-a800m-instruct
3783
+ tokenizer_name: ibm-granite/granite-3.1-3b-a800m-instruct
3784
+ max_sequence_length: 128000
3785
+ client_spec:
3786
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3787
+ args:
3788
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-3b-a800m-instruct
3789
+
3790
+ # Granite-3.1-3b-a800m-base
3791
+ - name: huggingface/granite-3.1-3b-a800m-base
3792
+ model_name: ibm-granite/granite-3.1-3b-a800m-base
3793
+ tokenizer_name: ibm-granite/granite-3.1-3b-a800m-base
3794
+ max_sequence_length: 128000
3795
+ client_spec:
3796
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3797
+ args:
3798
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-3b-a800m-base
3799
+
3800
+ # Granite-3.1-1b-a400m-instruct
3801
+ - name: huggingface/granite-3.1-1b-a400m-instruct
3802
+ model_name: ibm-granite/granite-3.1-1b-a400m-instruct
3803
+ tokenizer_name: ibm-granite/granite-3.1-1b-a400m-instruct
3804
+ max_sequence_length: 128000
3805
+ client_spec:
3806
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3807
+ args:
3808
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-1b-a400m-instruct
3809
+
3810
+ # Granite-3.1-1b-a400m-base
3811
+ - name: huggingface/granite-3.1-1b-a400m-base
3812
+ model_name: ibm-granite/granite-3.1-1b-a400m-base
3813
+ tokenizer_name: ibm-granite/granite-3.1-1b-a400m-base
3814
+ max_sequence_length: 128000
3815
+ client_spec:
3816
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3817
+ args:
3818
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-1b-a400m-base
3819
+
3820
+ # DeepSeek-R1-Distill-Llama-8B
3821
+ - name: huggingface/DeepSeek-R1-Distill-Llama-8B
3822
+ model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
3823
+ tokenizer_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
3824
+ max_sequence_length: 128000
3825
+ client_spec:
3826
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3827
+ args:
3828
+ pretrained_model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
3829
+
3830
+ # deepseek-ai/deepseek-coder-6.7b-instruct
3831
+ - name: huggingface/deepseek-coder-6.7b-instruct
3832
+ model_name: deepseek-ai/deepseek-coder-6.7b-instruct
3833
+ tokenizer_name: deepseek-ai/deepseek-coder-6.7b-instruct
3834
+ max_sequence_length: 128000
3835
+ client_spec:
3836
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3837
+ args:
3838
+ pretrained_model_name_or_path: deepseek-ai/deepseek-coder-6.7b-instruct
3839
+
3840
+ # IBM WatsonX
3841
+ - name: ibm/llama-3.3-70b-instruct
3842
+ model_name: meta/llama-3.3-70b-instruct
3843
+ tokenizer_name: meta/llama-3.3-70b-instruct
3844
+ max_sequence_length: 128000
3845
+ client_spec:
3846
+ class_name: "helm.clients.ibm_client.IbmChatClient"
3847
+ args:
3848
+ watsonx_model_name: meta-llama/llama-3-3-70b-instruct
3849
+ region: Dallas
3850
+
3851
+ - name: ibm/granite-3-2b-instruct
3852
+ model_name: ibm/granite-3.1-2b-instruct
3853
+ tokenizer_name: ibm-granite/granite-3.1-2b-instruct
3854
+ max_sequence_length: 128000
3855
+ client_spec:
3856
+ class_name: "helm.clients.ibm_client.IbmChatClient"
3857
+ args:
3858
+ watsonx_model_name: ibm/granite-3-2b-instruct
3859
+ region: Dallas
3860
+
3861
+ - name: ibm/granite-3-8b-instruct
3862
+ model_name: ibm/granite-3.1-8b-instruct
3863
+ tokenizer_name: ibm-granite/granite-3.1-8b-instruct
3864
+ max_sequence_length: 128000
3865
+ client_spec:
3866
+ class_name: "helm.clients.ibm_client.IbmChatClient"
3867
+ args:
3868
+ watsonx_model_name: ibm/granite-3-8b-instruct
3869
+ region: Dallas
3870
+
3871
+ - name: ibm/granite-13b-instruct-v2
3872
+ model_name: ibm/granite-13b-instruct-v2
3873
+ tokenizer_name: EleutherAI/gpt-neox-20b
3874
+ max_sequence_length: 128000
3875
+ client_spec:
3876
+ class_name: "helm.clients.ibm_client.IbmTextClient"
3877
+ args:
3878
+ watsonx_model_name: ibm/granite-13b-instruct-v2
3879
+ region: Dallas
3880
+
3881
+ - name: ibm/granite-20b-code-instruct-8k
3882
+ model_name: ibm/granite-20b-code-instruct-8k
3883
+ tokenizer_name: ibm-granite/granite-20b-code-instruct-8k
3884
+ max_sequence_length: 8192
3885
+ client_spec:
3886
+ class_name: "helm.clients.ibm_client.IbmChatClient"
3887
+ args:
3888
+ watsonx_model_name: ibm/granite-20b-code-instruct
3889
+ region: Dallas
3890
+
3891
+ - name: ibm/granite-34b-code-instruct
3892
+ model_name: ibm/granite-34b-code-instruct
3893
+ tokenizer_name: ibm-granite/granite-34b-code-instruct-8k
3894
+ max_sequence_length: 128000
3895
+ client_spec:
3896
+ class_name: "helm.clients.ibm_client.IbmChatClient"
3897
+ args:
3898
+ watsonx_model_name: ibm/granite-34b-code-instruct
3899
+ region: Dallas
3900
+
3901
+ - name: ibm/granite-3b-code-instruct
3902
+ model_name: ibm/granite-3b-code-instruct
3903
+ tokenizer_name: ibm-granite/granite-3b-code-instruct-128k
3904
+ max_sequence_length: 128000
3905
+ client_spec:
3906
+ class_name: "helm.clients.ibm_client.IbmTextClient"
3907
+ args:
3908
+ watsonx_model_name: ibm/granite-3b-code-instruct
3909
+ region: Dallas
3910
+
3911
+ - name: ibm/granite-8b-code-instruct
3912
+ model_name: ibm/granite-8b-code-instruct
3913
+ tokenizer_name: ibm-granite/granite-8b-code-instruct-128k
3914
+ max_sequence_length: 128000
3915
+ client_spec:
3916
+ class_name: "helm.clients.ibm_client.IbmTextClient"
3917
+ args:
3918
+ watsonx_model_name: ibm/granite-8b-code-instruct
3919
+ region: Dallas
3920
+
3921
+ - name: ibm/mixtral-8x7b-instruct-v0:1
3922
+ model_name: mistralai/mixtral-8x7b-instruct-v0:1
3923
+ tokenizer_name: huggingface/gpt2
3924
+ max_sequence_length: 4000
3925
+ client_spec:
3926
+ class_name: "helm.clients.ibm_client.IbmChatClient"
3927
+ args:
3928
+ watsonx_model_name: mistralai/mixtral-8x7b-instruct-v01
3929
+ region: Dallas
3930
+
3931
+ - name: ibm/granite-3.3-8b-instruct
3932
+ model_name: ibm/granite-3.3-8b-instruct
3933
+ tokenizer_name: ibm/granite-3.3-8b-instruct
3934
+ max_sequence_length: 131072
3935
+ client_spec:
3936
+ class_name: "helm.clients.ibm_client.IbmTextClient"
3937
+ args:
3938
+ watsonx_model_name: ibm/granite-3-3-8b-instruct
3939
+ region: Dallas
3940
+
3941
+ # Vietnamese
3942
+ - name: ura-hcmut/ura-llama-2.1-8b
3943
+ model_name: ura-hcmut/ura-llama-2.1-8b
3944
+ tokenizer_name: meta/llama-3.1-8b-instruct
3945
+ max_sequence_length: 131072
3946
+ client_spec:
3947
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3948
+ args:
3949
+ pretrained_model_name_or_path: ura-hcmut/ura-llama-2.1-8b
3950
+
3951
+ - name: ura-hcmut/ura-llama-2-8b
3952
+ model_name: ura-hcmut/ura-llama-2-8b
3953
+ tokenizer_name: meta/llama-3-8b-instruct
3954
+ max_sequence_length: 8192
3955
+ client_spec:
3956
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3957
+ args:
3958
+ pretrained_model_name_or_path: ura-hcmut/ura-llama-2-8b
3959
+
3960
+ - name: ura-hcmut/ura-llama-7b
3961
+ model_name: ura-hcmut/ura-llama-7b
3962
+ tokenizer_name: meta-llama/Llama-2-7b-hf
3963
+ max_sequence_length: 4096
3964
+ client_spec:
3965
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3966
+ args:
3967
+ pretrained_model_name_or_path: ura-hcmut/ura-llama-7b
3968
+
3969
+ - name: ura-hcmut/ura-llama-13b
3970
+ model_name: ura-hcmut/ura-llama-13b
3971
+ tokenizer_name: meta-llama/Llama-2-7b-hf
3972
+ max_sequence_length: 4096
3973
+ client_spec:
3974
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3975
+ args:
3976
+ pretrained_model_name_or_path: ura-hcmut/ura-llama-13b
3977
+
3978
+ - name: ura-hcmut/ura-llama-70b
3979
+ model_name: ura-hcmut/ura-llama-70b
3980
+ tokenizer_name: meta-llama/Llama-2-7b-hf
3981
+ max_sequence_length: 4096
3982
+ client_spec:
3983
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3984
+ args:
3985
+ pretrained_model_name_or_path: ura-hcmut/ura-llama-70b
3986
+
3987
+ - name: ura-hcmut/GemSUra-7B
3988
+ model_name: ura-hcmut/GemSUra-7B
3989
+ tokenizer_name: google/gemma-2b
3990
+ max_sequence_length: 8192
3991
+ client_spec:
3992
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3993
+ args:
3994
+ pretrained_model_name_or_path: ura-hcmut/GemSUra-7B
3995
+
3996
+ - name: ura-hcmut/GemSUra-2B
3997
+ model_name: ura-hcmut/GemSUra-2B
3998
+ tokenizer_name: google/gemma-2b
3999
+ max_sequence_length: 8192
4000
+ client_spec:
4001
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4002
+ args:
4003
+ pretrained_model_name_or_path: ura-hcmut/GemSUra-2B
4004
+
4005
+ - name: ura-hcmut/MixSUra
4006
+ model_name: ura-hcmut/MixSUra
4007
+ tokenizer_name: mistralai/Mistral-7B-v0.1
4008
+ max_sequence_length: 32768
4009
+ client_spec:
4010
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4011
+ args:
4012
+ pretrained_model_name_or_path: ura-hcmut/MixSUra
4013
+
4014
+ - name: vilm/vinallama-7b-chat
4015
+ model_name: vilm/vinallama-7b-chat
4016
+ tokenizer_name: vilm/vinallama-7b-chat
4017
+ max_sequence_length: 4096
4018
+ client_spec:
4019
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4020
+ args:
4021
+ pretrained_model_name_or_path: vilm/vinallama-7b-chat
4022
+
4023
+ - name: vilm/vinallama-2.7b-chat
4024
+ model_name: vilm/vinallama-2.7b-chat
4025
+ tokenizer_name: vilm/vinallama-2.7b-chat
4026
+ max_sequence_length: 4096
4027
+ client_spec:
4028
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4029
+ args:
4030
+ pretrained_model_name_or_path: vilm/vinallama-2.7b-chat
4031
+
4032
+ - name: vilm/vietcuna-7b-v3
4033
+ model_name: vilm/vietcuna-7b-v3
4034
+ tokenizer_name: vilm/vietcuna-7b-v3
4035
+ max_sequence_length: 2048
4036
+ client_spec:
4037
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4038
+ args:
4039
+ pretrained_model_name_or_path: vilm/vietcuna-7b-v3
4040
+
4041
+ - name: vilm/vietcuna-3b-v2
4042
+ model_name: vilm/vietcuna-3b-v2
4043
+ tokenizer_name: vilm/vietcuna-7b-v3
4044
+ max_sequence_length: 2048
4045
+ client_spec:
4046
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4047
+ args:
4048
+ pretrained_model_name_or_path: vilm/vietcuna-3b-v2
4049
+
4050
+ - name: vilm/Quyen-v0.1
4051
+ model_name: vilm/Quyen-v0.1
4052
+ tokenizer_name: qwen/qwen2-72b-instruct
4053
+ max_sequence_length: 32768
4054
+ client_spec:
4055
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4056
+ args:
4057
+ pretrained_model_name_or_path: vilm/Quyen-v0.1
4058
+
4059
+ - name: vilm/Quyen-Plus-v0.1
4060
+ model_name: vilm/Quyen-Plus-v0.1
4061
+ tokenizer_name: qwen/qwen2-72b-instruct
4062
+ max_sequence_length: 32768
4063
+ client_spec:
4064
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4065
+ args:
4066
+ pretrained_model_name_or_path: vilm/Quyen-Plus-v0.1
4067
+
4068
+ - name: vilm/Quyen-Pro-v0.1
4069
+ model_name: vilm/Quyen-Pro-v0.1
4070
+ tokenizer_name: qwen/qwen2-72b-instruct
4071
+ max_sequence_length: 32768
4072
+ client_spec:
4073
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4074
+ args:
4075
+ pretrained_model_name_or_path: vilm/Quyen-Pro-v0.1
4076
+
4077
+ - name: vilm/Quyen-Pro-Max-v0.1
4078
+ model_name: vilm/Quyen-Pro-Max-v0.1
4079
+ tokenizer_name: qwen/qwen2-72b-instruct
4080
+ max_sequence_length: 32768
4081
+ client_spec:
4082
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4083
+ args:
4084
+ pretrained_model_name_or_path: vilm/Quyen-Pro-Max-v0.1
4085
+
4086
+ - name: vilm/Quyen-Mini-v0.1
4087
+ model_name: vilm/Quyen-Mini-v0.1
4088
+ tokenizer_name: qwen/qwen2-72b-instruct
4089
+ max_sequence_length: 32768
4090
+ client_spec:
4091
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4092
+ args:
4093
+ pretrained_model_name_or_path: vilm/Quyen-Mini-v0.1
4094
+
4095
+ - name: vilm/Quyen-SE-v0.1
4096
+ model_name: vilm/Quyen-SE-v0.1
4097
+ tokenizer_name: qwen/qwen2-72b-instruct
4098
+ max_sequence_length: 32768
4099
+ client_spec:
4100
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4101
+ args:
4102
+ pretrained_model_name_or_path: vilm/Quyen-SE-v0.1
4103
+
4104
+ - name: Viet-Mistral/Vistral-7B-Chat
4105
+ model_name: Viet-Mistral/Vistral-7B-Chat
4106
+ tokenizer_name: Viet-Mistral/Vistral-7B-Chat
4107
+ max_sequence_length: 32768
4108
+ client_spec:
4109
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4110
+ args:
4111
+ pretrained_model_name_or_path: Viet-Mistral/Vistral-7B-Chat
4112
+
4113
+ - name: vinai/PhoGPT-7B5-Instruct
4114
+ model_name: vinai/PhoGPT-7B5-Instruct
4115
+ tokenizer_name: vinai/PhoGPT-7B5-Instruct
4116
+ max_sequence_length: 2048
4117
+ client_spec:
4118
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4119
+ args:
4120
+ pretrained_model_name_or_path: vinai/PhoGPT-7B5-Instruct
4121
+
4122
+ - name: vinai/PhoGPT-4B-Chat
4123
+ model_name: vinai/PhoGPT-4B-Chat
4124
+ tokenizer_name: vinai/PhoGPT-4B-Chat
4125
+ max_sequence_length: 8192
4126
+ client_spec:
4127
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4128
+ args:
4129
+ pretrained_model_name_or_path: vinai/PhoGPT-4B-Chat
4130
+
4131
+ # Stanford Health Care
4132
+ # Placed later in the file to make them non-default
4133
+ - name: stanfordhealthcare/claude-3-5-sonnet-20241022
4134
+ model_name: anthropic/claude-3-5-sonnet-20241022
4135
+ tokenizer_name: anthropic/claude
4136
+ max_sequence_length: 200000
4137
+ client_spec:
4138
+ class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
4139
+ args:
4140
+ model: anthropic.claude-3-5-sonnet-20241022-v2:0
4141
+ deployment: Claude35Sonnetv2/awssig4fa
4142
+
4143
+ - name: stanfordhealthcare/claude-3-7-sonnet-20250219
4144
+ model_name: anthropic/claude-3-7-sonnet-20250219
4145
+ tokenizer_name: anthropic/claude
4146
+ max_sequence_length: 200000
4147
+ client_spec:
4148
+ class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
4149
+ args:
4150
+ model: arn:aws:bedrock:us-west-2:679683451337:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0
4151
+ deployment: awssig4claude37/aswsig4claude37
4152
+
4153
+ - name: stanfordhealthcare/gemini-1.5-pro-001
4154
+ model_name: google/gemini-1.5-pro-001
4155
+ tokenizer_name: google/gemma-2b
4156
+ max_sequence_length: 1000000
4157
+ client_spec:
4158
+ class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
4159
+ args:
4160
+ deployment: gcpgemini/apim-gcp-oauth-fa
4161
+
4162
+ - name: stanfordhealthcare/gemini-2.0-flash-001
4163
+ model_name: google/gemini-2.0-flash-001
4164
+ tokenizer_name: google/gemma-2b
4165
+ max_sequence_length: 1000000
4166
+ client_spec:
4167
+ class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
4168
+ args:
4169
+ deployment: gcp-gem20flash-fa/apim-gcp-gem20flash-fa
4170
+
4171
+ - name: stanfordhealthcare/gpt-4o-mini-2024-07-18
4172
+ model_name: openai/gpt-4o-mini-2024-07-18
4173
+ tokenizer_name: openai/o200k_base
4174
+ max_sequence_length: 128000
4175
+ client_spec:
4176
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
4177
+ args:
4178
+ openai_model_name: gpt-4o-mini
4179
+ api_version: "2023-05-15"
4180
+
4181
+ - name: stanfordhealthcare/gpt-4o-2024-05-13
4182
+ model_name: openai/gpt-4o-2024-05-13
4183
+ tokenizer_name: openai/o200k_base
4184
+ max_sequence_length: 128000
4185
+ client_spec:
4186
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
4187
+ args:
4188
+ openai_model_name: gpt-4o
4189
+ api_version: "2023-05-15"
4190
+
4191
+ - name: stanfordhealthcare/gpt-4-0613
4192
+ model_name: openai/gpt-4-0613
4193
+ tokenizer_name: openai/cl100k_base
4194
+ max_sequence_length: 8192
4195
+ client_spec:
4196
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
4197
+ args:
4198
+ openai_model_name: gpt-4
4199
+ api_version: "2023-05-15"
4200
+
4201
+ - name: stanfordhealthcare/gpt-4-turbo-2024-04-09
4202
+ model_name: openai/gpt-4-turbo-2024-04-09
4203
+ tokenizer_name: openai/cl100k_base
4204
+ max_sequence_length: 128000
4205
+ client_spec:
4206
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
4207
+ args:
4208
+ openai_model_name: gpt-4-turbo
4209
+ api_version: 2023-05-15
4210
+
4211
+ - name: stanfordhealthcare/gpt-4.1-2025-04-14
4212
+ model_name: openai/gpt-4.1-2025-04-14
4213
+ tokenizer_name: openai/o200k_base
4214
+ max_sequence_length: 1047576
4215
+ client_spec:
4216
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
4217
+ args:
4218
+ openai_model_name: gpt-4.1
4219
+ api_version: 2025-01-01-preview
4220
+ base_url: "{endpoint}/openai-eastus2"
4221
+
4222
+ - name: stanfordhealthcare/o3-mini-2025-01-31
4223
+ model_name: openai/o3-mini-2025-01-31
4224
+ tokenizer_name: openai/cl100k_base
4225
+ max_sequence_length: 200000
4226
+ client_spec:
4227
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
4228
+ args:
4229
+ openai_model_name: o3-mini
4230
+ api_version: 2024-12-01-preview
4231
+ base_url: "{endpoint}/openai-eastus2"
4232
+
4233
+ - name: stanfordhealthcare/o1-2024-12-17
4234
+ model_name: openai/o1-2024-12-17
4235
+ tokenizer_name: openai/cl100k_base
4236
+ max_sequence_length: 128000
4237
+ client_spec:
4238
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
4239
+ args:
4240
+ openai_model_name: o1
4241
+ api_version: 2024-12-01-preview
4242
+ base_url: "{endpoint}/openai-eastus2"
4243
+
4244
+ - name: stanfordhealthcare/deepseek-r1
4245
+ model_name: deepseek-ai/deepseek-r1
4246
+ tokenizer_name: deepseek-ai/deepseek-r1
4247
+ max_sequence_length: 128000
4248
+ client_spec:
4249
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
4250
+ args:
4251
+ openai_model_name: deepseek-chat
4252
+ output_processor: helm.benchmark.metrics.output_processors.remove_deepseek_r1_thinking
4253
+ base_url: "{endpoint}/deepseekr1/v1"
4254
+
4255
+ - name: stanfordhealthcare/llama-3.3-70b-instruct
4256
+ model_name: meta/llama-3.3-70b-instruct
4257
+ tokenizer_name: meta/llama-3.3-70b-instruct
4258
+ max_sequence_length: 128000
4259
+ client_spec:
4260
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
4261
+ args:
4262
+ base_url: "{endpoint}/llama3370b/v1"
4263
+
4264
+ - name: stanfordhealthcare/llama-4-scout-17b-16e-instruct
4265
+ model_name: meta/llama-4-scout-17b-16e-instruct
4266
+ tokenizer_name: meta/llama-4-scout-17b-16e-instruct
4267
+ max_sequence_length: 327680
4268
+ client_spec:
4269
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
4270
+ args:
4271
+ base_url: "{endpoint}/llama4-scout/v1"
4272
+
4273
+ - name: stanfordhealthcare/llama-4-maverick-17b-128e-instruct-fp8
4274
+ model_name: meta/llama-4-maverick-17b-128e-instruct-fp8
4275
+ tokenizer_name: meta/llama-4-scout-17b-16e-instruct
4276
+ max_sequence_length: 524288
4277
+ client_spec:
4278
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
4279
+ args:
4280
+ base_url: "{endpoint}/llama4-maverick/v1"
4281
+
4282
+ - name: stanfordhealthcare/phi-3.5-mini-instruct
4283
+ model_name: microsoft/phi-3.5-mini-instruct
4284
+ tokenizer_name: microsoft/phi-3.5-mini-instruct
4285
+ max_sequence_length: 131072
4286
+ client_spec:
4287
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
4288
+ args:
4289
+ base_url: "{endpoint}/phi35mi/v1"
4290
+
4291
+ - name: stanfordhealthcare_shc/gpt-4o-2024-05-13
4292
+ model_name: openai/gpt-4o-2024-05-13
4293
+ tokenizer_name: openai/o200k_base
4294
+ max_sequence_length: 128000
4295
+ client_spec:
4296
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
4297
+ deployment: gpt-4o
4298
+
4299
+ - name: stanfordhealthcare_shc/gpt-4o-mini-2024-07-18
4300
+ model_name: openai/gpt-4o-mini-2024-07-18
4301
+ tokenizer_name: openai/o200k_base
4302
+ max_sequence_length: 128000
4303
+ client_spec:
4304
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
4305
+ deployment: gpt-4o-mini
4306
+
4307
+ - name: stanfordhealthcare_shc/gpt-4-turbo-2024-04-09
4308
+ model_name: openai/gpt-4-turbo-2024-04-09
4309
+ tokenizer_name: openai/cl100k_base
4310
+ max_sequence_length: 128000
4311
+ client_spec:
4312
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
4313
+ deployment: gpt-4-turbo-2024-04-09