crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (206) hide show
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  13. helm/benchmark/annotation/model_as_judge.py +12 -16
  14. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  15. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  16. helm/benchmark/executor.py +11 -12
  17. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  18. helm/benchmark/metrics/bias_word_lists.py +1 -1
  19. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  20. helm/benchmark/metrics/classification_metrics.py +3 -3
  21. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  22. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  23. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  24. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  25. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  26. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  27. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  28. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  29. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  30. helm/benchmark/metrics/medalign_metrics.py +9 -29
  31. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  32. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  33. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  34. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  35. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  36. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  37. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  38. helm/benchmark/metrics/metric_service.py +11 -11
  39. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  40. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  41. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  42. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  43. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  44. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  45. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  46. helm/benchmark/metrics/summac/model_summac.py +1 -2
  47. helm/benchmark/metrics/summarization_metrics.py +2 -1
  48. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  49. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  50. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  51. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  52. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  53. helm/benchmark/model_deployment_registry.py +6 -8
  54. helm/benchmark/presentation/contamination.py +3 -3
  55. helm/benchmark/presentation/create_plots.py +33 -12
  56. helm/benchmark/presentation/run_display.py +13 -0
  57. helm/benchmark/presentation/schema.py +2 -1
  58. helm/benchmark/presentation/summarize.py +76 -59
  59. helm/benchmark/reeval_run.py +3 -4
  60. helm/benchmark/reeval_runner.py +3 -3
  61. helm/benchmark/run.py +78 -73
  62. helm/benchmark/run_expander.py +12 -1
  63. helm/benchmark/run_spec_factory.py +7 -6
  64. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  65. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  66. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  67. helm/benchmark/run_specs/long_context_run_specs.py +67 -15
  68. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  69. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  70. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  71. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  72. helm/benchmark/runner.py +5 -5
  73. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  74. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  75. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  76. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  77. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  78. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  79. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  80. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  81. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  82. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  83. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  84. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  85. helm/benchmark/scenarios/clear_scenario.py +11 -7
  86. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  87. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  88. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  89. helm/benchmark/scenarios/grammar.py +2 -2
  90. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  91. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  92. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  93. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  94. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  95. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  96. helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
  97. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  98. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  99. helm/benchmark/scenarios/medec_scenario.py +6 -1
  100. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  101. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  102. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  103. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  104. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  105. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  106. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  107. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  108. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  109. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  110. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  111. helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
  112. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  113. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  114. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  115. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  116. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  117. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  118. helm/benchmark/scenarios/numeracy_scenario.py +2 -1
  119. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  120. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  121. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  122. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  123. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  124. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  125. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  126. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  127. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  128. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  129. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  130. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  131. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  132. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  133. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  134. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  135. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  136. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  137. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  138. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  139. helm/benchmark/server.py +2 -1
  140. helm/benchmark/static/schema_audio.yaml +60 -49
  141. helm/benchmark/static/schema_enterprise.yaml +21 -0
  142. helm/benchmark/static/schema_long_context.yaml +63 -20
  143. helm/benchmark/static/schema_medhelm.yaml +272 -213
  144. helm/benchmark/static/schema_melt.yaml +1257 -0
  145. helm/benchmark/static/schema_slphelm.yaml +162 -0
  146. helm/benchmark/static/schema_vhelm.yaml +26 -26
  147. helm/benchmark/static/schema_video.yaml +219 -0
  148. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  149. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  150. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  151. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  152. helm/benchmark/static_build/index.html +4 -4
  153. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  154. helm/benchmark/window_services/test_utils.py +3 -4
  155. helm/benchmark/window_services/tokenizer_service.py +7 -8
  156. helm/clients/anthropic_client.py +69 -29
  157. helm/clients/audio_language/diva_llama_client.py +4 -2
  158. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  159. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  160. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  161. helm/clients/audio_language/test.py +62 -0
  162. helm/clients/bedrock_client.py +3 -1
  163. helm/clients/client.py +7 -7
  164. helm/clients/grok_client.py +36 -0
  165. helm/clients/huggingface_client.py +42 -3
  166. helm/clients/huggingface_pipeline_client.py +138 -0
  167. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  168. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  169. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  170. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  171. helm/clients/openai_client.py +100 -54
  172. helm/clients/openai_responses_client.py +174 -0
  173. helm/clients/palmyra_client.py +2 -5
  174. helm/clients/reka_client.py +2 -2
  175. helm/clients/together_client.py +31 -4
  176. helm/clients/vertexai_client.py +6 -0
  177. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  178. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  179. helm/clients/vision_language/idefics_client.py +6 -2
  180. helm/clients/vision_language/paligemma_client.py +2 -2
  181. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  182. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  183. helm/clients/writer_client.py +102 -0
  184. helm/common/context.py +80 -0
  185. helm/common/credentials_utils.py +5 -5
  186. helm/common/general.py +9 -2
  187. helm/common/hierarchical_logger.py +46 -3
  188. helm/common/local_context.py +140 -0
  189. helm/common/remote_context.py +61 -0
  190. helm/common/request.py +8 -0
  191. helm/config/model_deployments.yaml +864 -193
  192. helm/config/model_metadata.yaml +667 -53
  193. helm/config/tokenizer_configs.yaml +144 -3
  194. helm/proxy/cli.py +3 -1
  195. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  196. helm/proxy/services/server_service.py +21 -85
  197. helm/tokenizers/grok_tokenizer.py +53 -0
  198. helm/tokenizers/huggingface_tokenizer.py +1 -1
  199. helm/tokenizers/test_grok_tokenizer.py +33 -0
  200. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  201. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  202. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  203. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  204. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
  205. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
  206. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
@@ -16,161 +16,6 @@ model_deployments:
16
16
  client_spec:
17
17
  class_name: "helm.clients.simple_client.SimpleClient"
18
18
 
19
- # Stanford Health Care
20
- # Placed earlier in the file to make them non-default
21
- - name: stanfordhealthcare/claude-3-5-sonnet-20241022
22
- model_name: anthropic/claude-3-5-sonnet-20241022
23
- tokenizer_name: anthropic/claude
24
- max_sequence_length: 200000
25
- client_spec:
26
- class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
27
- args:
28
- model: anthropic.claude-3-5-sonnet-20241022-v2:0
29
- deployment: Claude35Sonnetv2/awssig4fa
30
-
31
- - name: stanfordhealthcare/claude-3-7-sonnet-20250219
32
- model_name: anthropic/claude-3-7-sonnet-20250219
33
- tokenizer_name: anthropic/claude
34
- max_sequence_length: 200000
35
- client_spec:
36
- class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
37
- args:
38
- model: arn:aws:bedrock:us-west-2:679683451337:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0
39
- deployment: awssig4claude37/aswsig4claude37
40
-
41
- - name: stanfordhealthcare/gemini-1.5-pro-001
42
- model_name: google/gemini-1.5-pro-001
43
- tokenizer_name: google/gemma-2b
44
- max_sequence_length: 1000000
45
- client_spec:
46
- class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
47
- args:
48
- deployment: gcpgemini/apim-gcp-oauth-fa
49
-
50
- - name: stanfordhealthcare/gemini-2.0-flash-001
51
- model_name: google/gemini-2.0-flash-001
52
- tokenizer_name: google/gemma-2b
53
- max_sequence_length: 1000000
54
- client_spec:
55
- class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
56
- args:
57
- deployment: gcp-gem20flash-fa/apim-gcp-gem20flash-fa
58
-
59
- - name: stanfordhealthcare/gpt-4o-mini-2024-07-18
60
- model_name: openai/gpt-4o-mini-2024-07-18
61
- tokenizer_name: openai/o200k_base
62
- max_sequence_length: 128000
63
- client_spec:
64
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
65
- args:
66
- openai_model_name: gpt-4o-mini
67
- api_version: 2023-05-15
68
-
69
- - name: stanfordhealthcare/gpt-4o-2024-05-13
70
- model_name: openai/gpt-4o-2024-05-13
71
- tokenizer_name: openai/o200k_base
72
- max_sequence_length: 128000
73
- client_spec:
74
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
75
- args:
76
- openai_model_name: gpt-4o
77
- api_version: 2023-05-15
78
-
79
- - name: stanfordhealthcare/gpt-4-0613
80
- model_name: openai/gpt-4-0613
81
- tokenizer_name: openai/o200k_base
82
- max_sequence_length: 8192
83
- client_spec:
84
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
85
- args:
86
- openai_model_name: gpt-4
87
- api_version: 2023-05-15
88
-
89
- - name: stanfordhealthcare/gpt-4-turbo-2024-04-09
90
- model_name: openai/gpt-4-turbo-2024-04-09
91
- tokenizer_name: openai/cl100k_base
92
- max_sequence_length: 128000
93
- client_spec:
94
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
95
- args:
96
- openai_model_name: gpt-4-turbo
97
- api_version: 2023-05-15
98
-
99
- - name: stanfordhealthcare/o3-mini-2025-01-31
100
- model_name: openai/o3-mini-2025-01-31
101
- tokenizer_name: openai/cl100k_base
102
- max_sequence_length: 200000
103
- client_spec:
104
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
105
- args:
106
- openai_model_name: o3-mini
107
- api_version: 2024-12-01-preview
108
- base_url: "{endpoint}/openai-eastus2"
109
-
110
- - name: stanfordhealthcare/o1-2024-12-17
111
- model_name: openai/o1-2024-12-17
112
- tokenizer_name: openai/cl100k_base
113
- max_sequence_length: 128000
114
- client_spec:
115
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
116
- args:
117
- openai_model_name: o1
118
- api_version: 2024-12-01-preview
119
- base_url: "{endpoint}/openai-eastus2"
120
-
121
- - name: stanfordhealthcare/deepseek-r1
122
- model_name: deepseek-ai/deepseek-r1
123
- tokenizer_name: deepseek-ai/deepseek-r1
124
- max_sequence_length: 128000
125
- client_spec:
126
- class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
127
- args:
128
- openai_model_name: deepseek-chat
129
- output_processor: helm.benchmark.metrics.output_processors.remove_deepseek_r1_thinking
130
- base_url: "{endpoint}/deepseekr1/v1"
131
-
132
- - name: stanfordhealthcare/llama-3.3-70b-instruct
133
- model_name: meta/llama-3.3-70b-instruct
134
- tokenizer_name: meta/llama-3.3-70b-instruct
135
- max_sequence_length: 128000
136
- client_spec:
137
- class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
138
- args:
139
- base_url: "{endpoint}/llama3370b/v1"
140
-
141
- - name: stanfordhealthcare/phi-3.5-mini-instruct
142
- model_name: microsoft/phi-3.5-mini-instruct
143
- tokenizer_name: microsoft/phi-3.5-mini-instruct
144
- max_sequence_length: 131072
145
- client_spec:
146
- class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
147
- args:
148
- base_url: "{endpoint}/phi35mi/v1"
149
-
150
- - name: stanfordhealthcare_shc/gpt-4o-2024-05-13
151
- model_name: openai/gpt-4o-2024-05-13
152
- tokenizer_name: openai/o200k_base
153
- max_sequence_length: 128000
154
- client_spec:
155
- class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
156
- deployment: gpt-4o
157
-
158
- - name: stanfordhealthcare_shc/gpt-4o-mini-2024-07-18
159
- model_name: openai/gpt-4o-mini-2024-07-18
160
- tokenizer_name: openai/o200k_base
161
- max_sequence_length: 128000
162
- client_spec:
163
- class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
164
- deployment: gpt-4o-mini
165
-
166
- - name: stanfordhealthcare_shc/gpt-4-turbo-2024-04-09
167
- model_name: openai/gpt-4-turbo-2024-04-09
168
- tokenizer_name: openai/cl100k_base
169
- max_sequence_length: 128000
170
- client_spec:
171
- class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
172
- deployment: gpt-4-turbo-2024-04-09
173
-
174
19
  # Adobe
175
20
  - name: adobe/giga-gan
176
21
  model_name: adobe/giga-gan
@@ -260,6 +105,14 @@ model_deployments:
260
105
 
261
106
 
262
107
  # Amazon nova models
108
+ - name: amazon/nova-premier-v1:0
109
+ model_name: amazon/nova-premier-v1:0
110
+ tokenizer_name: huggingface/gpt2
111
+ max_sequence_length: 1000000
112
+ client_spec:
113
+ class_name: "helm.clients.bedrock_client.BedrockNovaClient"
114
+ args:
115
+ bedrock_model_id: us.amazon.nova-premier-v1:0
263
116
 
264
117
  - name: amazon/nova-pro-v1:0
265
118
  model_name: amazon/nova-pro-v1:0
@@ -463,6 +316,53 @@ model_deployments:
463
316
  client_spec:
464
317
  class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
465
318
 
319
+ - name: anthropic/claude-3-7-sonnet-20250219-thinking-10k
320
+ model_name: anthropic/claude-3-7-sonnet-20250219-thinking-10k
321
+ tokenizer_name: anthropic/claude
322
+ max_sequence_length: 200000
323
+ client_spec:
324
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
325
+ args:
326
+ anthropic_model_name: claude-3-7-sonnet-20250219
327
+ thinking_budget_tokens: 10000
328
+ stream: true
329
+
330
+ - name: anthropic/claude-sonnet-4-20250514
331
+ model_name: anthropic/claude-sonnet-4-20250514
332
+ tokenizer_name: anthropic/claude
333
+ max_sequence_length: 200000
334
+ client_spec:
335
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
336
+
337
+ - name: anthropic/claude-sonnet-4-20250514-thinking-10k
338
+ model_name: anthropic/claude-sonnet-4-20250514-thinking-10k
339
+ tokenizer_name: anthropic/claude
340
+ max_sequence_length: 200000
341
+ client_spec:
342
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
343
+ args:
344
+ anthropic_model_name: claude-sonnet-4-20250514
345
+ thinking_budget_tokens: 10000
346
+ stream: true
347
+
348
+ - name: anthropic/claude-opus-4-20250514
349
+ model_name: anthropic/claude-opus-4-20250514
350
+ tokenizer_name: anthropic/claude
351
+ max_sequence_length: 200000
352
+ client_spec:
353
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
354
+
355
+ - name: anthropic/claude-opus-4-20250514-thinking-10k
356
+ model_name: anthropic/claude-opus-4-20250514-thinking-10k
357
+ tokenizer_name: anthropic/claude
358
+ max_sequence_length: 200000
359
+ client_spec:
360
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
361
+ args:
362
+ anthropic_model_name: claude-opus-4-20250514
363
+ thinking_budget_tokens: 10000
364
+ stream: true
365
+
466
366
  - name: anthropic/stanford-online-all-v4-s3
467
367
  deprecated: true # Closed model, not accessible via API
468
368
  model_name: anthropic/stanford-online-all-v4-s3
@@ -583,25 +483,16 @@ model_deployments:
583
483
  args:
584
484
  disable_logprobs: True
585
485
 
586
- - name: together/deepseek-r1
587
- model_name: deepseek-ai/deepseek-r1
588
- tokenizer_name: deepseek-ai/deepseek-r1
589
- max_sequence_length: 32768
590
- client_spec:
591
- class_name: "helm.clients.together_client.TogetherChatClient"
592
- args:
593
- disable_logprobs: True
594
-
595
- - name: together/deepseek-r1-hide-reasoning
596
- model_name: deepseek-ai/deepseek-r1-hide-reasoning
486
+ - name: together/deepseek-r1-0528
487
+ model_name: deepseek-ai/deepseek-r1-0528
597
488
  tokenizer_name: deepseek-ai/deepseek-r1
598
489
  max_sequence_length: 32768
599
490
  client_spec:
600
491
  class_name: "helm.clients.together_client.TogetherChatClient"
601
492
  args:
602
493
  together_model: deepseek-ai/deepseek-r1
494
+ parse_thinking: true
603
495
  disable_logprobs: True
604
- output_processor: helm.benchmark.metrics.output_processors.remove_deepseek_r1_thinking
605
496
 
606
497
  # Gooseai
607
498
 
@@ -802,6 +693,14 @@ model_deployments:
802
693
  client_spec:
803
694
  class_name: "helm.clients.vertexai_client.VertexAIChatClient"
804
695
 
696
+ - name: google/gemini-2.0-flash-lite-001
697
+ model_name: google/gemini-2.0-flash-lite-001
698
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
699
+ max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
700
+ # TODO: Max output tokens: 8192
701
+ client_spec:
702
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
703
+
805
704
  - name: google/gemini-2.0-flash-thinking-exp-01-21
806
705
  model_name: google/gemini-2.0-flash-thinking-exp-01-21
807
706
  tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
@@ -809,6 +708,75 @@ model_deployments:
809
708
  client_spec:
810
709
  class_name: "helm.clients.vertexai_client.VertexAIChatClient"
811
710
 
711
+ - name: google/gemini-2.5-flash-lite-preview-06-17
712
+ model_name: google/gemini-2.5-flash-lite-preview-06-17
713
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
714
+ max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash
715
+ # TODO: Max output tokens: 65536
716
+ client_spec:
717
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
718
+ args:
719
+ # Only the global location is supported. See:
720
+ # - https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash-lite
721
+ # - https://cloud.google.com/vertex-ai/generative-ai/docs/learn/locations#global-endpoint
722
+ location: global
723
+
724
+ - name: google/gemini-2.5-flash-preview-04-17
725
+ model_name: google/gemini-2.5-flash-preview-04-17
726
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
727
+ max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash
728
+ # TODO: Max output tokens: 65536
729
+ client_spec:
730
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
731
+
732
+ - name: google/gemini-2.5-flash-preview-05-20
733
+ model_name: google/gemini-2.5-flash-preview-05-20
734
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
735
+ max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash
736
+ # TODO: Max output tokens: 65536
737
+ client_spec:
738
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
739
+
740
+ - name: google/gemini-2.5-flash
741
+ model_name: google/gemini-2.5-flash
742
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
743
+ max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash
744
+ # TODO: Max output tokens: 65536
745
+ client_spec:
746
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
747
+
748
+ - name: google/gemini-2.5-pro-exp-03-25
749
+ model_name: google/gemini-2.5-pro-exp-03-25
750
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
751
+ max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-pro
752
+ # TODO: Max output tokens: 65536
753
+ client_spec:
754
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
755
+
756
+ - name: google/gemini-2.5-pro-preview-03-25
757
+ model_name: google/gemini-2.5-pro-preview-03-25
758
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
759
+ max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-pro
760
+ # TODO: Max output tokens: 65536
761
+ client_spec:
762
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
763
+
764
+ - name: google/gemini-2.5-pro-preview-05-06
765
+ model_name: google/gemini-2.5-pro-preview-05-06
766
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
767
+ max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-pro
768
+ # TODO: Max output tokens: 65536
769
+ client_spec:
770
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
771
+
772
+ - name: google/gemini-2.5-pro
773
+ model_name: google/gemini-2.5-pro
774
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
775
+ max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-pro
776
+ # TODO: Max output tokens: 65536
777
+ client_spec:
778
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
779
+
812
780
  - name: google/gemini-1.5-flash-8b-001
813
781
  model_name: google/gemini-1.5-flash-8b-001
814
782
  tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
@@ -1364,6 +1332,8 @@ model_deployments:
1364
1332
  max_sequence_length: 4096
1365
1333
  client_spec:
1366
1334
  class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1335
+ args:
1336
+ apply_chat_template: false
1367
1337
 
1368
1338
  ## KAIST AI
1369
1339
  - name: huggingface/prometheus-vision-13b-v1.0-hf
@@ -1385,6 +1355,23 @@ model_deployments:
1385
1355
  tokenizer_name: "anas-awadalla-2/mpt-7b"
1386
1356
  cross_attn_every_n_layers: 4
1387
1357
 
1358
+ ## Marin Community
1359
+ - name: huggingface/marin-8b-instruct
1360
+ model_name: marin-community/marin-8b-instruct
1361
+ tokenizer_name: marin-community/marin-8b-instruct
1362
+ max_sequence_length: 4096
1363
+ client_spec:
1364
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1365
+ args:
1366
+ device_map: auto
1367
+
1368
+ - name: together/marin-8b-instruct
1369
+ model_name: marin-community/marin-8b-instruct
1370
+ tokenizer_name: marin-community/marin-8b-instruct
1371
+ max_sequence_length: 4096
1372
+ client_spec:
1373
+ class_name: "helm.clients.together_client.TogetherClient"
1374
+
1388
1375
  ## Microsoft
1389
1376
  - name: together/phi-2
1390
1377
  model_name: microsoft/phi-2
@@ -1530,6 +1517,8 @@ model_deployments:
1530
1517
  max_sequence_length: 32768
1531
1518
  client_spec:
1532
1519
  class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1520
+ args:
1521
+ apply_chat_template: false
1533
1522
 
1534
1523
  - name: huggingface/sailor-7b-chat
1535
1524
  model_name: sail/sailor-7b-chat
@@ -1546,6 +1535,7 @@ model_deployments:
1546
1535
  class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1547
1536
  args:
1548
1537
  device_map: auto
1538
+ apply_chat_template: false
1549
1539
 
1550
1540
  - name: huggingface/sailor-14b-chat
1551
1541
  model_name: sail/sailor-14b-chat
@@ -1980,6 +1970,13 @@ model_deployments:
1980
1970
  client_spec:
1981
1971
  class_name: "helm.clients.mistral_client.MistralAIClient"
1982
1972
 
1973
+ - name: mistralai/mistral-medium-2505
1974
+ model_name: mistralai/mistral-medium-2505
1975
+ tokenizer_name: mistralai/Mistral-7B-v0.1
1976
+ max_sequence_length: 128000
1977
+ client_spec:
1978
+ class_name: "helm.clients.mistral_client.MistralAIClient"
1979
+
1983
1980
  - name: mistralai/mistral-large-2402
1984
1981
  model_name: mistralai/mistral-large-2402
1985
1982
  tokenizer_name: mistralai/Mistral-7B-v0.1
@@ -2223,12 +2220,47 @@ model_deployments:
2223
2220
  client_spec:
2224
2221
  class_name: "helm.clients.openai_client.OpenAIClient"
2225
2222
 
2226
- - name: openai/whisper-1_gpt-4o-2024-11-20
2227
- model_name: openai/whisper-1_gpt-4o-2024-11-20
2223
+ - name: openai/gpt-4.1-2025-04-14
2224
+ model_name: openai/gpt-4.1-2025-04-14
2228
2225
  tokenizer_name: openai/o200k_base
2229
- max_sequence_length: 128000
2226
+ max_sequence_length: 1047576
2230
2227
  client_spec:
2231
- class_name: "helm.clients.openai_client.OpenAITranscriptionThenCompletionClient"
2228
+ class_name: "helm.clients.openai_client.OpenAIClient"
2229
+
2230
+ - name: openai/gpt-4.1-mini-2025-04-14
2231
+ model_name: openai/gpt-4.1-mini-2025-04-14
2232
+ tokenizer_name: openai/o200k_base
2233
+ max_sequence_length: 1047576
2234
+ client_spec:
2235
+ class_name: "helm.clients.openai_client.OpenAIClient"
2236
+
2237
+ - name: openai/gpt-4.1-nano-2025-04-14
2238
+ model_name: openai/gpt-4.1-nano-2025-04-14
2239
+ tokenizer_name: openai/o200k_base
2240
+ max_sequence_length: 1047576
2241
+ client_spec:
2242
+ class_name: "helm.clients.openai_client.OpenAIClient"
2243
+
2244
+ - name: openai/whisper-1_gpt-4o-2024-11-20
2245
+ model_name: openai/whisper-1_gpt-4o-2024-11-20
2246
+ tokenizer_name: openai/o200k_base
2247
+ max_sequence_length: 128000
2248
+ client_spec:
2249
+ class_name: "helm.clients.openai_client.OpenAITranscriptionThenCompletionClient"
2250
+
2251
+ - name: openai/gpt-4o-transcribe_gpt-4o-2024-11-20
2252
+ model_name: openai/gpt-4o-transcribe_gpt-4o-2024-11-20
2253
+ tokenizer_name: openai/o200k_base
2254
+ max_sequence_length: 128000
2255
+ client_spec:
2256
+ class_name: "helm.clients.openai_client.OpenAITranscriptionThenCompletionClient"
2257
+
2258
+ - name: openai/gpt-4o-mini-transcribe_gpt-4o-2024-11-20
2259
+ model_name: openai/gpt-4o-mini-transcribe_gpt-4o-2024-11-20
2260
+ tokenizer_name: openai/o200k_base
2261
+ max_sequence_length: 128000
2262
+ client_spec:
2263
+ class_name: "helm.clients.openai_client.OpenAITranscriptionThenCompletionClient"
2232
2264
 
2233
2265
  - name: openai/gpt-4o-audio-preview-2024-10-01
2234
2266
  model_name: openai/gpt-4o-audio-preview-2024-10-01
@@ -2278,6 +2310,33 @@ model_deployments:
2278
2310
  class_name: "helm.clients.openai_client.OpenAIClient"
2279
2311
 
2280
2312
  ## o1 Models
2313
+ - name: openai/o1-pro-2025-03-19
2314
+ model_name: openai/o1-pro-2025-03-19
2315
+ tokenizer_name: openai/cl100k_base
2316
+ max_sequence_length: 128000
2317
+ client_spec:
2318
+ class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
2319
+
2320
+ - name: openai/o1-pro-2025-03-19-low-reasoning-effort
2321
+ model_name: openai/o1-pro-2025-03-19-low-reasoning-effort
2322
+ tokenizer_name: openai/cl100k_base
2323
+ max_sequence_length: 128000
2324
+ client_spec:
2325
+ class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
2326
+ args:
2327
+ openai_model_name: o1-pro-2025-03-19
2328
+ reasoning_effort: low
2329
+
2330
+ - name: openai/o1-pro-2025-03-19-high-reasoning-effort
2331
+ model_name: openai/o1-pro-2025-03-19-high-reasoning-effort
2332
+ tokenizer_name: openai/cl100k_base
2333
+ max_sequence_length: 128000
2334
+ client_spec:
2335
+ class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
2336
+ args:
2337
+ openai_model_name: o1-pro-2025-03-19
2338
+ reasoning_effort: high
2339
+
2281
2340
  - name: openai/o1-2024-12-17
2282
2341
  model_name: openai/o1-2024-12-17
2283
2342
  tokenizer_name: openai/cl100k_base
@@ -2346,6 +2405,73 @@ model_deployments:
2346
2405
  openai_model_name: o3-mini-2025-01-31
2347
2406
  reasoning_effort: high
2348
2407
 
2408
+ - name: openai/o3-2025-04-16
2409
+ model_name: openai/o3-2025-04-16
2410
+ tokenizer_name: openai/cl100k_base
2411
+ # Source: https://platform.openai.com/docs/models/o3
2412
+ max_sequence_length: 200000
2413
+ # TODO: max_output_tokens: 100000
2414
+ client_spec:
2415
+ class_name: "helm.clients.openai_client.OpenAIClient"
2416
+
2417
+ - name: openai/o3-2025-04-16-low-reasoning-effort
2418
+ model_name: openai/o3-2025-04-16-low-reasoning-effort
2419
+ tokenizer_name: openai/cl100k_base
2420
+ # Source: https://platform.openai.com/docs/models/o3
2421
+ max_sequence_length: 200000
2422
+ # TODO: max_output_tokens: 100000
2423
+ client_spec:
2424
+ class_name: "helm.clients.openai_client.OpenAIClient"
2425
+ args:
2426
+ openai_model_name: o3-2025-04-16
2427
+ reasoning_effort: low
2428
+
2429
+ - name: openai/o3-2025-04-16-high-reasoning-effort
2430
+ model_name: openai/o3-2025-04-16-high-reasoning-effort
2431
+ tokenizer_name: openai/cl100k_base
2432
+ # Source: https://platform.openai.com/docs/models/o3
2433
+ max_sequence_length: 200000
2434
+ # TODO: max_output_tokens: 100000
2435
+ client_spec:
2436
+ class_name: "helm.clients.openai_client.OpenAIClient"
2437
+ args:
2438
+ openai_model_name: o3-2025-04-16
2439
+ reasoning_effort: high
2440
+
2441
+ - name: openai/o4-mini-2025-04-16
2442
+ model_name: openai/o4-mini-2025-04-16
2443
+ tokenizer_name: openai/cl100k_base
2444
+ # Source: https://platform.openai.com/docs/models/o4-mini
2445
+ max_sequence_length: 200000
2446
+ # TODO: max_output_tokens: 100000
2447
+ client_spec:
2448
+ class_name: "helm.clients.openai_client.OpenAIClient"
2449
+
2450
+ - name: openai/o4-mini-2025-04-16-low-reasoning-effort
2451
+ model_name: openai/o4-mini-2025-04-16-low-reasoning-effort
2452
+ tokenizer_name: openai/cl100k_base
2453
+ # Source: https://platform.openai.com/docs/models/o4-mini
2454
+ max_sequence_length: 200000
2455
+ # TODO: max_output_tokens: 100000
2456
+ client_spec:
2457
+ class_name: "helm.clients.openai_client.OpenAIClient"
2458
+ args:
2459
+ openai_model_name: o4-mini-2025-04-16
2460
+ reasoning_effort: low
2461
+
2462
+
2463
+ - name: openai/o4-mini-2025-04-16-high-reasoning-effort
2464
+ model_name: openai/o4-mini-2025-04-16-high-reasoning-effort
2465
+ tokenizer_name: openai/cl100k_base
2466
+ # Source: https://platform.openai.com/docs/models/o4-mini
2467
+ max_sequence_length: 200000
2468
+ # TODO: max_output_tokens: 100000
2469
+ client_spec:
2470
+ class_name: "helm.clients.openai_client.OpenAIClient"
2471
+ args:
2472
+ openai_model_name: o4-mini-2025-04-16
2473
+ reasoning_effort: high
2474
+
2349
2475
  ## Text Similarity Models
2350
2476
  # OpenAI similarity embedding models: https://beta.openai.com/docs/guides/embeddings
2351
2477
  # The number of parameters is guessed based on the number of parameters of the
@@ -2610,6 +2736,24 @@ model_deployments:
2610
2736
  args:
2611
2737
  together_model: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo
2612
2738
 
2739
+ - name: together/llama-4-scout-17b-16e-instruct
2740
+ model_name: meta/llama-4-scout-17b-16e-instruct
2741
+ tokenizer_name: meta/llama-4-scout-17b-16e-instruct
2742
+ max_sequence_length: 327680
2743
+ client_spec:
2744
+ class_name: "helm.clients.together_client.TogetherChatClient"
2745
+ args:
2746
+ together_model: meta-llama/Llama-4-Scout-17B-16E-Instruct
2747
+
2748
+ - name: together/llama-4-maverick-17b-128e-instruct-fp8
2749
+ model_name: meta/llama-4-maverick-17b-128e-instruct-fp8
2750
+ tokenizer_name: meta/llama-4-scout-17b-16e-instruct
2751
+ max_sequence_length: 524288
2752
+ client_spec:
2753
+ class_name: "helm.clients.together_client.TogetherChatClient"
2754
+ args:
2755
+ together_model: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
2756
+
2613
2757
  - name: together/llama-3-8b-chat
2614
2758
  model_name: meta/llama-3-8b-chat
2615
2759
  tokenizer_name: meta/llama-3-8b-instruct
@@ -2784,6 +2928,42 @@ model_deployments:
2784
2928
  args:
2785
2929
  pretrained_model_name_or_path: allenai/OLMo-1.7-7B-hf
2786
2930
 
2931
+ - name: huggingface/olmo-2-1124-7b-instruct
2932
+ model_name: allenai/olmo-2-1124-7b-instruct
2933
+ tokenizer_name: allenai/olmo-2-1124-7b-instruct
2934
+ max_sequence_length: 4096
2935
+ client_spec:
2936
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
2937
+ args:
2938
+ device_map: auto
2939
+
2940
+ - name: huggingface/olmo-2-1124-13b-instruct
2941
+ model_name: allenai/olmo-2-1124-13b-instruct
2942
+ tokenizer_name: allenai/olmo-2-1124-7b-instruct
2943
+ max_sequence_length: 4096
2944
+ client_spec:
2945
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
2946
+ args:
2947
+ device_map: auto
2948
+
2949
+ - name: huggingface/olmo-2-0325-32b-instruct
2950
+ model_name: allenai/olmo-2-0325-32b-instruct
2951
+ tokenizer_name: allenai/olmo-2-0325-32b-instruct
2952
+ max_sequence_length: 4096
2953
+ client_spec:
2954
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
2955
+ args:
2956
+ device_map: auto
2957
+
2958
+ - name: huggingface/olmoe-1b-7b-0125-instruct
2959
+ model_name: allenai/olmoe-1b-7b-0125-instruct
2960
+ tokenizer_name: allenai/olmoe-1b-7b-0125-instruct
2961
+ max_sequence_length: 4096
2962
+ client_spec:
2963
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
2964
+ args:
2965
+ device_map: auto
2966
+
2787
2967
  ## MistralAI
2788
2968
  - name: together/mistral-7b-v0.1
2789
2969
  model_name: mistralai/mistral-7b-v0.1
@@ -3069,6 +3249,14 @@ model_deployments:
3069
3249
  client_spec:
3070
3250
  class_name: "helm.clients.palmyra_client.PalmyraChatClient"
3071
3251
 
3252
+ - name: writer/palmyra-x5
3253
+ model_name: writer/palmyra-x5
3254
+ # See tokenizer comment for writer/palmyra-x-004
3255
+ tokenizer_name: meta/llama-3-8b
3256
+ max_sequence_length: 1000000
3257
+ client_spec:
3258
+ class_name: "helm.clients.writer_client.WriterClient"
3259
+
3072
3260
  - name: writer/palmyra-med-32k
3073
3261
  model_name: writer/palmyra-med-32k
3074
3262
  # Palmyra-Med uses the "<|end_of_text|>" as the end of text token, which is used by meta/llama-3-8b,
@@ -3080,12 +3268,10 @@ model_deployments:
3080
3268
 
3081
3269
  - name: writer/palmyra-med
3082
3270
  model_name: writer/palmyra-med
3083
- # Palmyra-Med uses the "<|end_of_text|>" as the end of text token, which is used by meta/llama-3-8b,
3084
- # rather than "<|eot_id|>", which is used by meta/llama-3-8b-instruct
3085
3271
  tokenizer_name: meta/llama-3-8b
3086
- max_sequence_length: 4096
3272
+ max_sequence_length: 32000
3087
3273
  client_spec:
3088
- class_name: "helm.clients.palmyra_client.PalmyraChatClient"
3274
+ class_name: "helm.clients.writer_client.WriterClient"
3089
3275
 
3090
3276
  - name: writer/palmyra-fin-32k
3091
3277
  model_name: writer/palmyra-fin-32k
@@ -3104,16 +3290,23 @@ model_deployments:
3104
3290
 
3105
3291
  # xAI
3106
3292
 
3107
- - name: xai/grok-beta
3108
- model_name: xai/grok-beta
3109
- # No public information on tokenizer, so just pick an arbitrary one.
3110
- # It shouldn't matter since the context is long.
3111
- tokenizer_name: openai/o200k_base
3112
- max_sequence_length: 128000
3293
+ - name: xai/grok-3-beta
3294
+ model_name: xai/grok-3-beta
3295
+ tokenizer_name: xai/grok-3-beta
3296
+ max_sequence_length: 131072
3113
3297
  client_spec:
3114
- class_name: "helm.clients.openai_client.OpenAIClient"
3115
- args:
3116
- base_url: https://api.x.ai/v1
3298
+ class_name: "helm.clients.grok_client.GrokChatClient"
3299
+ window_service_spec:
3300
+ class_name: "helm.benchmark.window_services.no_decoding_window_service.NoDecodingWindowService"
3301
+
3302
+ - name: xai/grok-3-mini-beta
3303
+ model_name: xai/grok-3-mini-beta
3304
+ tokenizer_name: xai/grok-3-mini-beta
3305
+ max_sequence_length: 131072
3306
+ client_spec:
3307
+ class_name: "helm.clients.grok_client.GrokChatClient"
3308
+ window_service_spec:
3309
+ class_name: "helm.benchmark.window_services.no_decoding_window_service.NoDecodingWindowService"
3117
3310
 
3118
3311
  # Qwen
3119
3312
 
@@ -3217,6 +3410,15 @@ model_deployments:
3217
3410
  max_sequence_length: 128000
3218
3411
  client_spec:
3219
3412
  class_name: "helm.clients.together_client.TogetherChatClient"
3413
+
3414
+ - name: together/qwen3-235b-a22b-fp8-tput
3415
+ model_name: qwen/qwen3-235b-a22b-fp8-tput
3416
+ tokenizer_name: qwen/qwen3-235b-a22b
3417
+ max_sequence_length: 40960
3418
+ client_spec:
3419
+ class_name: "helm.clients.together_client.TogetherChatClient"
3420
+ args:
3421
+ parse_thinking: true
3220
3422
 
3221
3423
  - name: huggingface/qwen2.5-7b-instruct-4bit
3222
3424
  model_name: qwen/qwen2.5-7b-instruct
@@ -3240,6 +3442,60 @@ model_deployments:
3240
3442
  args:
3241
3443
  pretrained_model_name_or_path: Qwen/Qwen2.5-7B-Instruct
3242
3444
 
3445
+ - name: huggingface/smollm2-135m
3446
+ model_name: huggingface/smollm2-135m
3447
+ tokenizer_name: huggingface/smollm2-135m
3448
+ max_sequence_length: 8192
3449
+ client_spec:
3450
+ class_name: "helm.clients.huggingface_pipeline_client.HuggingFacePipelineClient"
3451
+ args:
3452
+ pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M
3453
+
3454
+ - name: huggingface/smollm2-360m
3455
+ model_name: huggingface/smollm2-360m
3456
+ tokenizer_name: huggingface/smollm2-135m
3457
+ max_sequence_length: 8192
3458
+ client_spec:
3459
+ class_name: "helm.clients.huggingface_pipeline_client.HuggingFacePipelineClient"
3460
+ args:
3461
+ pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-360M
3462
+
3463
+ - name: huggingface/smollm2-1.7b
3464
+ model_name: huggingface/smollm2-1.7b
3465
+ tokenizer_name: huggingface/smollm2-135m
3466
+ max_sequence_length: 8192
3467
+ client_spec:
3468
+ class_name: "helm.clients.huggingface_pipeline_client.HuggingFacePipelineClient"
3469
+ args:
3470
+ pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-1.7B
3471
+
3472
+ - name: huggingface/smollm2-135m-instruct
3473
+ model_name: huggingface/smollm2-135m-instruct
3474
+ tokenizer_name: huggingface/smollm2-135m-instruct
3475
+ max_sequence_length: 8192
3476
+ client_spec:
3477
+ class_name: "helm.clients.huggingface_pipeline_client.HuggingFacePipelineClient"
3478
+ args:
3479
+ pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct
3480
+
3481
+ - name: huggingface/smollm2-360m-instruct
3482
+ model_name: huggingface/smollm2-360m-instruct
3483
+ tokenizer_name: huggingface/smollm2-135m-instruct
3484
+ max_sequence_length: 8192
3485
+ client_spec:
3486
+ class_name: "helm.clients.huggingface_pipeline_client.HuggingFacePipelineClient"
3487
+ args:
3488
+ pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-360M-Instruct
3489
+
3490
+ - name: huggingface/smollm2-1.7b-instruct
3491
+ model_name: huggingface/smollm2-1.7b-instruct
3492
+ tokenizer_name: huggingface/smollm2-135m-instruct
3493
+ max_sequence_length: 8192
3494
+ client_spec:
3495
+ class_name: "helm.clients.huggingface_pipeline_client.HuggingFacePipelineClient"
3496
+ args:
3497
+ pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-1.7B-Instruct
3498
+
3243
3499
  - name: together/qwq-32b-preview
3244
3500
  model_name: qwen/qwq-32b-preview
3245
3501
  tokenizer_name: qwen/qwq-32b-preview
@@ -3275,6 +3531,34 @@ model_deployments:
3275
3531
  client_spec:
3276
3532
  class_name: "helm.clients.vision_language.qwen2_vlm_client.Qwen2VLMClient"
3277
3533
 
3534
+ - name: huggingface/qwen2.5-vl-3b-instruct
3535
+ model_name: qwen/qwen2.5-vl-3b-instruct
3536
+ tokenizer_name: qwen/qwen-vl-chat
3537
+ max_sequence_length: 8191
3538
+ client_spec:
3539
+ class_name: "helm.clients.vision_language.qwen2_vlm_client.Qwen2VLMClient"
3540
+
3541
+ - name: huggingface/qwen2.5-vl-7b-instruct
3542
+ model_name: qwen/qwen2.5-vl-7b-instruct
3543
+ tokenizer_name: qwen/qwen-vl-chat
3544
+ max_sequence_length: 8191
3545
+ client_spec:
3546
+ class_name: "helm.clients.vision_language.qwen2_vlm_client.Qwen2VLMClient"
3547
+
3548
+ - name: huggingface/qwen2.5-vl-32b-instruct
3549
+ model_name: qwen/qwen2.5-vl-32b-instruct
3550
+ tokenizer_name: qwen/qwen-vl-chat
3551
+ max_sequence_length: 8191
3552
+ client_spec:
3553
+ class_name: "helm.clients.vision_language.qwen2_vlm_client.Qwen2VLMClient"
3554
+
3555
+ - name: huggingface/qwen2.5-vl-72b-instruct
3556
+ model_name: qwen/qwen2.5-vl-72b-instruct
3557
+ tokenizer_name: qwen/qwen-vl-chat
3558
+ max_sequence_length: 8191
3559
+ client_spec:
3560
+ class_name: "helm.clients.vision_language.qwen2_vlm_client.Qwen2VLMClient"
3561
+
3278
3562
  - name: huggingface/qwen-audio-chat
3279
3563
  model_name: qwen/qwen-audio-chat
3280
3564
  tokenizer_name: qwen/qwen-audio-chat
@@ -3289,6 +3573,13 @@ model_deployments:
3289
3573
  client_spec:
3290
3574
  class_name: "helm.clients.audio_language.qwen2_audiolm_client.Qwen2AudioLMClient"
3291
3575
 
3576
+ - name: huggingface/qwen2.5-omni-7b
3577
+ model_name: qwen/qwen2.5-omni-7b
3578
+ tokenizer_name: qwen/qwen2.5-omni-7b
3579
+ max_sequence_length: 8191
3580
+ client_spec:
3581
+ class_name: "helm.clients.audio_language.qwen2_5_omni_client.Qwen2_5OmniAudioLMClient"
3582
+
3292
3583
  # Reka
3293
3584
  - name: reka/reka-core
3294
3585
  model_name: reka/reka-core
@@ -3567,8 +3858,6 @@ model_deployments:
3567
3858
  watsonx_model_name: ibm/granite-3-2b-instruct
3568
3859
  region: Dallas
3569
3860
 
3570
-
3571
- #
3572
3861
  - name: ibm/granite-3-8b-instruct
3573
3862
  model_name: ibm/granite-3.1-8b-instruct
3574
3863
  tokenizer_name: ibm-granite/granite-3.1-8b-instruct
@@ -3578,9 +3867,7 @@ model_deployments:
3578
3867
  args:
3579
3868
  watsonx_model_name: ibm/granite-3-8b-instruct
3580
3869
  region: Dallas
3581
- #
3582
3870
 
3583
- #
3584
3871
  - name: ibm/granite-13b-instruct-v2
3585
3872
  model_name: ibm/granite-13b-instruct-v2
3586
3873
  tokenizer_name: EleutherAI/gpt-neox-20b
@@ -3590,7 +3877,7 @@ model_deployments:
3590
3877
  args:
3591
3878
  watsonx_model_name: ibm/granite-13b-instruct-v2
3592
3879
  region: Dallas
3593
- #
3880
+
3594
3881
  - name: ibm/granite-20b-code-instruct-8k
3595
3882
  model_name: ibm/granite-20b-code-instruct-8k
3596
3883
  tokenizer_name: ibm-granite/granite-20b-code-instruct-8k
@@ -3600,7 +3887,7 @@ model_deployments:
3600
3887
  args:
3601
3888
  watsonx_model_name: ibm/granite-20b-code-instruct
3602
3889
  region: Dallas
3603
- #
3890
+
3604
3891
  - name: ibm/granite-34b-code-instruct
3605
3892
  model_name: ibm/granite-34b-code-instruct
3606
3893
  tokenizer_name: ibm-granite/granite-34b-code-instruct-8k
@@ -3610,7 +3897,7 @@ model_deployments:
3610
3897
  args:
3611
3898
  watsonx_model_name: ibm/granite-34b-code-instruct
3612
3899
  region: Dallas
3613
- #
3900
+
3614
3901
  - name: ibm/granite-3b-code-instruct
3615
3902
  model_name: ibm/granite-3b-code-instruct
3616
3903
  tokenizer_name: ibm-granite/granite-3b-code-instruct-128k
@@ -3620,7 +3907,7 @@ model_deployments:
3620
3907
  args:
3621
3908
  watsonx_model_name: ibm/granite-3b-code-instruct
3622
3909
  region: Dallas
3623
- #
3910
+
3624
3911
  - name: ibm/granite-8b-code-instruct
3625
3912
  model_name: ibm/granite-8b-code-instruct
3626
3913
  tokenizer_name: ibm-granite/granite-8b-code-instruct-128k
@@ -3639,4 +3926,388 @@ model_deployments:
3639
3926
  class_name: "helm.clients.ibm_client.IbmChatClient"
3640
3927
  args:
3641
3928
  watsonx_model_name: mistralai/mixtral-8x7b-instruct-v01
3642
- region: Dallas
3929
+ region: Dallas
3930
+
3931
+ - name: ibm/granite-3.3-8b-instruct
3932
+ model_name: ibm/granite-3.3-8b-instruct
3933
+ tokenizer_name: ibm/granite-3.3-8b-instruct
3934
+ max_sequence_length: 131072
3935
+ client_spec:
3936
+ class_name: "helm.clients.ibm_client.IbmTextClient"
3937
+ args:
3938
+ watsonx_model_name: ibm/granite-3-3-8b-instruct
3939
+ region: Dallas
3940
+
3941
+ # Vietnamese
3942
+ - name: ura-hcmut/ura-llama-2.1-8b
3943
+ model_name: ura-hcmut/ura-llama-2.1-8b
3944
+ tokenizer_name: meta/llama-3.1-8b-instruct
3945
+ max_sequence_length: 131072
3946
+ client_spec:
3947
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3948
+ args:
3949
+ pretrained_model_name_or_path: ura-hcmut/ura-llama-2.1-8b
3950
+
3951
+ - name: ura-hcmut/ura-llama-2-8b
3952
+ model_name: ura-hcmut/ura-llama-2-8b
3953
+ tokenizer_name: meta/llama-3-8b-instruct
3954
+ max_sequence_length: 8192
3955
+ client_spec:
3956
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3957
+ args:
3958
+ pretrained_model_name_or_path: ura-hcmut/ura-llama-2-8b
3959
+
3960
+ - name: ura-hcmut/ura-llama-7b
3961
+ model_name: ura-hcmut/ura-llama-7b
3962
+ tokenizer_name: meta-llama/Llama-2-7b-hf
3963
+ max_sequence_length: 4096
3964
+ client_spec:
3965
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3966
+ args:
3967
+ pretrained_model_name_or_path: ura-hcmut/ura-llama-7b
3968
+
3969
+ - name: ura-hcmut/ura-llama-13b
3970
+ model_name: ura-hcmut/ura-llama-13b
3971
+ tokenizer_name: meta-llama/Llama-2-7b-hf
3972
+ max_sequence_length: 4096
3973
+ client_spec:
3974
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3975
+ args:
3976
+ pretrained_model_name_or_path: ura-hcmut/ura-llama-13b
3977
+
3978
+ - name: ura-hcmut/ura-llama-70b
3979
+ model_name: ura-hcmut/ura-llama-70b
3980
+ tokenizer_name: meta-llama/Llama-2-7b-hf
3981
+ max_sequence_length: 4096
3982
+ client_spec:
3983
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3984
+ args:
3985
+ pretrained_model_name_or_path: ura-hcmut/ura-llama-70b
3986
+
3987
+ - name: ura-hcmut/GemSUra-7B
3988
+ model_name: ura-hcmut/GemSUra-7B
3989
+ tokenizer_name: google/gemma-2b
3990
+ max_sequence_length: 8192
3991
+ client_spec:
3992
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3993
+ args:
3994
+ pretrained_model_name_or_path: ura-hcmut/GemSUra-7B
3995
+
3996
+ - name: ura-hcmut/GemSUra-2B
3997
+ model_name: ura-hcmut/GemSUra-2B
3998
+ tokenizer_name: google/gemma-2b
3999
+ max_sequence_length: 8192
4000
+ client_spec:
4001
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4002
+ args:
4003
+ pretrained_model_name_or_path: ura-hcmut/GemSUra-2B
4004
+
4005
+ - name: ura-hcmut/MixSUra
4006
+ model_name: ura-hcmut/MixSUra
4007
+ tokenizer_name: mistralai/Mistral-7B-v0.1
4008
+ max_sequence_length: 32768
4009
+ client_spec:
4010
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4011
+ args:
4012
+ pretrained_model_name_or_path: ura-hcmut/MixSUra
4013
+
4014
+ - name: vilm/vinallama-7b-chat
4015
+ model_name: vilm/vinallama-7b-chat
4016
+ tokenizer_name: vilm/vinallama-7b-chat
4017
+ max_sequence_length: 4096
4018
+ client_spec:
4019
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4020
+ args:
4021
+ pretrained_model_name_or_path: vilm/vinallama-7b-chat
4022
+
4023
+ - name: vilm/vinallama-2.7b-chat
4024
+ model_name: vilm/vinallama-2.7b-chat
4025
+ tokenizer_name: vilm/vinallama-2.7b-chat
4026
+ max_sequence_length: 4096
4027
+ client_spec:
4028
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4029
+ args:
4030
+ pretrained_model_name_or_path: vilm/vinallama-2.7b-chat
4031
+
4032
+ - name: vilm/vietcuna-7b-v3
4033
+ model_name: vilm/vietcuna-7b-v3
4034
+ tokenizer_name: vilm/vietcuna-7b-v3
4035
+ max_sequence_length: 2048
4036
+ client_spec:
4037
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4038
+ args:
4039
+ pretrained_model_name_or_path: vilm/vietcuna-7b-v3
4040
+
4041
+ - name: vilm/vietcuna-3b-v2
4042
+ model_name: vilm/vietcuna-3b-v2
4043
+ tokenizer_name: vilm/vietcuna-7b-v3
4044
+ max_sequence_length: 2048
4045
+ client_spec:
4046
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4047
+ args:
4048
+ pretrained_model_name_or_path: vilm/vietcuna-3b-v2
4049
+
4050
+ - name: vilm/Quyen-v0.1
4051
+ model_name: vilm/Quyen-v0.1
4052
+ tokenizer_name: qwen/qwen2-72b-instruct
4053
+ max_sequence_length: 32768
4054
+ client_spec:
4055
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4056
+ args:
4057
+ pretrained_model_name_or_path: vilm/Quyen-v0.1
4058
+
4059
+ - name: vilm/Quyen-Plus-v0.1
4060
+ model_name: vilm/Quyen-Plus-v0.1
4061
+ tokenizer_name: qwen/qwen2-72b-instruct
4062
+ max_sequence_length: 32768
4063
+ client_spec:
4064
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4065
+ args:
4066
+ pretrained_model_name_or_path: vilm/Quyen-Plus-v0.1
4067
+
4068
+ - name: vilm/Quyen-Pro-v0.1
4069
+ model_name: vilm/Quyen-Pro-v0.1
4070
+ tokenizer_name: qwen/qwen2-72b-instruct
4071
+ max_sequence_length: 32768
4072
+ client_spec:
4073
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4074
+ args:
4075
+ pretrained_model_name_or_path: vilm/Quyen-Pro-v0.1
4076
+
4077
+ - name: vilm/Quyen-Pro-Max-v0.1
4078
+ model_name: vilm/Quyen-Pro-Max-v0.1
4079
+ tokenizer_name: qwen/qwen2-72b-instruct
4080
+ max_sequence_length: 32768
4081
+ client_spec:
4082
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4083
+ args:
4084
+ pretrained_model_name_or_path: vilm/Quyen-Pro-Max-v0.1
4085
+
4086
+ - name: vilm/Quyen-Mini-v0.1
4087
+ model_name: vilm/Quyen-Mini-v0.1
4088
+ tokenizer_name: qwen/qwen2-72b-instruct
4089
+ max_sequence_length: 32768
4090
+ client_spec:
4091
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4092
+ args:
4093
+ pretrained_model_name_or_path: vilm/Quyen-Mini-v0.1
4094
+
4095
+ - name: vilm/Quyen-SE-v0.1
4096
+ model_name: vilm/Quyen-SE-v0.1
4097
+ tokenizer_name: qwen/qwen2-72b-instruct
4098
+ max_sequence_length: 32768
4099
+ client_spec:
4100
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4101
+ args:
4102
+ pretrained_model_name_or_path: vilm/Quyen-SE-v0.1
4103
+
4104
+ - name: Viet-Mistral/Vistral-7B-Chat
4105
+ model_name: Viet-Mistral/Vistral-7B-Chat
4106
+ tokenizer_name: Viet-Mistral/Vistral-7B-Chat
4107
+ max_sequence_length: 32768
4108
+ client_spec:
4109
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4110
+ args:
4111
+ pretrained_model_name_or_path: Viet-Mistral/Vistral-7B-Chat
4112
+
4113
+ - name: vinai/PhoGPT-7B5-Instruct
4114
+ model_name: vinai/PhoGPT-7B5-Instruct
4115
+ tokenizer_name: vinai/PhoGPT-7B5-Instruct
4116
+ max_sequence_length: 2048
4117
+ client_spec:
4118
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4119
+ args:
4120
+ pretrained_model_name_or_path: vinai/PhoGPT-7B5-Instruct
4121
+
4122
+ - name: vinai/PhoGPT-4B-Chat
4123
+ model_name: vinai/PhoGPT-4B-Chat
4124
+ tokenizer_name: vinai/PhoGPT-4B-Chat
4125
+ max_sequence_length: 8192
4126
+ client_spec:
4127
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4128
+ args:
4129
+ pretrained_model_name_or_path: vinai/PhoGPT-4B-Chat
4130
+
4131
+ # Stanford Health Care
4132
+ # Placed later in the file to make them non-default
4133
+ - name: stanfordhealthcare/claude-3-5-sonnet-20241022
4134
+ model_name: anthropic/claude-3-5-sonnet-20241022
4135
+ tokenizer_name: anthropic/claude
4136
+ max_sequence_length: 200000
4137
+ client_spec:
4138
+ class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
4139
+ args:
4140
+ model: anthropic.claude-3-5-sonnet-20241022-v2:0
4141
+ deployment: Claude35Sonnetv2/awssig4fa
4142
+
4143
+ - name: stanfordhealthcare/claude-3-7-sonnet-20250219
4144
+ model_name: anthropic/claude-3-7-sonnet-20250219
4145
+ tokenizer_name: anthropic/claude
4146
+ max_sequence_length: 200000
4147
+ client_spec:
4148
+ class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
4149
+ args:
4150
+ model: arn:aws:bedrock:us-west-2:679683451337:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0
4151
+ deployment: awssig4claude37/aswsig4claude37
4152
+
4153
+ - name: stanfordhealthcare/gemini-1.5-pro-001
4154
+ model_name: google/gemini-1.5-pro-001
4155
+ tokenizer_name: google/gemma-2b
4156
+ max_sequence_length: 1000000
4157
+ client_spec:
4158
+ class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
4159
+ args:
4160
+ deployment: gcpgemini/apim-gcp-oauth-fa
4161
+
4162
+ - name: stanfordhealthcare/gemini-2.0-flash-001
4163
+ model_name: google/gemini-2.0-flash-001
4164
+ tokenizer_name: google/gemma-2b
4165
+ max_sequence_length: 1000000
4166
+ client_spec:
4167
+ class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
4168
+ args:
4169
+ deployment: gcp-gem20flash-fa/apim-gcp-gem20flash-fa
4170
+
4171
+ - name: stanfordhealthcare/gpt-4o-mini-2024-07-18
4172
+ model_name: openai/gpt-4o-mini-2024-07-18
4173
+ tokenizer_name: openai/o200k_base
4174
+ max_sequence_length: 128000
4175
+ client_spec:
4176
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
4177
+ args:
4178
+ openai_model_name: gpt-4o-mini
4179
+ api_version: 2023-05-15
4180
+
4181
+ - name: stanfordhealthcare/gpt-4o-2024-05-13
4182
+ model_name: openai/gpt-4o-2024-05-13
4183
+ tokenizer_name: openai/o200k_base
4184
+ max_sequence_length: 128000
4185
+ client_spec:
4186
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
4187
+ args:
4188
+ openai_model_name: gpt-4o
4189
+ api_version: 2023-05-15
4190
+
4191
+ - name: stanfordhealthcare/gpt-4-0613
4192
+ model_name: openai/gpt-4-0613
4193
+ tokenizer_name: openai/o200k_base
4194
+ max_sequence_length: 8192
4195
+ client_spec:
4196
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
4197
+ args:
4198
+ openai_model_name: gpt-4
4199
+ api_version: 2023-05-15
4200
+
4201
+ - name: stanfordhealthcare/gpt-4-turbo-2024-04-09
4202
+ model_name: openai/gpt-4-turbo-2024-04-09
4203
+ tokenizer_name: openai/cl100k_base
4204
+ max_sequence_length: 128000
4205
+ client_spec:
4206
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
4207
+ args:
4208
+ openai_model_name: gpt-4-turbo
4209
+ api_version: 2023-05-15
4210
+
4211
+ - name: stanfordhealthcare/gpt-4.1-2025-04-14
4212
+ model_name: openai/gpt-4.1-2025-04-14
4213
+ tokenizer_name: openai/o200k_base
4214
+ max_sequence_length: 1047576
4215
+ client_spec:
4216
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
4217
+ args:
4218
+ openai_model_name: gpt-4.1
4219
+ api_version: 2025-01-01-preview
4220
+ base_url: "{endpoint}/openai-eastus2"
4221
+
4222
+ - name: stanfordhealthcare/o3-mini-2025-01-31
4223
+ model_name: openai/o3-mini-2025-01-31
4224
+ tokenizer_name: openai/cl100k_base
4225
+ max_sequence_length: 200000
4226
+ client_spec:
4227
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
4228
+ args:
4229
+ openai_model_name: o3-mini
4230
+ api_version: 2024-12-01-preview
4231
+ base_url: "{endpoint}/openai-eastus2"
4232
+
4233
+ - name: stanfordhealthcare/o1-2024-12-17
4234
+ model_name: openai/o1-2024-12-17
4235
+ tokenizer_name: openai/cl100k_base
4236
+ max_sequence_length: 128000
4237
+ client_spec:
4238
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
4239
+ args:
4240
+ openai_model_name: o1
4241
+ api_version: 2024-12-01-preview
4242
+ base_url: "{endpoint}/openai-eastus2"
4243
+
4244
+ - name: stanfordhealthcare/deepseek-r1
4245
+ model_name: deepseek-ai/deepseek-r1
4246
+ tokenizer_name: deepseek-ai/deepseek-r1
4247
+ max_sequence_length: 128000
4248
+ client_spec:
4249
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
4250
+ args:
4251
+ openai_model_name: deepseek-chat
4252
+ output_processor: helm.benchmark.metrics.output_processors.remove_deepseek_r1_thinking
4253
+ base_url: "{endpoint}/deepseekr1/v1"
4254
+
4255
+ - name: stanfordhealthcare/llama-3.3-70b-instruct
4256
+ model_name: meta/llama-3.3-70b-instruct
4257
+ tokenizer_name: meta/llama-3.3-70b-instruct
4258
+ max_sequence_length: 128000
4259
+ client_spec:
4260
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
4261
+ args:
4262
+ base_url: "{endpoint}/llama3370b/v1"
4263
+
4264
+ - name: stanfordhealthcare/llama-4-scout-17b-16e-instruct
4265
+ model_name: meta/llama-4-scout-17b-16e-instruct
4266
+ tokenizer_name: meta/llama-4-scout-17b-16e-instruct
4267
+ max_sequence_length: 327680
4268
+ client_spec:
4269
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
4270
+ args:
4271
+ base_url: "{endpoint}/llama4-scout/v1"
4272
+
4273
+ - name: stanfordhealthcare/llama-4-maverick-17b-128e-instruct-fp8
4274
+ model_name: meta/llama-4-maverick-17b-128e-instruct-fp8
4275
+ tokenizer_name: meta/llama-4-scout-17b-16e-instruct
4276
+ max_sequence_length: 524288
4277
+ client_spec:
4278
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
4279
+ args:
4280
+ base_url: "{endpoint}/llama4-maverick/v1"
4281
+
4282
+ - name: stanfordhealthcare/phi-3.5-mini-instruct
4283
+ model_name: microsoft/phi-3.5-mini-instruct
4284
+ tokenizer_name: microsoft/phi-3.5-mini-instruct
4285
+ max_sequence_length: 131072
4286
+ client_spec:
4287
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
4288
+ args:
4289
+ base_url: "{endpoint}/phi35mi/v1"
4290
+
4291
+ - name: stanfordhealthcare_shc/gpt-4o-2024-05-13
4292
+ model_name: openai/gpt-4o-2024-05-13
4293
+ tokenizer_name: openai/o200k_base
4294
+ max_sequence_length: 128000
4295
+ client_spec:
4296
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
4297
+ deployment: gpt-4o
4298
+
4299
+ - name: stanfordhealthcare_shc/gpt-4o-mini-2024-07-18
4300
+ model_name: openai/gpt-4o-mini-2024-07-18
4301
+ tokenizer_name: openai/o200k_base
4302
+ max_sequence_length: 128000
4303
+ client_spec:
4304
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
4305
+ deployment: gpt-4o-mini
4306
+
4307
+ - name: stanfordhealthcare_shc/gpt-4-turbo-2024-04-09
4308
+ model_name: openai/gpt-4-turbo-2024-04-09
4309
+ tokenizer_name: openai/cl100k_base
4310
+ max_sequence_length: 128000
4311
+ client_spec:
4312
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
4313
+ deployment: gpt-4-turbo-2024-04-09