crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff shows the changes between publicly available package versions as released to their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of crfm-helm might be problematic.

Files changed (103)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
  3. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  4. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  5. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  6. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  7. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  8. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  9. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  10. helm/benchmark/metrics/comet_metric.py +1 -1
  11. helm/benchmark/metrics/copyright_metrics.py +1 -1
  12. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  13. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  14. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  15. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  16. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  17. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  18. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  19. helm/benchmark/metrics/summac/model_summac.py +1 -1
  20. helm/benchmark/model_deployment_registry.py +11 -19
  21. helm/benchmark/presentation/create_plots.py +11 -2
  22. helm/benchmark/presentation/schema.py +5 -0
  23. helm/benchmark/presentation/summarize.py +9 -3
  24. helm/benchmark/presentation/test_create_plots.py +4 -1
  25. helm/benchmark/run.py +7 -1
  26. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  27. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  28. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  29. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  30. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  31. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  32. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  33. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  34. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  35. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  36. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  37. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  38. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  39. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  40. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  41. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  42. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  43. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  44. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  45. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  46. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  47. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  48. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  49. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  50. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  51. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  52. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  53. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  54. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  55. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  56. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  57. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  58. helm/benchmark/scenarios/math_scenario.py +21 -20
  59. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  60. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  61. helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
  62. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  63. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  64. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  65. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  66. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  67. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  68. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  69. helm/benchmark/slurm_jobs.py +1 -2
  70. helm/benchmark/slurm_runner.py +8 -1
  71. helm/benchmark/static/schema_arabic.yaml +228 -0
  72. helm/benchmark/static/schema_classic.yaml +0 -17
  73. helm/benchmark/static/schema_long_context.yaml +19 -1
  74. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  75. helm/benchmark/static_build/index.html +1 -1
  76. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  77. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  78. helm/clients/huggingface_client.py +2 -2
  79. helm/clients/openai_client.py +2 -1
  80. helm/clients/openai_responses_client.py +6 -4
  81. helm/clients/test_huggingface_client.py +3 -3
  82. helm/clients/together_client.py +0 -2
  83. helm/clients/vertexai_client.py +11 -9
  84. helm/clients/vllm_client.py +43 -7
  85. helm/clients/vllm_granite_thinking_client.py +56 -0
  86. helm/common/critique_request.py +0 -1
  87. helm/common/hierarchical_logger.py +83 -34
  88. helm/common/object_spec.py +23 -8
  89. helm/common/test_logging.py +94 -0
  90. helm/config/model_deployments.yaml +454 -175
  91. helm/config/model_metadata.yaml +117 -10
  92. helm/config/tokenizer_configs.yaml +81 -1
  93. helm/proxy/cli.py +1 -1
  94. helm/proxy/retry.py +5 -0
  95. helm/tokenizers/grok_tokenizer.py +2 -0
  96. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  97. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  98. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  99. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  100. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
  101. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  102. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  103. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
@@ -16,6 +16,373 @@ model_deployments:
  client_spec:
  class_name: "helm.clients.simple_client.SimpleClient"

+ # Stanford Health Care
+ # For internal use only for MedHELM
+ # Placed earlier in the file to make them non-default
+ - name: stanfordhealthcare/claude-3-5-sonnet-20241022
+ model_name: anthropic/claude-3-5-sonnet-20241022
+ tokenizer_name: anthropic/claude
+ max_sequence_length: 200000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
+ args:
+ model: anthropic.claude-3-5-sonnet-20241022-v2:0
+ deployment: Claude35Sonnetv2/awssig4fa
+
+ - name: stanfordhealthcare/claude-3-7-sonnet-20250219
+ model_name: anthropic/claude-3-7-sonnet-20250219
+ tokenizer_name: anthropic/claude
+ max_sequence_length: 200000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
+ args:
+ model: arn:aws:bedrock:us-west-2:679683451337:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0
+ deployment: awssig4claude37/aswsig4claude37
+
+ - name: stanfordhealthcare/gemini-1.5-pro-001
+ model_name: google/gemini-1.5-pro-001
+ tokenizer_name: google/gemma-2b
+ max_sequence_length: 1000000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
+ args:
+ deployment: gcpgemini/apim-gcp-oauth-fa
+
+ - name: stanfordhealthcare/gemini-2.0-flash-001
+ model_name: google/gemini-2.0-flash-001
+ tokenizer_name: google/gemma-2b
+ max_sequence_length: 1000000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
+ args:
+ deployment: gcp-gem20flash-fa/apim-gcp-gem20flash-fa
+
+ - name: stanfordhealthcare/gpt-4o-mini-2024-07-18
+ model_name: openai/gpt-4o-mini-2024-07-18
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4o-mini
+ api_version: 2023-05-15
+
+ - name: stanfordhealthcare/gpt-4o-2024-05-13
+ model_name: openai/gpt-4o-2024-05-13
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4o
+ api_version: 2023-05-15
+
+ - name: stanfordhealthcare/gpt-4-0613
+ model_name: openai/gpt-4-0613
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 8192
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4
+ api_version: 2023-05-15
+
+ - name: stanfordhealthcare/gpt-4-turbo-2024-04-09
+ model_name: openai/gpt-4-turbo-2024-04-09
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4-turbo
+ api_version: 2023-05-15
+
+ - name: stanfordhealthcare/gpt-4.1-2025-04-14
+ model_name: openai/gpt-4.1-2025-04-14
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 1047576
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4.1
+ api_version: 2025-01-01-preview
+ base_url: "{endpoint}/openai-eastus2"
+
+ - name: stanfordhealthcare/o3-mini-2025-01-31
+ model_name: openai/o3-mini-2025-01-31
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 200000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: o3-mini
+ api_version: 2024-12-01-preview
+ base_url: "{endpoint}/openai-eastus2"
+
+ - name: stanfordhealthcare/o1-2024-12-17
+ model_name: openai/o1-2024-12-17
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: o1
+ api_version: 2024-12-01-preview
+ base_url: "{endpoint}/openai-eastus2"
+
+ - name: stanfordhealthcare/deepseek-r1
+ model_name: deepseek-ai/deepseek-r1
+ tokenizer_name: deepseek-ai/deepseek-r1
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ openai_model_name: deepseek-chat
+ output_processor: helm.benchmark.metrics.output_processors.remove_deepseek_r1_thinking
+ base_url: "{endpoint}/deepseekr1/v1"
+
+ - name: stanfordhealthcare/llama-3.3-70b-instruct
+ model_name: meta/llama-3.3-70b-instruct
+ tokenizer_name: meta/llama-3.3-70b-instruct
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ base_url: "{endpoint}/llama3370b/v1"
+
+ - name: stanfordhealthcare/llama-4-scout-17b-16e-instruct
+ model_name: meta/llama-4-scout-17b-16e-instruct
+ tokenizer_name: meta/llama-4-scout-17b-16e-instruct
+ max_sequence_length: 327680
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ base_url: "{endpoint}/llama4-scout/v1"
+
+ - name: stanfordhealthcare/llama-4-maverick-17b-128e-instruct-fp8
+ model_name: meta/llama-4-maverick-17b-128e-instruct-fp8
+ tokenizer_name: meta/llama-4-scout-17b-16e-instruct
+ max_sequence_length: 524288
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ base_url: "{endpoint}/llama4-maverick/v1"
+
+ - name: stanfordhealthcare/phi-3.5-mini-instruct
+ model_name: microsoft/phi-3.5-mini-instruct
+ tokenizer_name: microsoft/phi-3.5-mini-instruct
+ max_sequence_length: 131072
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ base_url: "{endpoint}/phi35mi/v1"
+
+ - name: stanfordhealthcare_shc/gpt-4o-2024-05-13
+ model_name: openai/gpt-4o-2024-05-13
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
+ deployment: gpt-4o
+
+ - name: stanfordhealthcare_shc/gpt-4o-mini-2024-07-18
+ model_name: openai/gpt-4o-mini-2024-07-18
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
+ deployment: gpt-4o-mini
+
+ - name: stanfordhealthcare_shc/gpt-4-turbo-2024-04-09
+ model_name: openai/gpt-4-turbo-2024-04-09
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
+ deployment: gpt-4-turbo-2024-04-09
+
+ - name: stanfordhealthcare/claude-3-5-sonnet-20241022
+ model_name: anthropic/claude-3-5-sonnet-20241022
+ tokenizer_name: anthropic/claude
+ max_sequence_length: 200000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
+ args:
+ model: anthropic.claude-3-5-sonnet-20241022-v2:0
+ deployment: Claude35Sonnetv2/awssig4fa
+
+ - name: stanfordhealthcare/claude-3-7-sonnet-20250219
+ model_name: anthropic/claude-3-7-sonnet-20250219
+ tokenizer_name: anthropic/claude
+ max_sequence_length: 200000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
+ args:
+ model: arn:aws:bedrock:us-west-2:679683451337:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0
+ deployment: awssig4claude37/aswsig4claude37
+
+ - name: stanfordhealthcare/gemini-1.5-pro-001
+ model_name: google/gemini-1.5-pro-001
+ tokenizer_name: google/gemma-2b
+ max_sequence_length: 1000000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
+ args:
+ deployment: gcpgemini/apim-gcp-oauth-fa
+
+ - name: stanfordhealthcare/gemini-2.0-flash-001
+ model_name: google/gemini-2.0-flash-001
+ tokenizer_name: google/gemma-2b
+ max_sequence_length: 1000000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
+ args:
+ deployment: gcp-gem20flash-fa/apim-gcp-gem20flash-fa
+
+ - name: stanfordhealthcare/gpt-4o-mini-2024-07-18
+ model_name: openai/gpt-4o-mini-2024-07-18
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4o-mini
+ api_version: 2023-05-15
+
+ - name: stanfordhealthcare/gpt-4o-2024-05-13
+ model_name: openai/gpt-4o-2024-05-13
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4o
+ api_version: 2023-05-15
+
+ - name: stanfordhealthcare/gpt-4-0613
+ model_name: openai/gpt-4-0613
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 8192
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4
+ api_version: 2023-05-15
+
+ - name: stanfordhealthcare/gpt-4-turbo-2024-04-09
+ model_name: openai/gpt-4-turbo-2024-04-09
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4-turbo
+ api_version: 2023-05-15
+
+ - name: stanfordhealthcare/gpt-4.1-2025-04-14
+ model_name: openai/gpt-4.1-2025-04-14
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 1047576
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4.1
+ api_version: 2025-01-01-preview
+ base_url: "{endpoint}/openai-eastus2"
+
+ - name: stanfordhealthcare/o3-mini-2025-01-31
+ model_name: openai/o3-mini-2025-01-31
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 200000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: o3-mini
+ api_version: 2024-12-01-preview
+ base_url: "{endpoint}/openai-eastus2"
+
+ - name: stanfordhealthcare/o1-2024-12-17
+ model_name: openai/o1-2024-12-17
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: o1
+ api_version: 2024-12-01-preview
+ base_url: "{endpoint}/openai-eastus2"
+
+ - name: stanfordhealthcare/deepseek-r1
+ model_name: deepseek-ai/deepseek-r1
+ tokenizer_name: deepseek-ai/deepseek-r1
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ openai_model_name: deepseek-chat
+ output_processor: helm.benchmark.metrics.output_processors.remove_deepseek_r1_thinking
+ base_url: "{endpoint}/deepseekr1/v1"
+
+ - name: stanfordhealthcare/llama-3.3-70b-instruct
+ model_name: meta/llama-3.3-70b-instruct
+ tokenizer_name: meta/llama-3.3-70b-instruct
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ base_url: "{endpoint}/llama3370b/v1"
+
+ - name: stanfordhealthcare/llama-4-scout-17b-16e-instruct
+ model_name: meta/llama-4-scout-17b-16e-instruct
+ tokenizer_name: meta/llama-4-scout-17b-16e-instruct
+ max_sequence_length: 327680
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ base_url: "{endpoint}/llama4-scout/v1"
+
+ - name: stanfordhealthcare/llama-4-maverick-17b-128e-instruct-fp8
+ model_name: meta/llama-4-maverick-17b-128e-instruct-fp8
+ tokenizer_name: meta/llama-4-scout-17b-16e-instruct
+ max_sequence_length: 524288
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ base_url: "{endpoint}/llama4-maverick/v1"
+
+ - name: stanfordhealthcare/phi-3.5-mini-instruct
+ model_name: microsoft/phi-3.5-mini-instruct
+ tokenizer_name: microsoft/phi-3.5-mini-instruct
+ max_sequence_length: 131072
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ base_url: "{endpoint}/phi35mi/v1"
+
+ - name: stanfordhealthcare_shc/gpt-4o-2024-05-13
+ model_name: openai/gpt-4o-2024-05-13
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
+ deployment: gpt-4o
+
+ - name: stanfordhealthcare_shc/gpt-4o-mini-2024-07-18
+ model_name: openai/gpt-4o-mini-2024-07-18
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
+ deployment: gpt-4o-mini
+
+ - name: stanfordhealthcare_shc/gpt-4-turbo-2024-04-09
+ model_name: openai/gpt-4-turbo-2024-04-09
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
+ deployment: gpt-4-turbo-2024-04-09
+
  # Adobe
  - name: adobe/giga-gan
  model_name: adobe/giga-gan
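
The hunks in this file are all from helm/config/model_deployments.yaml (entry 90 in the list above). Every deployment added here follows the same client_spec convention: a fully qualified class_name plus optional args, presumably resolved through helm/common/object_spec.py (entry 88), which this release also modifies. The snippet below is a minimal sketch of that general pattern, assuming only the class_name/args shape visible in the diff; it is not crfm-helm's actual implementation.

```python
# Hypothetical sketch of the class_name/args pattern used by client_spec above.
# It imports a class from its dotted path and constructs it with the given args.
# Simplified illustration only; not crfm-helm's helm.common.object_spec code.
import importlib
from typing import Any, Dict


def create_from_spec(spec: Dict[str, Any], **extra: Any) -> Any:
    """Instantiate the class named by spec["class_name"] with spec["args"]."""
    module_path, _, class_name = spec["class_name"].rpartition(".")
    cls = getattr(importlib.import_module(module_path), class_name)
    kwargs = dict(spec.get("args") or {})
    kwargs.update(extra)  # e.g. credentials injected at runtime
    return cls(**kwargs)


# Stand-in class_name so the sketch runs without crfm-helm installed;
# a real spec would name a client class such as StanfordHealthCareClaudeClient.
print(type(create_from_spec({"class_name": "collections.OrderedDict", "args": {}})))
```

In the real config the class_name values point at client classes like helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient; the stand-in class above only keeps the example self-contained and runnable.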
@@ -1438,6 +1805,14 @@ model_deployments:
  client_spec:
  class_name: "helm.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient"

+ ## Moonshot AI
+ - name: together/kimi-k2-instruct
+ model_name: moonshotai/kimi-k2-instruct
+ tokenizer_name: moonshotai/kimi-k2-instruct
+ max_sequence_length: 131072
+ client_spec:
+ class_name: "helm.clients.together_client.TogetherChatClient"
+
  ## MosaicML
  - name: huggingface/mpt-7b
  model_name: mosaicml/mpt-7b
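
The added entries are plain YAML, so their shape can be checked with PyYAML. The sketch below parses a single deployment of the form added in this hunk (the together/kimi-k2-instruct entry, with nesting restored as an assumption); the loader is illustrative and not crfm-helm code.

```python
# Illustrative only: parse a model_deployments.yaml-style document with PyYAML
# and print one line per deployment. Field names are taken from the diff; the
# loader itself is a hypothetical sketch, not crfm-helm code.
import yaml  # requires PyYAML

EXAMPLE = """
model_deployments:
  - name: together/kimi-k2-instruct
    model_name: moonshotai/kimi-k2-instruct
    tokenizer_name: moonshotai/kimi-k2-instruct
    max_sequence_length: 131072
    client_spec:
      class_name: "helm.clients.together_client.TogetherChatClient"
"""


def list_deployments(text: str) -> None:
    config = yaml.safe_load(text)
    for entry in config.get("model_deployments", []):
        print(
            f'{entry["name"]}: model={entry["model_name"]}, '
            f'max_seq_len={entry["max_sequence_length"]}, '
            f'client={entry["client_spec"]["class_name"]}'
        )


if __name__ == "__main__":
    list_deployments(EXAMPLE)
```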
@@ -2472,6 +2847,19 @@ model_deployments:
  openai_model_name: o4-mini-2025-04-16
  reasoning_effort: high

+
+ - name: openai/o3-pro-2025-06-10-high-reasoning-effort
+ model_name: openai/o3-pro-2025-06-10-high-reasoning-effort
+ tokenizer_name: openai/cl100k_base
+ # Source: https://platform.openai.com/docs/models/o3-pro
+ max_sequence_length: 200000
+ # TODO: max_output_tokens: 100000
+ client_spec:
+ class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
+ args:
+ openai_model_name: o3-pro-2025-06-10
+ reasoning_effort: high
+
  ## Text Similarity Models
  # OpenAI similarity embedding models: https://beta.openai.com/docs/guides/embeddings
  # The number of parameters is guessed based on the number of parameters of the
@@ -3308,6 +3696,15 @@ model_deployments:
  window_service_spec:
  class_name: "helm.benchmark.window_services.no_decoding_window_service.NoDecodingWindowService"

+ - name: xai/grok-4-0709
+ model_name: xai/grok-4-0709
+ tokenizer_name: xai/grok-4-0709
+ max_sequence_length: 256000
+ client_spec:
+ class_name: "helm.clients.grok_client.GrokChatClient"
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.no_decoding_window_service.NoDecodingWindowService"
+
  # Qwen

  - name: together/qwen-7b
@@ -3728,6 +4125,7 @@ model_deployments:
  args:
  pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base

+ # Maritaca AI
  - name: huggingface/sabia-7b
  model_name: maritaca-ai/sabia-7b
  tokenizer_name: maritaca-ai/sabia-7b
@@ -3737,6 +4135,27 @@ model_deployments:
  args:
  pretrained_model_name_or_path: maritaca-ai/sabia-7b

+ - name: maritaca-ai/sabiazinho-3
+ model_name: maritaca-ai/sabiazinho-3
+ tokenizer_name: maritaca-ai/sabia-2-tokenizer-medium
+ max_sequence_length: 32000
+ client_spec:
+ class_name: "helm.clients.openai_client.OpenAIClient"
+
+ - name: maritaca-ai/sabia-3
+ model_name: maritaca-ai/sabia-3
+ tokenizer_name: maritaca-ai/sabia-2-tokenizer-medium
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.openai_client.OpenAIClient"
+
+ - name: maritaca-ai/sabia-3.1-2025-05-08
+ model_name: maritaca-ai/sabia-3.1-2025-05-08
+ tokenizer_name: maritaca-ai/sabia-2-tokenizer-medium
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.openai_client.OpenAIClient"
+
  # Granite-3.1-8b-base
  - name: huggingface/granite-3.1-8b-base
  model_name: ibm-granite/granite-3.1-8b-base
@@ -3918,16 +4337,6 @@ model_deployments:
  watsonx_model_name: ibm/granite-8b-code-instruct
  region: Dallas

- - name: ibm/mixtral-8x7b-instruct-v0:1
- model_name: mistralai/mixtral-8x7b-instruct-v0:1
- tokenizer_name: huggingface/gpt2
- max_sequence_length: 4000
- client_spec:
- class_name: "helm.clients.ibm_client.IbmChatClient"
- args:
- watsonx_model_name: mistralai/mixtral-8x7b-instruct-v01
- region: Dallas
-
  - name: ibm/granite-3.3-8b-instruct
  model_name: ibm/granite-3.3-8b-instruct
  tokenizer_name: ibm/granite-3.3-8b-instruct
@@ -4128,186 +4537,56 @@ model_deployments:
  args:
  pretrained_model_name_or_path: vinai/PhoGPT-4B-Chat

- # Stanford Health Care
- # Placed later in the file to make them non-default
- - name: stanfordhealthcare/claude-3-5-sonnet-20241022
- model_name: anthropic/claude-3-5-sonnet-20241022
- tokenizer_name: anthropic/claude
- max_sequence_length: 200000
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
- args:
- model: anthropic.claude-3-5-sonnet-20241022-v2:0
- deployment: Claude35Sonnetv2/awssig4fa
-
- - name: stanfordhealthcare/claude-3-7-sonnet-20250219
- model_name: anthropic/claude-3-7-sonnet-20250219
- tokenizer_name: anthropic/claude
- max_sequence_length: 200000
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
- args:
- model: arn:aws:bedrock:us-west-2:679683451337:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0
- deployment: awssig4claude37/aswsig4claude37
-
- - name: stanfordhealthcare/gemini-1.5-pro-001
- model_name: google/gemini-1.5-pro-001
- tokenizer_name: google/gemma-2b
- max_sequence_length: 1000000
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
- args:
- deployment: gcpgemini/apim-gcp-oauth-fa
-
- - name: stanfordhealthcare/gemini-2.0-flash-001
- model_name: google/gemini-2.0-flash-001
- tokenizer_name: google/gemma-2b
- max_sequence_length: 1000000
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
- args:
- deployment: gcp-gem20flash-fa/apim-gcp-gem20flash-fa
-
- - name: stanfordhealthcare/gpt-4o-mini-2024-07-18
- model_name: openai/gpt-4o-mini-2024-07-18
- tokenizer_name: openai/o200k_base
- max_sequence_length: 128000
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
- args:
- openai_model_name: gpt-4o-mini
- api_version: 2023-05-15
-
- - name: stanfordhealthcare/gpt-4o-2024-05-13
- model_name: openai/gpt-4o-2024-05-13
- tokenizer_name: openai/o200k_base
- max_sequence_length: 128000
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
- args:
- openai_model_name: gpt-4o
- api_version: 2023-05-15
-
- - name: stanfordhealthcare/gpt-4-0613
- model_name: openai/gpt-4-0613
- tokenizer_name: openai/o200k_base
- max_sequence_length: 8192
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
- args:
- openai_model_name: gpt-4
- api_version: 2023-05-15
-
- - name: stanfordhealthcare/gpt-4-turbo-2024-04-09
- model_name: openai/gpt-4-turbo-2024-04-09
- tokenizer_name: openai/cl100k_base
+ - name: huggingface/Gemma-3-Gaia-PT-BR-4b-it
+ model_name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
+ tokenizer_name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
  max_sequence_length: 128000
  client_spec:
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
- args:
- openai_model_name: gpt-4-turbo
- api_version: 2023-05-15
-
- - name: stanfordhealthcare/gpt-4.1-2025-04-14
- model_name: openai/gpt-4.1-2025-04-14
- tokenizer_name: openai/o200k_base
- max_sequence_length: 1047576
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
- args:
- openai_model_name: gpt-4.1
- api_version: 2025-01-01-preview
- base_url: "{endpoint}/openai-eastus2"
-
- - name: stanfordhealthcare/o3-mini-2025-01-31
- model_name: openai/o3-mini-2025-01-31
- tokenizer_name: openai/cl100k_base
- max_sequence_length: 200000
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
  args:
- openai_model_name: o3-mini
- api_version: 2024-12-01-preview
- base_url: "{endpoint}/openai-eastus2"
+ pretrained_model_name_or_path: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it

- - name: stanfordhealthcare/o1-2024-12-17
- model_name: openai/o1-2024-12-17
- tokenizer_name: openai/cl100k_base
- max_sequence_length: 128000
+ - name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
+ model_name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
+ tokenizer_name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
+ max_sequence_length: 4094
  client_spec:
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
  args:
- openai_model_name: o1
- api_version: 2024-12-01-preview
- base_url: "{endpoint}/openai-eastus2"
+ pretrained_model_name_or_path: recogna-nlp/bode-13b-alpaca-pt-br-no-peft

- - name: stanfordhealthcare/deepseek-r1
- model_name: deepseek-ai/deepseek-r1
- tokenizer_name: deepseek-ai/deepseek-r1
- max_sequence_length: 128000
+ - name: 22h/cabrita_7b_pt_850000
+ model_name: 22h/cabrita_7b_pt_850000
+ tokenizer_name: 22h/cabrita_7b_pt_850000
+ max_sequence_length: 4094
  client_spec:
- class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
  args:
- openai_model_name: deepseek-chat
- output_processor: helm.benchmark.metrics.output_processors.remove_deepseek_r1_thinking
- base_url: "{endpoint}/deepseekr1/v1"
+ pretrained_model_name_or_path: 22h/cabrita_7b_pt_850000

- - name: stanfordhealthcare/llama-3.3-70b-instruct
- model_name: meta/llama-3.3-70b-instruct
- tokenizer_name: meta/llama-3.3-70b-instruct
- max_sequence_length: 128000
+ - name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
+ model_name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
+ tokenizer_name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
+ max_sequence_length: 4096
  client_spec:
- class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
  args:
- base_url: "{endpoint}/llama3370b/v1"
+ pretrained_model_name_or_path: PORTULAN/gervasio-7b-portuguese-ptbr-decoder

- - name: stanfordhealthcare/llama-4-scout-17b-16e-instruct
- model_name: meta/llama-4-scout-17b-16e-instruct
- tokenizer_name: meta/llama-4-scout-17b-16e-instruct
- max_sequence_length: 327680
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
- args:
- base_url: "{endpoint}/llama4-scout/v1"
-
- - name: stanfordhealthcare/llama-4-maverick-17b-128e-instruct-fp8
- model_name: meta/llama-4-maverick-17b-128e-instruct-fp8
- tokenizer_name: meta/llama-4-scout-17b-16e-instruct
- max_sequence_length: 524288
+ - name: TucanoBR/Tucano-2b4
+ model_name: TucanoBR/Tucano-2b4
+ tokenizer_name: TucanoBR/Tucano-2b4
+ max_sequence_length: 4096
  client_spec:
- class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
  args:
- base_url: "{endpoint}/llama4-maverick/v1"
+ pretrained_model_name_or_path: TucanoBR/Tucano-2b4

- - name: stanfordhealthcare/phi-3.5-mini-instruct
- model_name: microsoft/phi-3.5-mini-instruct
- tokenizer_name: microsoft/phi-3.5-mini-instruct
- max_sequence_length: 131072
+ - name: nicholasKluge/TeenyTinyLlama-460m
+ model_name: nicholasKluge/TeenyTinyLlama-460m
+ tokenizer_name: nicholasKluge/TeenyTinyLlama-460m
+ max_sequence_length: 2048
  client_spec:
- class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
  args:
- base_url: "{endpoint}/phi35mi/v1"
-
- - name: stanfordhealthcare_shc/gpt-4o-2024-05-13
- model_name: openai/gpt-4o-2024-05-13
- tokenizer_name: openai/o200k_base
- max_sequence_length: 128000
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
- deployment: gpt-4o
-
- - name: stanfordhealthcare_shc/gpt-4o-mini-2024-07-18
- model_name: openai/gpt-4o-mini-2024-07-18
- tokenizer_name: openai/o200k_base
- max_sequence_length: 128000
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
- deployment: gpt-4o-mini
-
- - name: stanfordhealthcare_shc/gpt-4-turbo-2024-04-09
- model_name: openai/gpt-4-turbo-2024-04-09
- tokenizer_name: openai/cl100k_base
- max_sequence_length: 128000
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
- deployment: gpt-4-turbo-2024-04-09
+ pretrained_model_name_or_path: nicholasKluge/TeenyTinyLlama-460m
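
One detail worth flagging from the first hunk: the Stanford Health Care deployments are added twice, with the same entries appearing back to back in the added block. A quick check like the one below (a hypothetical helper, assuming only PyYAML and the name field shown in the diff) would list any repeated deployment names in a local copy of the config.

```python
# Hypothetical helper: report deployment names that occur more than once in a
# model_deployments.yaml-style file. Uses only PyYAML and the standard library;
# illustrative, not part of crfm-helm.
from collections import Counter
from typing import List

import yaml


def duplicate_deployment_names(path: str) -> List[str]:
    with open(path) as f:
        config = yaml.safe_load(f)
    names = [entry["name"] for entry in config.get("model_deployments", [])]
    return [name for name, count in Counter(names).items() if count > 1]


if __name__ == "__main__":
    # Path is an assumption: point it at a local checkout of the config file.
    for name in duplicate_deployment_names("helm/config/model_deployments.yaml"):
        print(f"duplicate deployment: {name}")
```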