crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (311)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/classification_metrics.py +19 -1
  27. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  28. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  29. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  30. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  31. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  32. helm/benchmark/metrics/comet_metric.py +1 -1
  33. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  34. helm/benchmark/metrics/copyright_metrics.py +1 -1
  35. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  36. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  37. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  38. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  39. helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
  40. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  41. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  42. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  43. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  44. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  45. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  46. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  47. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  48. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  49. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  50. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  51. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  52. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  53. helm/benchmark/metrics/medec_metrics.py +25 -2
  54. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  55. helm/benchmark/metrics/metric.py +25 -0
  56. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  57. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  58. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  59. helm/benchmark/metrics/summac/model_summac.py +3 -3
  60. helm/benchmark/metrics/summarization_metrics.py +129 -1
  61. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  62. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  63. helm/benchmark/model_deployment_registry.py +11 -19
  64. helm/benchmark/presentation/create_plots.py +11 -2
  65. helm/benchmark/presentation/schema.py +10 -22
  66. helm/benchmark/presentation/summarize.py +189 -14
  67. helm/benchmark/presentation/taxonomy_info.py +20 -0
  68. helm/benchmark/presentation/test_create_plots.py +4 -1
  69. helm/benchmark/run.py +7 -1
  70. helm/benchmark/run_expander.py +4 -0
  71. helm/benchmark/run_specs/arabic_run_specs.py +191 -0
  72. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  73. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  74. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  75. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  76. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  77. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  78. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  79. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  80. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  81. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  82. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  83. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  84. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  85. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  86. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  87. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  88. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  89. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  90. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  91. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  92. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  93. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  94. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  95. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  96. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  97. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  98. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  99. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  100. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  101. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  102. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  103. helm/benchmark/scenarios/bold_scenario.py +15 -0
  104. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  105. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  106. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  107. helm/benchmark/scenarios/clear_scenario.py +23 -0
  108. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  109. helm/benchmark/scenarios/code_scenario.py +28 -0
  110. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  111. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  112. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  113. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  114. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  115. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  116. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  117. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  118. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  119. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  120. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  121. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  122. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  123. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  124. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  125. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  126. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  127. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  128. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  129. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  130. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  131. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  132. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  133. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  134. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  135. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  136. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  137. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  138. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  139. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  140. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  141. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  142. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  143. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  144. helm/benchmark/scenarios/ice_scenario.py +21 -1
  145. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  146. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  147. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  148. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  149. helm/benchmark/scenarios/koala_scenario.py +21 -1
  150. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  151. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  152. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  153. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  154. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  155. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  156. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  157. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  158. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  159. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  160. helm/benchmark/scenarios/math_scenario.py +47 -20
  161. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  162. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  163. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  164. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  165. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  166. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  167. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  168. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  169. helm/benchmark/scenarios/medec_scenario.py +23 -0
  170. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  171. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  172. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  173. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  174. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  175. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  176. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  177. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  178. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  179. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  180. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  181. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  182. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  183. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  184. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  185. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  186. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  187. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  188. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  189. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  190. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  191. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  192. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  193. helm/benchmark/scenarios/quac_scenario.py +14 -0
  194. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  195. helm/benchmark/scenarios/raft_scenario.py +15 -0
  196. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  197. helm/benchmark/scenarios/scenario.py +31 -0
  198. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  199. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  200. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  201. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  202. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  203. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  204. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  205. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  206. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  207. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  208. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  209. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  210. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  211. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  212. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  213. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  214. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  215. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  216. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  217. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  218. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  219. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  220. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  221. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  222. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  223. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  224. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  225. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  226. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  227. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  228. helm/benchmark/slurm_jobs.py +1 -2
  229. helm/benchmark/slurm_runner.py +8 -1
  230. helm/benchmark/static/schema_arabic.yaml +271 -0
  231. helm/benchmark/static/schema_classic.yaml +0 -17
  232. helm/benchmark/static/schema_long_context.yaml +24 -6
  233. helm/benchmark/static/schema_medhelm.yaml +36 -0
  234. helm/benchmark/static/schema_slp.yaml +219 -0
  235. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  236. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  237. helm/benchmark/static_build/index.html +2 -2
  238. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  239. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  240. helm/clients/audio_language/llama_omni/constants.py +9 -0
  241. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  242. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  243. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  244. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  245. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  246. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  247. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  248. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  249. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  250. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  251. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  252. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  253. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  254. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  255. helm/clients/audio_language/llama_omni/utils.py +202 -0
  256. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  257. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  258. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  259. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  260. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  261. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  262. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  263. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  264. helm/clients/huggingface_client.py +2 -2
  265. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  266. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  267. helm/clients/openai_client.py +33 -20
  268. helm/clients/openai_responses_client.py +34 -8
  269. helm/clients/openrouter_client.py +31 -0
  270. helm/clients/test_huggingface_client.py +3 -3
  271. helm/clients/test_openrouter_client.py +69 -0
  272. helm/clients/together_client.py +48 -13
  273. helm/clients/vertexai_client.py +19 -11
  274. helm/clients/vllm_client.py +43 -7
  275. helm/clients/vllm_granite_thinking_client.py +56 -0
  276. helm/common/critique_request.py +0 -1
  277. helm/common/hierarchical_logger.py +83 -34
  278. helm/common/object_spec.py +23 -8
  279. helm/common/test_logging.py +94 -0
  280. helm/config/model_deployments.yaml +525 -172
  281. helm/config/model_metadata.yaml +185 -10
  282. helm/config/tokenizer_configs.yaml +100 -2
  283. helm/proxy/cli.py +1 -1
  284. helm/proxy/example_queries.py +8 -8
  285. helm/proxy/retry.py +5 -0
  286. helm/proxy/server.py +2 -1
  287. helm/proxy/static/index.css +4 -0
  288. helm/proxy/static/index.js +7 -1
  289. helm/tokenizers/grok_tokenizer.py +2 -0
  290. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  291. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  292. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  293. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  294. helm/benchmark/metrics/medalign_metrics.py +0 -14
  295. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  296. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  297. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  298. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  299. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  300. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  301. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  302. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  303. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  304. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  305. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  306. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  307. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  308. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  309. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  310. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  311. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
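
The rest of this diff shows the changes to helm/config/model_deployments.yaml (+525 -172), which registers the new model deployments. Every entry in that file follows the same general shape; the sketch below is illustrative only (field names are taken from the entries in the diff, while the deployment name, client, and endpoint value are placeholders rather than an actual new deployment):

  - name: example-org/example-model-v1             # deployment name referenced in run entries
    model_name: example-org/example-model-v1       # typically matches an entry in helm/config/model_metadata.yaml
    tokenizer_name: example-org/example-model-v1   # typically matches an entry in helm/config/tokenizer_configs.yaml
    max_sequence_length: 128000
    client_spec:
      class_name: "helm.clients.openai_client.OpenAIClient"
      args:
        base_url: "https://example.invalid/v1"     # placeholder endpoint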
@@ -16,6 +16,373 @@ model_deployments:
 client_spec:
 class_name: "helm.clients.simple_client.SimpleClient"

+ # Stanford Health Care
+ # For internal use only for MedHELM
+ # Placed earlier in the file to make them non-default
+ - name: stanfordhealthcare/claude-3-5-sonnet-20241022
+ model_name: anthropic/claude-3-5-sonnet-20241022
+ tokenizer_name: anthropic/claude
+ max_sequence_length: 200000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
+ args:
+ model: anthropic.claude-3-5-sonnet-20241022-v2:0
+ deployment: Claude35Sonnetv2/awssig4fa
+
+ - name: stanfordhealthcare/claude-3-7-sonnet-20250219
+ model_name: anthropic/claude-3-7-sonnet-20250219
+ tokenizer_name: anthropic/claude
+ max_sequence_length: 200000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
+ args:
+ model: arn:aws:bedrock:us-west-2:679683451337:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0
+ deployment: awssig4claude37/aswsig4claude37
+
+ - name: stanfordhealthcare/gemini-1.5-pro-001
+ model_name: google/gemini-1.5-pro-001
+ tokenizer_name: google/gemma-2b
+ max_sequence_length: 1000000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
+ args:
+ deployment: gcpgemini/apim-gcp-oauth-fa
+
+ - name: stanfordhealthcare/gemini-2.0-flash-001
+ model_name: google/gemini-2.0-flash-001
+ tokenizer_name: google/gemma-2b
+ max_sequence_length: 1000000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
+ args:
+ deployment: gcp-gem20flash-fa/apim-gcp-gem20flash-fa
+
+ - name: stanfordhealthcare/gpt-4o-mini-2024-07-18
+ model_name: openai/gpt-4o-mini-2024-07-18
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4o-mini
+ api_version: 2023-05-15
+
+ - name: stanfordhealthcare/gpt-4o-2024-05-13
+ model_name: openai/gpt-4o-2024-05-13
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4o
+ api_version: 2023-05-15
+
+ - name: stanfordhealthcare/gpt-4-0613
+ model_name: openai/gpt-4-0613
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 8192
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4
+ api_version: 2023-05-15
+
+ - name: stanfordhealthcare/gpt-4-turbo-2024-04-09
+ model_name: openai/gpt-4-turbo-2024-04-09
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4-turbo
+ api_version: 2023-05-15
+
+ - name: stanfordhealthcare/gpt-4.1-2025-04-14
+ model_name: openai/gpt-4.1-2025-04-14
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 1047576
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4.1
+ api_version: 2025-01-01-preview
+ base_url: "{endpoint}/openai-eastus2"
+
+ - name: stanfordhealthcare/o3-mini-2025-01-31
+ model_name: openai/o3-mini-2025-01-31
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 200000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: o3-mini
+ api_version: 2024-12-01-preview
+ base_url: "{endpoint}/openai-eastus2"
+
+ - name: stanfordhealthcare/o1-2024-12-17
+ model_name: openai/o1-2024-12-17
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: o1
+ api_version: 2024-12-01-preview
+ base_url: "{endpoint}/openai-eastus2"
+
+ - name: stanfordhealthcare/deepseek-r1
+ model_name: deepseek-ai/deepseek-r1
+ tokenizer_name: deepseek-ai/deepseek-r1
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ openai_model_name: deepseek-chat
+ output_processor: helm.benchmark.metrics.output_processors.remove_deepseek_r1_thinking
+ base_url: "{endpoint}/deepseekr1/v1"
+
+ - name: stanfordhealthcare/llama-3.3-70b-instruct
+ model_name: meta/llama-3.3-70b-instruct
+ tokenizer_name: meta/llama-3.3-70b-instruct
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ base_url: "{endpoint}/llama3370b/v1"
+
+ - name: stanfordhealthcare/llama-4-scout-17b-16e-instruct
+ model_name: meta/llama-4-scout-17b-16e-instruct
+ tokenizer_name: meta/llama-4-scout-17b-16e-instruct
+ max_sequence_length: 327680
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ base_url: "{endpoint}/llama4-scout/v1"
+
+ - name: stanfordhealthcare/llama-4-maverick-17b-128e-instruct-fp8
+ model_name: meta/llama-4-maverick-17b-128e-instruct-fp8
+ tokenizer_name: meta/llama-4-scout-17b-16e-instruct
+ max_sequence_length: 524288
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ base_url: "{endpoint}/llama4-maverick/v1"
+
+ - name: stanfordhealthcare/phi-3.5-mini-instruct
+ model_name: microsoft/phi-3.5-mini-instruct
+ tokenizer_name: microsoft/phi-3.5-mini-instruct
+ max_sequence_length: 131072
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ base_url: "{endpoint}/phi35mi/v1"
+
+ - name: stanfordhealthcare_shc/gpt-4o-2024-05-13
+ model_name: openai/gpt-4o-2024-05-13
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
+ deployment: gpt-4o
+
+ - name: stanfordhealthcare_shc/gpt-4o-mini-2024-07-18
+ model_name: openai/gpt-4o-mini-2024-07-18
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
+ deployment: gpt-4o-mini
+
+ - name: stanfordhealthcare_shc/gpt-4-turbo-2024-04-09
+ model_name: openai/gpt-4-turbo-2024-04-09
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
+ deployment: gpt-4-turbo-2024-04-09
+
+ - name: stanfordhealthcare/claude-3-5-sonnet-20241022
+ model_name: anthropic/claude-3-5-sonnet-20241022
+ tokenizer_name: anthropic/claude
+ max_sequence_length: 200000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
+ args:
+ model: anthropic.claude-3-5-sonnet-20241022-v2:0
+ deployment: Claude35Sonnetv2/awssig4fa
+
+ - name: stanfordhealthcare/claude-3-7-sonnet-20250219
+ model_name: anthropic/claude-3-7-sonnet-20250219
+ tokenizer_name: anthropic/claude
+ max_sequence_length: 200000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
+ args:
+ model: arn:aws:bedrock:us-west-2:679683451337:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0
+ deployment: awssig4claude37/aswsig4claude37
+
+ - name: stanfordhealthcare/gemini-1.5-pro-001
+ model_name: google/gemini-1.5-pro-001
+ tokenizer_name: google/gemma-2b
+ max_sequence_length: 1000000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
+ args:
+ deployment: gcpgemini/apim-gcp-oauth-fa
+
+ - name: stanfordhealthcare/gemini-2.0-flash-001
+ model_name: google/gemini-2.0-flash-001
+ tokenizer_name: google/gemma-2b
+ max_sequence_length: 1000000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
+ args:
+ deployment: gcp-gem20flash-fa/apim-gcp-gem20flash-fa
+
+ - name: stanfordhealthcare/gpt-4o-mini-2024-07-18
+ model_name: openai/gpt-4o-mini-2024-07-18
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4o-mini
+ api_version: 2023-05-15
+
+ - name: stanfordhealthcare/gpt-4o-2024-05-13
+ model_name: openai/gpt-4o-2024-05-13
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4o
+ api_version: 2023-05-15
+
+ - name: stanfordhealthcare/gpt-4-0613
+ model_name: openai/gpt-4-0613
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 8192
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4
+ api_version: 2023-05-15
+
+ - name: stanfordhealthcare/gpt-4-turbo-2024-04-09
+ model_name: openai/gpt-4-turbo-2024-04-09
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4-turbo
+ api_version: 2023-05-15
+
+ - name: stanfordhealthcare/gpt-4.1-2025-04-14
+ model_name: openai/gpt-4.1-2025-04-14
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 1047576
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4.1
+ api_version: 2025-01-01-preview
+ base_url: "{endpoint}/openai-eastus2"
+
+ - name: stanfordhealthcare/o3-mini-2025-01-31
+ model_name: openai/o3-mini-2025-01-31
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 200000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: o3-mini
+ api_version: 2024-12-01-preview
+ base_url: "{endpoint}/openai-eastus2"
+
+ - name: stanfordhealthcare/o1-2024-12-17
+ model_name: openai/o1-2024-12-17
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: o1
+ api_version: 2024-12-01-preview
+ base_url: "{endpoint}/openai-eastus2"
+
+ - name: stanfordhealthcare/deepseek-r1
+ model_name: deepseek-ai/deepseek-r1
+ tokenizer_name: deepseek-ai/deepseek-r1
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ openai_model_name: deepseek-chat
+ output_processor: helm.benchmark.metrics.output_processors.remove_deepseek_r1_thinking
+ base_url: "{endpoint}/deepseekr1/v1"
+
+ - name: stanfordhealthcare/llama-3.3-70b-instruct
+ model_name: meta/llama-3.3-70b-instruct
+ tokenizer_name: meta/llama-3.3-70b-instruct
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ base_url: "{endpoint}/llama3370b/v1"
+
+ - name: stanfordhealthcare/llama-4-scout-17b-16e-instruct
+ model_name: meta/llama-4-scout-17b-16e-instruct
+ tokenizer_name: meta/llama-4-scout-17b-16e-instruct
+ max_sequence_length: 327680
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ base_url: "{endpoint}/llama4-scout/v1"
+
+ - name: stanfordhealthcare/llama-4-maverick-17b-128e-instruct-fp8
+ model_name: meta/llama-4-maverick-17b-128e-instruct-fp8
+ tokenizer_name: meta/llama-4-scout-17b-16e-instruct
+ max_sequence_length: 524288
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ base_url: "{endpoint}/llama4-maverick/v1"
+
+ - name: stanfordhealthcare/phi-3.5-mini-instruct
+ model_name: microsoft/phi-3.5-mini-instruct
+ tokenizer_name: microsoft/phi-3.5-mini-instruct
+ max_sequence_length: 131072
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ base_url: "{endpoint}/phi35mi/v1"
+
+ - name: stanfordhealthcare_shc/gpt-4o-2024-05-13
+ model_name: openai/gpt-4o-2024-05-13
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
+ deployment: gpt-4o
+
+ - name: stanfordhealthcare_shc/gpt-4o-mini-2024-07-18
+ model_name: openai/gpt-4o-mini-2024-07-18
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
+ deployment: gpt-4o-mini
+
+ - name: stanfordhealthcare_shc/gpt-4-turbo-2024-04-09
+ model_name: openai/gpt-4-turbo-2024-04-09
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
+ deployment: gpt-4-turbo-2024-04-09
+
 # Adobe
 - name: adobe/giga-gan
 model_name: adobe/giga-gan
@@ -721,6 +1088,14 @@ model_deployments:
 # - https://cloud.google.com/vertex-ai/generative-ai/docs/learn/locations#global-endpoint
 location: global

+ - name: google/gemini-2.5-flash-lite
+ model_name: google/gemini-2.5-flash-lite
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
+ max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash
+ # TODO: Max output tokens: 65536
+ client_spec:
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
+
 - name: google/gemini-2.5-flash-preview-04-17
 model_name: google/gemini-2.5-flash-preview-04-17
 tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
@@ -1438,6 +1813,14 @@ model_deployments:
 client_spec:
 class_name: "helm.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient"

+ ## Moonshot AI
+ - name: together/kimi-k2-instruct
+ model_name: moonshotai/kimi-k2-instruct
+ tokenizer_name: moonshotai/kimi-k2-instruct
+ max_sequence_length: 131072
+ client_spec:
+ class_name: "helm.clients.together_client.TogetherChatClient"
+
 ## MosaicML
 - name: huggingface/mpt-7b
 model_name: mosaicml/mpt-7b
@@ -2241,6 +2624,27 @@ model_deployments:
 client_spec:
 class_name: "helm.clients.openai_client.OpenAIClient"

+ - name: openai/gpt-5-2025-08-07
+ model_name: openai/gpt-5-2025-08-07
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 400000
+ client_spec:
+ class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
+
+ - name: openai/gpt-5-mini-2025-08-07
+ model_name: openai/gpt-5-mini-2025-08-07
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 400000
+ client_spec:
+ class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
+
+ - name: openai/gpt-5-nano-2025-08-07
+ model_name: openai/gpt-5-nano-2025-08-07
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 400000
+ client_spec:
+ class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
+
 - name: openai/whisper-1_gpt-4o-2024-11-20
 model_name: openai/whisper-1_gpt-4o-2024-11-20
 tokenizer_name: openai/o200k_base
@@ -2472,6 +2876,36 @@ model_deployments:
 openai_model_name: o4-mini-2025-04-16
 reasoning_effort: high

+
+ - name: openai/o3-pro-2025-06-10-high-reasoning-effort
+ model_name: openai/o3-pro-2025-06-10-high-reasoning-effort
+ tokenizer_name: openai/cl100k_base
+ # Source: https://platform.openai.com/docs/models/o3-pro
+ max_sequence_length: 200000
+ # TODO: max_output_tokens: 100000
+ client_spec:
+ class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
+ args:
+ openai_model_name: o3-pro-2025-06-10
+ reasoning_effort: high
+
+ ## GPT-OSS
+ - name: together/gpt-oss-20b
+ model_name: openai/gpt-oss-20b
+ tokenizer_name: openai/o200k_harmony
+ # Source: https://platform.openai.com/docs/models/gpt-oss-20b
+ max_sequence_length: 131072
+ client_spec:
+ class_name: "helm.clients.together_client.TogetherChatClient"
+
+ - name: together/gpt-oss-120b
+ model_name: openai/gpt-oss-120b
+ tokenizer_name: openai/o200k_harmony
+ # Source: https://platform.openai.com/docs/models/gpt-oss-120b
+ max_sequence_length: 131072
+ client_spec:
+ class_name: "helm.clients.together_client.TogetherChatClient"
+
 ## Text Similarity Models
 # OpenAI similarity embedding models: https://beta.openai.com/docs/guides/embeddings
 # The number of parameters is guessed based on the number of parameters of the
@@ -3153,6 +3587,16 @@ model_deployments:
 args:
 together_model: togethercomputer/RedPajama-INCITE-7B-Instruct

+ ## Z.ai
+ - name: together/glm-4.5-air-fp8
+ model_name: zai-org/glm-4.5-air-fp8
+ tokenizer_name: zai-org/glm-4.5-air-fp8
+ max_sequence_length: 131072
+ client_spec:
+ class_name: "helm.clients.together_client.TogetherChatClient"
+ args:
+ parse_thinking: true
+
 - name: thudm/cogview2
 model_name: thudm/cogview2
 tokenizer_name: openai/clip-vit-large-patch14
@@ -3308,6 +3752,15 @@ model_deployments:
 window_service_spec:
 class_name: "helm.benchmark.window_services.no_decoding_window_service.NoDecodingWindowService"

+ - name: xai/grok-4-0709
+ model_name: xai/grok-4-0709
+ tokenizer_name: xai/grok-4-0709
+ max_sequence_length: 256000
+ client_spec:
+ class_name: "helm.clients.grok_client.GrokChatClient"
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.no_decoding_window_service.NoDecodingWindowService"
+
 # Qwen

 - name: together/qwen-7b
@@ -3419,7 +3872,16 @@ model_deployments:
 class_name: "helm.clients.together_client.TogetherChatClient"
 args:
 parse_thinking: true
-
+
+ - name: together/qwen3-235b-a22b-instruct-2507-fp8
+ model_name: qwen/qwen3-235b-a22b-instruct-2507-fp8
+ tokenizer_name: qwen/qwen3-235b-a22b-instruct-2507-fp8
+ max_sequence_length: 262144
+ client_spec:
+ class_name: "helm.clients.together_client.TogetherChatClient"
+ args:
+ together_model: Qwen/Qwen3-235B-A22B-Instruct-2507-tput
+
 - name: huggingface/qwen2.5-7b-instruct-4bit
 model_name: qwen/qwen2.5-7b-instruct
 tokenizer_name: qwen/qwen2.5-7b-instruct
@@ -3728,6 +4190,7 @@ model_deployments:
 args:
 pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base

+ # Maritaca AI
 - name: huggingface/sabia-7b
 model_name: maritaca-ai/sabia-7b
 tokenizer_name: maritaca-ai/sabia-7b
@@ -3737,6 +4200,27 @@ model_deployments:
 args:
 pretrained_model_name_or_path: maritaca-ai/sabia-7b

+ - name: maritaca-ai/sabiazinho-3
+ model_name: maritaca-ai/sabiazinho-3
+ tokenizer_name: maritaca-ai/sabia-2-tokenizer-medium
+ max_sequence_length: 32000
+ client_spec:
+ class_name: "helm.clients.openai_client.OpenAIClient"
+
+ - name: maritaca-ai/sabia-3
+ model_name: maritaca-ai/sabia-3
+ tokenizer_name: maritaca-ai/sabia-2-tokenizer-medium
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.openai_client.OpenAIClient"
+
+ - name: maritaca-ai/sabia-3.1-2025-05-08
+ model_name: maritaca-ai/sabia-3.1-2025-05-08
+ tokenizer_name: maritaca-ai/sabia-2-tokenizer-medium
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.openai_client.OpenAIClient"
+
 # Granite-3.1-8b-base
 - name: huggingface/granite-3.1-8b-base
 model_name: ibm-granite/granite-3.1-8b-base
@@ -3918,16 +4402,6 @@ model_deployments:
 watsonx_model_name: ibm/granite-8b-code-instruct
 region: Dallas

- - name: ibm/mixtral-8x7b-instruct-v0:1
- model_name: mistralai/mixtral-8x7b-instruct-v0:1
- tokenizer_name: huggingface/gpt2
- max_sequence_length: 4000
- client_spec:
- class_name: "helm.clients.ibm_client.IbmChatClient"
- args:
- watsonx_model_name: mistralai/mixtral-8x7b-instruct-v01
- region: Dallas
-
 - name: ibm/granite-3.3-8b-instruct
 model_name: ibm/granite-3.3-8b-instruct
 tokenizer_name: ibm/granite-3.3-8b-instruct
@@ -4128,186 +4602,65 @@ model_deployments:
 args:
 pretrained_model_name_or_path: vinai/PhoGPT-4B-Chat

- # Stanford Health Care
- # Placed later in the file to make them non-default
- - name: stanfordhealthcare/claude-3-5-sonnet-20241022
- model_name: anthropic/claude-3-5-sonnet-20241022
- tokenizer_name: anthropic/claude
- max_sequence_length: 200000
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
- args:
- model: anthropic.claude-3-5-sonnet-20241022-v2:0
- deployment: Claude35Sonnetv2/awssig4fa
-
- - name: stanfordhealthcare/claude-3-7-sonnet-20250219
- model_name: anthropic/claude-3-7-sonnet-20250219
- tokenizer_name: anthropic/claude
- max_sequence_length: 200000
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
- args:
- model: arn:aws:bedrock:us-west-2:679683451337:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0
- deployment: awssig4claude37/aswsig4claude37
-
- - name: stanfordhealthcare/gemini-1.5-pro-001
- model_name: google/gemini-1.5-pro-001
- tokenizer_name: google/gemma-2b
- max_sequence_length: 1000000
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
- args:
- deployment: gcpgemini/apim-gcp-oauth-fa
-
- - name: stanfordhealthcare/gemini-2.0-flash-001
- model_name: google/gemini-2.0-flash-001
- tokenizer_name: google/gemma-2b
- max_sequence_length: 1000000
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
- args:
- deployment: gcp-gem20flash-fa/apim-gcp-gem20flash-fa
-
- - name: stanfordhealthcare/gpt-4o-mini-2024-07-18
- model_name: openai/gpt-4o-mini-2024-07-18
- tokenizer_name: openai/o200k_base
- max_sequence_length: 128000
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
- args:
- openai_model_name: gpt-4o-mini
- api_version: 2023-05-15
-
- - name: stanfordhealthcare/gpt-4o-2024-05-13
- model_name: openai/gpt-4o-2024-05-13
- tokenizer_name: openai/o200k_base
+ - name: huggingface/Gemma-3-Gaia-PT-BR-4b-it
+ model_name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
+ tokenizer_name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
 max_sequence_length: 128000
 client_spec:
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
- args:
- openai_model_name: gpt-4o
- api_version: 2023-05-15
-
- - name: stanfordhealthcare/gpt-4-0613
- model_name: openai/gpt-4-0613
- tokenizer_name: openai/o200k_base
- max_sequence_length: 8192
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
 args:
- openai_model_name: gpt-4
- api_version: 2023-05-15
+ pretrained_model_name_or_path: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it

- - name: stanfordhealthcare/gpt-4-turbo-2024-04-09
- model_name: openai/gpt-4-turbo-2024-04-09
- tokenizer_name: openai/cl100k_base
- max_sequence_length: 128000
+ - name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
+ model_name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
+ tokenizer_name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
+ max_sequence_length: 4094
 client_spec:
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
 args:
- openai_model_name: gpt-4-turbo
- api_version: 2023-05-15
+ pretrained_model_name_or_path: recogna-nlp/bode-13b-alpaca-pt-br-no-peft

- - name: stanfordhealthcare/gpt-4.1-2025-04-14
- model_name: openai/gpt-4.1-2025-04-14
- tokenizer_name: openai/o200k_base
- max_sequence_length: 1047576
+ - name: 22h/cabrita_7b_pt_850000
+ model_name: 22h/cabrita_7b_pt_850000
+ tokenizer_name: 22h/cabrita_7b_pt_850000
+ max_sequence_length: 4094
 client_spec:
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
 args:
- openai_model_name: gpt-4.1
- api_version: 2025-01-01-preview
- base_url: "{endpoint}/openai-eastus2"
+ pretrained_model_name_or_path: 22h/cabrita_7b_pt_850000

- - name: stanfordhealthcare/o3-mini-2025-01-31
- model_name: openai/o3-mini-2025-01-31
- tokenizer_name: openai/cl100k_base
- max_sequence_length: 200000
+ - name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
+ model_name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
+ tokenizer_name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
+ max_sequence_length: 4096
 client_spec:
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
 args:
- openai_model_name: o3-mini
- api_version: 2024-12-01-preview
- base_url: "{endpoint}/openai-eastus2"
+ pretrained_model_name_or_path: PORTULAN/gervasio-7b-portuguese-ptbr-decoder

- - name: stanfordhealthcare/o1-2024-12-17
- model_name: openai/o1-2024-12-17
- tokenizer_name: openai/cl100k_base
- max_sequence_length: 128000
+ - name: TucanoBR/Tucano-2b4
+ model_name: TucanoBR/Tucano-2b4
+ tokenizer_name: TucanoBR/Tucano-2b4
+ max_sequence_length: 4096
 client_spec:
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
 args:
- openai_model_name: o1
- api_version: 2024-12-01-preview
- base_url: "{endpoint}/openai-eastus2"
+ pretrained_model_name_or_path: TucanoBR/Tucano-2b4

- - name: stanfordhealthcare/deepseek-r1
- model_name: deepseek-ai/deepseek-r1
- tokenizer_name: deepseek-ai/deepseek-r1
- max_sequence_length: 128000
+ - name: nicholasKluge/TeenyTinyLlama-460m
+ model_name: nicholasKluge/TeenyTinyLlama-460m
+ tokenizer_name: nicholasKluge/TeenyTinyLlama-460m
+ max_sequence_length: 2048
 client_spec:
- class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
 args:
- openai_model_name: deepseek-chat
- output_processor: helm.benchmark.metrics.output_processors.remove_deepseek_r1_thinking
- base_url: "{endpoint}/deepseekr1/v1"
+ pretrained_model_name_or_path: nicholasKluge/TeenyTinyLlama-460m

- - name: stanfordhealthcare/llama-3.3-70b-instruct
- model_name: meta/llama-3.3-70b-instruct
- tokenizer_name: meta/llama-3.3-70b-instruct
+ - name: openrouter/mistral-medium-3.1
+ model_name: mistralai/mistral-medium-3.1
+ tokenizer_name: mistralai/Mistral-7B-v0.1
 max_sequence_length: 128000
 client_spec:
- class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
- args:
- base_url: "{endpoint}/llama3370b/v1"
-
- - name: stanfordhealthcare/llama-4-scout-17b-16e-instruct
- model_name: meta/llama-4-scout-17b-16e-instruct
- tokenizer_name: meta/llama-4-scout-17b-16e-instruct
- max_sequence_length: 327680
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
- args:
- base_url: "{endpoint}/llama4-scout/v1"
-
- - name: stanfordhealthcare/llama-4-maverick-17b-128e-instruct-fp8
- model_name: meta/llama-4-maverick-17b-128e-instruct-fp8
- tokenizer_name: meta/llama-4-scout-17b-16e-instruct
- max_sequence_length: 524288
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
- args:
- base_url: "{endpoint}/llama4-maverick/v1"
-
- - name: stanfordhealthcare/phi-3.5-mini-instruct
- model_name: microsoft/phi-3.5-mini-instruct
- tokenizer_name: microsoft/phi-3.5-mini-instruct
- max_sequence_length: 131072
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ class_name: "helm.clients.openrouter_client.OpenRouterClient"
 args:
- base_url: "{endpoint}/phi35mi/v1"
-
- - name: stanfordhealthcare_shc/gpt-4o-2024-05-13
- model_name: openai/gpt-4o-2024-05-13
- tokenizer_name: openai/o200k_base
- max_sequence_length: 128000
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
- deployment: gpt-4o
-
- - name: stanfordhealthcare_shc/gpt-4o-mini-2024-07-18
- model_name: openai/gpt-4o-mini-2024-07-18
- tokenizer_name: openai/o200k_base
- max_sequence_length: 128000
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
- deployment: gpt-4o-mini
-
- - name: stanfordhealthcare_shc/gpt-4-turbo-2024-04-09
- model_name: openai/gpt-4-turbo-2024-04-09
- tokenizer_name: openai/cl100k_base
- max_sequence_length: 128000
- client_spec:
- class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
- deployment: gpt-4-turbo-2024-04-09
+ model_name: mistralai/mistral-medium-3.1
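
Deployments registered in helm/config/model_deployments.yaml are usually paired with a matching entry in helm/config/model_metadata.yaml (also changed in this release, +185 -10). Below is a minimal sketch of such a companion entry, with field names taken from existing metadata entries and all values illustrative (the model shown is a placeholder, not one of the new models in this diff):

  - name: example-org/example-model-v1
    display_name: Example Model v1
    description: Placeholder description of the model.
    creator_organization_name: Example Org
    access: limited
    release_date: 2025-01-01
    tags: [TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]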