crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release has been flagged as potentially problematic. See the package registry's advisory page for crfm-helm for more details.

Files changed (394):
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
@@ -16,6 +16,373 @@ model_deployments:
   client_spec:
   class_name: "helm.clients.simple_client.SimpleClient"

+ # Stanford Health Care
+ # For internal use only for MedHELM
+ # Placed earlier in the file to make them non-default
+ - name: stanfordhealthcare/claude-3-5-sonnet-20241022
+ model_name: anthropic/claude-3-5-sonnet-20241022
+ tokenizer_name: anthropic/claude
+ max_sequence_length: 200000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
+ args:
+ model: anthropic.claude-3-5-sonnet-20241022-v2:0
+ deployment: Claude35Sonnetv2/awssig4fa
+
+ - name: stanfordhealthcare/claude-3-7-sonnet-20250219
+ model_name: anthropic/claude-3-7-sonnet-20250219
+ tokenizer_name: anthropic/claude
+ max_sequence_length: 200000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
+ args:
+ model: arn:aws:bedrock:us-west-2:679683451337:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0
+ deployment: awssig4claude37/aswsig4claude37
+
+ - name: stanfordhealthcare/gemini-1.5-pro-001
+ model_name: google/gemini-1.5-pro-001
+ tokenizer_name: google/gemma-2b
+ max_sequence_length: 1000000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
+ args:
+ deployment: gcpgemini/apim-gcp-oauth-fa
+
+ - name: stanfordhealthcare/gemini-2.0-flash-001
+ model_name: google/gemini-2.0-flash-001
+ tokenizer_name: google/gemma-2b
+ max_sequence_length: 1000000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
+ args:
+ deployment: gcp-gem20flash-fa/apim-gcp-gem20flash-fa
+
+ - name: stanfordhealthcare/gpt-4o-mini-2024-07-18
+ model_name: openai/gpt-4o-mini-2024-07-18
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4o-mini
+ api_version: 2023-05-15
+
+ - name: stanfordhealthcare/gpt-4o-2024-05-13
+ model_name: openai/gpt-4o-2024-05-13
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4o
+ api_version: 2023-05-15
+
+ - name: stanfordhealthcare/gpt-4-0613
+ model_name: openai/gpt-4-0613
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 8192
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4
+ api_version: 2023-05-15
+
+ - name: stanfordhealthcare/gpt-4-turbo-2024-04-09
+ model_name: openai/gpt-4-turbo-2024-04-09
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4-turbo
+ api_version: 2023-05-15
+
+ - name: stanfordhealthcare/gpt-4.1-2025-04-14
+ model_name: openai/gpt-4.1-2025-04-14
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 1047576
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: gpt-4.1
+ api_version: 2025-01-01-preview
+ base_url: "{endpoint}/openai-eastus2"
+
+ - name: stanfordhealthcare/o3-mini-2025-01-31
+ model_name: openai/o3-mini-2025-01-31
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 200000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: o3-mini
+ api_version: 2024-12-01-preview
+ base_url: "{endpoint}/openai-eastus2"
+
+ - name: stanfordhealthcare/o1-2024-12-17
+ model_name: openai/o1-2024-12-17
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
+ args:
+ openai_model_name: o1
+ api_version: 2024-12-01-preview
+ base_url: "{endpoint}/openai-eastus2"
+
+ - name: stanfordhealthcare/deepseek-r1
+ model_name: deepseek-ai/deepseek-r1
+ tokenizer_name: deepseek-ai/deepseek-r1
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ openai_model_name: deepseek-chat
+ output_processor: helm.benchmark.metrics.output_processors.remove_deepseek_r1_thinking
+ base_url: "{endpoint}/deepseekr1/v1"
+
+ - name: stanfordhealthcare/llama-3.3-70b-instruct
+ model_name: meta/llama-3.3-70b-instruct
+ tokenizer_name: meta/llama-3.3-70b-instruct
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ base_url: "{endpoint}/llama3370b/v1"
+
+ - name: stanfordhealthcare/llama-4-scout-17b-16e-instruct
+ model_name: meta/llama-4-scout-17b-16e-instruct
+ tokenizer_name: meta/llama-4-scout-17b-16e-instruct
+ max_sequence_length: 327680
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ base_url: "{endpoint}/llama4-scout/v1"
+
+ - name: stanfordhealthcare/llama-4-maverick-17b-128e-instruct-fp8
+ model_name: meta/llama-4-maverick-17b-128e-instruct-fp8
+ tokenizer_name: meta/llama-4-scout-17b-16e-instruct
+ max_sequence_length: 524288
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ base_url: "{endpoint}/llama4-maverick/v1"
+
+ - name: stanfordhealthcare/phi-3.5-mini-instruct
+ model_name: microsoft/phi-3.5-mini-instruct
+ tokenizer_name: microsoft/phi-3.5-mini-instruct
+ max_sequence_length: 131072
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
+ args:
+ base_url: "{endpoint}/phi35mi/v1"
+
+ - name: stanfordhealthcare_shc/gpt-4o-2024-05-13
+ model_name: openai/gpt-4o-2024-05-13
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
+ deployment: gpt-4o
+
+ - name: stanfordhealthcare_shc/gpt-4o-mini-2024-07-18
+ model_name: openai/gpt-4o-mini-2024-07-18
+ tokenizer_name: openai/o200k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
+ deployment: gpt-4o-mini
+
+ - name: stanfordhealthcare_shc/gpt-4-turbo-2024-04-09
+ model_name: openai/gpt-4-turbo-2024-04-09
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 128000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
+ deployment: gpt-4-turbo-2024-04-09
+
+ - name: stanfordhealthcare/claude-3-5-sonnet-20241022
+ model_name: anthropic/claude-3-5-sonnet-20241022
+ tokenizer_name: anthropic/claude
+ max_sequence_length: 200000
+ client_spec:
+ class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
210
+ args:
211
+ model: anthropic.claude-3-5-sonnet-20241022-v2:0
212
+ deployment: Claude35Sonnetv2/awssig4fa
213
+
214
+ - name: stanfordhealthcare/claude-3-7-sonnet-20250219
215
+ model_name: anthropic/claude-3-7-sonnet-20250219
216
+ tokenizer_name: anthropic/claude
217
+ max_sequence_length: 200000
218
+ client_spec:
219
+ class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
220
+ args:
221
+ model: arn:aws:bedrock:us-west-2:679683451337:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0
222
+ deployment: awssig4claude37/aswsig4claude37
223
+
224
+ - name: stanfordhealthcare/gemini-1.5-pro-001
225
+ model_name: google/gemini-1.5-pro-001
226
+ tokenizer_name: google/gemma-2b
227
+ max_sequence_length: 1000000
228
+ client_spec:
229
+ class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
230
+ args:
231
+ deployment: gcpgemini/apim-gcp-oauth-fa
232
+
233
+ - name: stanfordhealthcare/gemini-2.0-flash-001
234
+ model_name: google/gemini-2.0-flash-001
235
+ tokenizer_name: google/gemma-2b
236
+ max_sequence_length: 1000000
237
+ client_spec:
238
+ class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
239
+ args:
240
+ deployment: gcp-gem20flash-fa/apim-gcp-gem20flash-fa
241
+
242
+ - name: stanfordhealthcare/gpt-4o-mini-2024-07-18
243
+ model_name: openai/gpt-4o-mini-2024-07-18
244
+ tokenizer_name: openai/o200k_base
245
+ max_sequence_length: 128000
246
+ client_spec:
247
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
248
+ args:
249
+ openai_model_name: gpt-4o-mini
250
+ api_version: 2023-05-15
251
+
252
+ - name: stanfordhealthcare/gpt-4o-2024-05-13
253
+ model_name: openai/gpt-4o-2024-05-13
254
+ tokenizer_name: openai/o200k_base
255
+ max_sequence_length: 128000
256
+ client_spec:
257
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
258
+ args:
259
+ openai_model_name: gpt-4o
260
+ api_version: 2023-05-15
261
+
262
+ - name: stanfordhealthcare/gpt-4-0613
263
+ model_name: openai/gpt-4-0613
264
+ tokenizer_name: openai/o200k_base
265
+ max_sequence_length: 8192
266
+ client_spec:
267
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
268
+ args:
269
+ openai_model_name: gpt-4
270
+ api_version: 2023-05-15
271
+
272
+ - name: stanfordhealthcare/gpt-4-turbo-2024-04-09
273
+ model_name: openai/gpt-4-turbo-2024-04-09
274
+ tokenizer_name: openai/cl100k_base
275
+ max_sequence_length: 128000
276
+ client_spec:
277
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
278
+ args:
279
+ openai_model_name: gpt-4-turbo
280
+ api_version: 2023-05-15
281
+
282
+ - name: stanfordhealthcare/gpt-4.1-2025-04-14
283
+ model_name: openai/gpt-4.1-2025-04-14
284
+ tokenizer_name: openai/o200k_base
285
+ max_sequence_length: 1047576
286
+ client_spec:
287
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
288
+ args:
289
+ openai_model_name: gpt-4.1
290
+ api_version: 2025-01-01-preview
291
+ base_url: "{endpoint}/openai-eastus2"
292
+
293
+ - name: stanfordhealthcare/o3-mini-2025-01-31
294
+ model_name: openai/o3-mini-2025-01-31
295
+ tokenizer_name: openai/cl100k_base
296
+ max_sequence_length: 200000
297
+ client_spec:
298
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
299
+ args:
300
+ openai_model_name: o3-mini
301
+ api_version: 2024-12-01-preview
302
+ base_url: "{endpoint}/openai-eastus2"
303
+
304
+ - name: stanfordhealthcare/o1-2024-12-17
305
+ model_name: openai/o1-2024-12-17
306
+ tokenizer_name: openai/cl100k_base
307
+ max_sequence_length: 128000
308
+ client_spec:
309
+ class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
310
+ args:
311
+ openai_model_name: o1
312
+ api_version: 2024-12-01-preview
313
+ base_url: "{endpoint}/openai-eastus2"
314
+
315
+ - name: stanfordhealthcare/deepseek-r1
316
+ model_name: deepseek-ai/deepseek-r1
317
+ tokenizer_name: deepseek-ai/deepseek-r1
318
+ max_sequence_length: 128000
319
+ client_spec:
320
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
321
+ args:
322
+ openai_model_name: deepseek-chat
323
+ output_processor: helm.benchmark.metrics.output_processors.remove_deepseek_r1_thinking
324
+ base_url: "{endpoint}/deepseekr1/v1"
325
+
326
+ - name: stanfordhealthcare/llama-3.3-70b-instruct
327
+ model_name: meta/llama-3.3-70b-instruct
328
+ tokenizer_name: meta/llama-3.3-70b-instruct
329
+ max_sequence_length: 128000
330
+ client_spec:
331
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
332
+ args:
333
+ base_url: "{endpoint}/llama3370b/v1"
334
+
335
+ - name: stanfordhealthcare/llama-4-scout-17b-16e-instruct
336
+ model_name: meta/llama-4-scout-17b-16e-instruct
337
+ tokenizer_name: meta/llama-4-scout-17b-16e-instruct
338
+ max_sequence_length: 327680
339
+ client_spec:
340
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
341
+ args:
342
+ base_url: "{endpoint}/llama4-scout/v1"
343
+
344
+ - name: stanfordhealthcare/llama-4-maverick-17b-128e-instruct-fp8
345
+ model_name: meta/llama-4-maverick-17b-128e-instruct-fp8
346
+ tokenizer_name: meta/llama-4-scout-17b-16e-instruct
347
+ max_sequence_length: 524288
348
+ client_spec:
349
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
350
+ args:
351
+ base_url: "{endpoint}/llama4-maverick/v1"
352
+
353
+ - name: stanfordhealthcare/phi-3.5-mini-instruct
354
+ model_name: microsoft/phi-3.5-mini-instruct
355
+ tokenizer_name: microsoft/phi-3.5-mini-instruct
356
+ max_sequence_length: 131072
357
+ client_spec:
358
+ class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
359
+ args:
360
+ base_url: "{endpoint}/phi35mi/v1"
361
+
362
+ - name: stanfordhealthcare_shc/gpt-4o-2024-05-13
363
+ model_name: openai/gpt-4o-2024-05-13
364
+ tokenizer_name: openai/o200k_base
365
+ max_sequence_length: 128000
366
+ client_spec:
367
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
368
+ deployment: gpt-4o
369
+
370
+ - name: stanfordhealthcare_shc/gpt-4o-mini-2024-07-18
371
+ model_name: openai/gpt-4o-mini-2024-07-18
372
+ tokenizer_name: openai/o200k_base
373
+ max_sequence_length: 128000
374
+ client_spec:
375
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
376
+ deployment: gpt-4o-mini
377
+
378
+ - name: stanfordhealthcare_shc/gpt-4-turbo-2024-04-09
379
+ model_name: openai/gpt-4-turbo-2024-04-09
380
+ tokenizer_name: openai/cl100k_base
381
+ max_sequence_length: 128000
382
+ client_spec:
383
+ class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
384
+ deployment: gpt-4-turbo-2024-04-09
385
+
19
386
  # Adobe
20
387
  - name: adobe/giga-gan
21
388
  model_name: adobe/giga-gan
@@ -363,6 +730,13 @@ model_deployments:
363
730
  thinking_budget_tokens: 10000
364
731
  stream: true
365
732
 
733
+ - name: anthropic/claude-sonnet-4-5-20250929
734
+ model_name: anthropic/claude-sonnet-4-5-20250929
735
+ tokenizer_name: anthropic/claude
736
+ max_sequence_length: 200000
737
+ client_spec:
738
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
739
+
366
740
  - name: anthropic/stanford-online-all-v4-s3
367
741
  deprecated: true # Closed model, not accessible via API
368
742
  model_name: anthropic/stanford-online-all-v4-s3
@@ -494,6 +868,20 @@ model_deployments:
494
868
  parse_thinking: true
495
869
  disable_logprobs: True
496
870
 
871
+ - name: together/deepseek-r1-distill-llama-70b
872
+ model_name: deepseek-ai/deepseek-r1-distill-llama-70b
873
+ tokenizer_name: deepseek-ai/deepseek-r1-distill-llama-70b
874
+ max_sequence_length: 131072
875
+ client_spec:
876
+ class_name: "helm.clients.together_client.TogetherChatClient"
877
+
878
+ - name: together/deepseek-r1-distill-qwen-14b
879
+ model_name: deepseek-ai/deepseek-r1-distill-qwen-14b
880
+ tokenizer_name: deepseek-ai/deepseek-r1-distill-qwen-14b
881
+ max_sequence_length: 131072
882
+ client_spec:
883
+ class_name: "helm.clients.together_client.TogetherChatClient"
884
+
497
885
  # Gooseai
498
886
 
499
887
  # TODO: Migrate these models to use OpenAIClient
@@ -721,6 +1109,14 @@ model_deployments:
721
1109
  # - https://cloud.google.com/vertex-ai/generative-ai/docs/learn/locations#global-endpoint
722
1110
  location: global
723
1111
 
1112
+ - name: google/gemini-2.5-flash-lite
1113
+ model_name: google/gemini-2.5-flash-lite
1114
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
1115
+ max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash
1116
+ # TODO: Max output tokens: 65536
1117
+ client_spec:
1118
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
1119
+
724
1120
  - name: google/gemini-2.5-flash-preview-04-17
725
1121
  model_name: google/gemini-2.5-flash-preview-04-17
726
1122
  tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
@@ -1195,6 +1591,18 @@ model_deployments:
1195
1591
  args:
1196
1592
  device_map: auto
1197
1593
  torch_dtype: torch.bfloat16
1594
+
1595
+ ## Google
1596
+ - name: huggingface/medgemma-4b-it
1597
+ model_name: google/medgemma-4b-it
1598
+ tokenizer_name: google/medgemma-4b-it
1599
+ max_sequence_length: 128000
1600
+ client_spec:
1601
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
1602
+ args:
1603
+ device_map: auto
1604
+ trust_remote_code: true
1605
+ torch_dtype: torch.bfloat16
1198
1606
 
1199
1607
  ## LMSYS
1200
1608
  - name: huggingface/vicuna-7b-v1.3
@@ -1438,6 +1846,14 @@ model_deployments:
1438
1846
  client_spec:
1439
1847
  class_name: "helm.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient"
1440
1848
 
1849
+ ## Moonshot AI
1850
+ - name: together/kimi-k2-instruct
1851
+ model_name: moonshotai/kimi-k2-instruct
1852
+ tokenizer_name: moonshotai/kimi-k2-instruct
1853
+ max_sequence_length: 131072
1854
+ client_spec:
1855
+ class_name: "helm.clients.together_client.TogetherChatClient"
1856
+
1441
1857
  ## MosaicML
1442
1858
  - name: huggingface/mpt-7b
1443
1859
  model_name: mosaicml/mpt-7b
@@ -2241,6 +2657,27 @@ model_deployments:
2241
2657
  client_spec:
2242
2658
  class_name: "helm.clients.openai_client.OpenAIClient"
2243
2659
 
2660
+ - name: openai/gpt-5-2025-08-07
2661
+ model_name: openai/gpt-5-2025-08-07
2662
+ tokenizer_name: openai/o200k_base
2663
+ max_sequence_length: 400000
2664
+ client_spec:
2665
+ class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
2666
+
2667
+ - name: openai/gpt-5-mini-2025-08-07
2668
+ model_name: openai/gpt-5-mini-2025-08-07
2669
+ tokenizer_name: openai/o200k_base
2670
+ max_sequence_length: 400000
2671
+ client_spec:
2672
+ class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
2673
+
2674
+ - name: openai/gpt-5-nano-2025-08-07
2675
+ model_name: openai/gpt-5-nano-2025-08-07
2676
+ tokenizer_name: openai/o200k_base
2677
+ max_sequence_length: 400000
2678
+ client_spec:
2679
+ class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
2680
+
2244
2681
  - name: openai/whisper-1_gpt-4o-2024-11-20
2245
2682
  model_name: openai/whisper-1_gpt-4o-2024-11-20
2246
2683
  tokenizer_name: openai/o200k_base
@@ -2472,6 +2909,36 @@ model_deployments:
2472
2909
  openai_model_name: o4-mini-2025-04-16
2473
2910
  reasoning_effort: high
2474
2911
 
2912
+
2913
+ - name: openai/o3-pro-2025-06-10-high-reasoning-effort
2914
+ model_name: openai/o3-pro-2025-06-10-high-reasoning-effort
2915
+ tokenizer_name: openai/cl100k_base
2916
+ # Source: https://platform.openai.com/docs/models/o3-pro
2917
+ max_sequence_length: 200000
2918
+ # TODO: max_output_tokens: 100000
2919
+ client_spec:
2920
+ class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
2921
+ args:
2922
+ openai_model_name: o3-pro-2025-06-10
2923
+ reasoning_effort: high
2924
+
2925
+ ## GPT-OSS
2926
+ - name: together/gpt-oss-20b
2927
+ model_name: openai/gpt-oss-20b
2928
+ tokenizer_name: openai/o200k_harmony
2929
+ # Source: https://platform.openai.com/docs/models/gpt-oss-20b
2930
+ max_sequence_length: 131072
2931
+ client_spec:
2932
+ class_name: "helm.clients.together_client.TogetherChatClient"
2933
+
2934
+ - name: together/gpt-oss-120b
2935
+ model_name: openai/gpt-oss-120b
2936
+ tokenizer_name: openai/o200k_harmony
2937
+ # Source: https://platform.openai.com/docs/models/gpt-oss-120b
2938
+ max_sequence_length: 131072
2939
+ client_spec:
2940
+ class_name: "helm.clients.together_client.TogetherChatClient"
2941
+
2475
2942
  ## Text Similarity Models
2476
2943
  # OpenAI similarity embedding models: https://beta.openai.com/docs/guides/embeddings
2477
2944
  # The number of parameters is guessed based on the number of parameters of the
@@ -3153,6 +3620,16 @@ model_deployments:
3153
3620
  args:
3154
3621
  together_model: togethercomputer/RedPajama-INCITE-7B-Instruct
3155
3622
 
3623
+ ## Z.ai
3624
+ - name: together/glm-4.5-air-fp8
3625
+ model_name: zai-org/glm-4.5-air-fp8
3626
+ tokenizer_name: zai-org/glm-4.5-air-fp8
3627
+ max_sequence_length: 131072
3628
+ client_spec:
3629
+ class_name: "helm.clients.together_client.TogetherChatClient"
3630
+ args:
3631
+ parse_thinking: true
3632
+
3156
3633
  - name: thudm/cogview2
3157
3634
  model_name: thudm/cogview2
3158
3635
  tokenizer_name: openai/clip-vit-large-patch14
@@ -3257,6 +3734,17 @@ model_deployments:
3257
3734
  client_spec:
3258
3735
  class_name: "helm.clients.writer_client.WriterClient"
3259
3736
 
3737
+ - name: amazon/palmyra-x5-v1-bedrock
3738
+ model_name: writer/palmyra-x5-v1-bedrock
3739
+ # See tokenizer comment for writer/palmyra-x-004
3740
+ tokenizer_name: meta/llama-3-8b
3741
+ max_sequence_length: 1040000
3742
+ client_spec:
3743
+ class_name: "helm.clients.bedrock_client.BedrockPalmyraClient"
3744
+ args:
3745
+ bedrock_model_id: us.writer.palmyra-x5-v1:0
3746
+ region: us-west-2
3747
+
3260
3748
  - name: writer/palmyra-med-32k
3261
3749
  model_name: writer/palmyra-med-32k
3262
3750
  # Palmyra-Med uses the "<|end_of_text|>" as the end of text token, which is used by meta/llama-3-8b,
@@ -3308,6 +3796,15 @@ model_deployments:
3308
3796
  window_service_spec:
3309
3797
  class_name: "helm.benchmark.window_services.no_decoding_window_service.NoDecodingWindowService"
3310
3798
 
3799
+ - name: xai/grok-4-0709
3800
+ model_name: xai/grok-4-0709
3801
+ tokenizer_name: xai/grok-4-0709
3802
+ max_sequence_length: 256000
3803
+ client_spec:
3804
+ class_name: "helm.clients.grok_client.GrokChatClient"
3805
+ window_service_spec:
3806
+ class_name: "helm.benchmark.window_services.no_decoding_window_service.NoDecodingWindowService"
3807
+
3311
3808
  # Qwen
3312
3809
 
3313
3810
  - name: together/qwen-7b
@@ -3419,7 +3916,25 @@ model_deployments:
3419
3916
  class_name: "helm.clients.together_client.TogetherChatClient"
3420
3917
  args:
3421
3918
  parse_thinking: true
3422
-
3919
+
3920
+ - name: together/qwen3-next-80b-a3b-thinking
3921
+ model_name: qwen/qwen3-next-80b-a3b-thinking
3922
+ tokenizer_name: qwen/qwen3-next-80b-a3b-thinking
3923
+ max_sequence_length: 262144
3924
+ client_spec:
3925
+ class_name: "helm.clients.together_client.TogetherChatClient"
3926
+ args:
3927
+ parse_thinking: true
3928
+
3929
+ - name: together/qwen3-235b-a22b-instruct-2507-fp8
3930
+ model_name: qwen/qwen3-235b-a22b-instruct-2507-fp8
3931
+ tokenizer_name: qwen/qwen3-235b-a22b-instruct-2507-fp8
3932
+ max_sequence_length: 262144
3933
+ client_spec:
3934
+ class_name: "helm.clients.together_client.TogetherChatClient"
3935
+ args:
3936
+ together_model: Qwen/Qwen3-235B-A22B-Instruct-2507-tput
3937
+
3423
3938
  - name: huggingface/qwen2.5-7b-instruct-4bit
3424
3939
  model_name: qwen/qwen2.5-7b-instruct
3425
3940
  tokenizer_name: qwen/qwen2.5-7b-instruct
@@ -3655,7 +4170,7 @@ model_deployments:
3655
4170
  client_spec:
3656
4171
  class_name: "helm.clients.audio_language.llama_omni_client.LlamaOmniAudioLMClient"
3657
4172
 
3658
- # IBM - Granite 3.0
4173
+ # IBM Granite
3659
4174
  - name: huggingface/granite-3.0-2b-base
3660
4175
  model_name: ibm-granite/granite-3.0-2b-base
3661
4176
  tokenizer_name: ibm-granite/granite-3.0-2b-base
@@ -3728,6 +4243,25 @@ model_deployments:
3728
4243
  args:
3729
4244
  pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base
3730
4245
 
4246
+ - name: huggingface/granite-4.0-micro
4247
+ model_name: ibm/granite-4.0-micro
4248
+ tokenizer_name: ibm/granite-4.0-micro
4249
+ max_sequence_length: 128000
4250
+ client_spec:
4251
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4252
+ args:
4253
+ pretrained_model_name_or_path: ibm-granite/granite-4.0-micro
4254
+
4255
+ - name: huggingface/granite-4.0-h-small
4256
+ model_name: ibm/granite-4.0-h-small
4257
+ tokenizer_name: ibm/granite-4.0-h-small
4258
+ max_sequence_length: 128000
4259
+ client_spec:
4260
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4261
+ args:
4262
+ pretrained_model_name_or_path: ibm-granite/granite-4.0-h-small
4263
+
4264
+ # Maritaca AI
3731
4265
  - name: huggingface/sabia-7b
3732
4266
  model_name: maritaca-ai/sabia-7b
3733
4267
  tokenizer_name: maritaca-ai/sabia-7b
@@ -3737,6 +4271,27 @@ model_deployments:
3737
4271
  args:
3738
4272
  pretrained_model_name_or_path: maritaca-ai/sabia-7b
3739
4273
 
4274
+ - name: maritaca-ai/sabiazinho-3
4275
+ model_name: maritaca-ai/sabiazinho-3
4276
+ tokenizer_name: maritaca-ai/sabia-2-tokenizer-medium
4277
+ max_sequence_length: 32000
4278
+ client_spec:
4279
+ class_name: "helm.clients.openai_client.OpenAIClient"
4280
+
4281
+ - name: maritaca-ai/sabia-3
4282
+ model_name: maritaca-ai/sabia-3
4283
+ tokenizer_name: maritaca-ai/sabia-2-tokenizer-medium
4284
+ max_sequence_length: 128000
4285
+ client_spec:
4286
+ class_name: "helm.clients.openai_client.OpenAIClient"
4287
+
4288
+ - name: maritaca-ai/sabia-3.1-2025-05-08
4289
+ model_name: maritaca-ai/sabia-3.1-2025-05-08
4290
+ tokenizer_name: maritaca-ai/sabia-2-tokenizer-medium
4291
+ max_sequence_length: 128000
4292
+ client_spec:
4293
+ class_name: "helm.clients.openai_client.OpenAIClient"
4294
+
3740
4295
  # Granite-3.1-8b-base
3741
4296
  - name: huggingface/granite-3.1-8b-base
3742
4297
  model_name: ibm-granite/granite-3.1-8b-base
@@ -3767,75 +4322,270 @@ model_deployments:
3767
4322
  args:
3768
4323
  pretrained_model_name_or_path: ibm-granite/granite-3.1-2b-instruct
3769
4324
 
3770
- # Granite-3.1-2b-base
3771
- - name: huggingface/granite-3.1-2b-base
3772
- model_name: ibm-granite/granite-3.1-2b-base
3773
- tokenizer_name: ibm-granite/granite-3.1-2b-base
3774
- max_sequence_length: 128000
4325
+ # Granite-3.1-2b-base
4326
+ - name: huggingface/granite-3.1-2b-base
4327
+ model_name: ibm-granite/granite-3.1-2b-base
4328
+ tokenizer_name: ibm-granite/granite-3.1-2b-base
4329
+ max_sequence_length: 128000
4330
+ client_spec:
4331
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4332
+ args:
4333
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-2b-base
4334
+
4335
+ # Granite-3.1-3b-a800m-instruct
4336
+ - name: huggingface/granite-3.1-3b-a800m-instruct
4337
+ model_name: ibm-granite/granite-3.1-3b-a800m-instruct
4338
+ tokenizer_name: ibm-granite/granite-3.1-3b-a800m-instruct
4339
+ max_sequence_length: 128000
4340
+ client_spec:
4341
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4342
+ args:
4343
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-3b-a800m-instruct
4344
+
4345
+ # Granite-3.1-3b-a800m-base
4346
+ - name: huggingface/granite-3.1-3b-a800m-base
4347
+ model_name: ibm-granite/granite-3.1-3b-a800m-base
4348
+ tokenizer_name: ibm-granite/granite-3.1-3b-a800m-base
4349
+ max_sequence_length: 128000
4350
+ client_spec:
4351
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4352
+ args:
4353
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-3b-a800m-base
4354
+
4355
+ # Granite-3.1-1b-a400m-instruct
4356
+ - name: huggingface/granite-3.1-1b-a400m-instruct
4357
+ model_name: ibm-granite/granite-3.1-1b-a400m-instruct
4358
+ tokenizer_name: ibm-granite/granite-3.1-1b-a400m-instruct
4359
+ max_sequence_length: 128000
4360
+ client_spec:
4361
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4362
+ args:
4363
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-1b-a400m-instruct
4364
+
4365
+ # Granite-3.1-1b-a400m-base
4366
+ - name: huggingface/granite-3.1-1b-a400m-base
4367
+ model_name: ibm-granite/granite-3.1-1b-a400m-base
4368
+ tokenizer_name: ibm-granite/granite-3.1-1b-a400m-base
4369
+ max_sequence_length: 128000
4370
+ client_spec:
4371
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4372
+ args:
4373
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-1b-a400m-base
4374
+
4375
+ # DeepSeek-R1-Distill-Llama-3.1-8b
4376
+ - name: huggingface/DeepSeek-R1-Distill-Llama-8B
4377
+ model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
4378
+ tokenizer_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
4379
+ max_sequence_length: 128000
4380
+ client_spec:
4381
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4382
+ args:
4383
+ pretrained_model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
4384
+
4385
+ # deepseek-ai/deepseek-coder-6.7b-instruct
4386
+ - name: huggingface/deepseek-coder-6.7b-instruct
4387
+ model_name: deepseek-ai/deepseek-coder-6.7b-instruct
4388
+ tokenizer_name: deepseek-ai/deepseek-coder-6.7b-instruct
4389
+ max_sequence_length: 128000
4390
+ client_spec:
4391
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4392
+ args:
4393
+ pretrained_model_name_or_path: deepseek-ai/deepseek-coder-6.7b-instruct
4394
+
4395
+ # AceGPT-v2
4396
+ - name: huggingface/acegpt-v2-8b-chat
4397
+ model_name: freedomintelligence/acegpt-v2-8b-chat
4398
+ tokenizer_name: freedomintelligence/acegpt-v2-8b-chat
4399
+ max_sequence_length: 8192
4400
+ client_spec:
4401
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4402
+ args:
4403
+ device_map: auto
4404
+
4405
+ - name: huggingface/acegpt-v2-32b-chat
4406
+ model_name: freedomintelligence/acegpt-v2-32b-chat
4407
+ tokenizer_name: freedomintelligence/acegpt-v2-32b-chat
4408
+ max_sequence_length: 32768
4409
+ client_spec:
4410
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4411
+ args:
4412
+ device_map: auto
4413
+
4414
+ - name: huggingface/acegpt-v2-70b-chat
4415
+ model_name: freedomintelligence/acegpt-v2-70b-chat
4416
+ tokenizer_name: freedomintelligence/acegpt-v2-70b-chat
4417
+ max_sequence_length: 8192
4418
+ client_spec:
4419
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4420
+ args:
4421
+ device_map: auto
4422
+
4423
+ # ALLaM
4424
+ - name: huggingface/allam-7b-instruct-preview
4425
+ model_name: allam-ai/allam-7b-instruct-preview
4426
+ tokenizer_name: allam-ai/allam-7b-instruct-preview
4427
+ max_sequence_length: 4096
4428
+ client_spec:
4429
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4430
+ args:
4431
+ device_map: auto
4432
+
4433
+ # SILMA
4434
+ - name: huggingface/silma-9b-instruct-v1.0
4435
+ model_name: silma-ai/silma-9b-instruct-v1.0
4436
+ tokenizer_name: silma-ai/silma-9b-instruct-v1.0
4437
+ max_sequence_length: 8192
4438
+ client_spec:
4439
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4440
+ args:
4441
+ device_map: auto
4442
+
4443
+ # Jais Family
4444
+ #
4445
+ # NOTE: Jais Family models require `transformers<=4.52.3`.
4446
+ # On more recent versions of transformers, one of the following errors might occur:
4447
+ #
4448
+ # File "/path/to//site-packages/transformers/models/gemma3n/configuration_gemma3n.py", line 31, in <module>
4449
+ # from timm.data import ImageNetInfo, infer_imagenet_subset
4450
+ # ImportError: cannot import name 'ImageNetInfo' from 'timm.data' (/path/to/site-packages/timm/data/__init__.py)
4451
+ #
4452
+ # File "/path/to/.cache/huggingface/modules/transformers_modules/inceptionai/jais-family-590m-chat/90ac4769212b4964c6e81e183140224628228365/modeling_jais.py", line 899, in forward
4453
+ # past_length = past_key_values[0][0].size(-2)
4454
+ # AttributeError: 'NoneType' object has no attribute 'size'
4455
+
4456
+ - name: huggingface/jais-family-590m-chat
4457
+ model_name: inceptionai/jais-family-590m-chat
4458
+ tokenizer_name: inceptionai/jais-family-590m-chat
4459
+ max_sequence_length: 2000 # Reduced from 2048 to account for chat template tokens
4460
+ client_spec:
4461
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4462
+ args:
4463
+ trust_remote_code: true
4464
+ revision: 90ac4769212b4964c6e81e183140224628228365
4465
+
4466
+ - name: huggingface/jais-family-1p3b-chat
4467
+ model_name: inceptionai/jais-family-1p3b-chat
4468
+ tokenizer_name: inceptionai/jais-family-590m-chat
4469
+ max_sequence_length: 2000 # Reduced from 2048 to account for chat template tokens
4470
+ client_spec:
4471
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4472
+ args:
4473
+ trust_remote_code: true
4474
+ revision: 4b93176e2cb00f369b3bc0a8786e4cf16260c804
4475
+
4476
+ - name: huggingface/jais-family-2p7b-chat
4477
+ model_name: inceptionai/jais-family-2p7b-chat
4478
+ tokenizer_name: inceptionai/jais-family-590m-chat
4479
+ max_sequence_length: 2000 # Reduced from 2048 to account for chat template tokens
4480
+ client_spec:
4481
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4482
+ args:
4483
+ trust_remote_code: true
4484
+ revision: b2bf5d1bcd969ce868f66fb1ad8c3480289ea6b2
4485
+
4486
+ - name: huggingface/jais-family-6p7b-chat
4487
+ model_name: inceptionai/jais-family-6p7b-chat
4488
+ tokenizer_name: inceptionai/jais-family-590m-chat
4489
+ max_sequence_length: 2000 # Reduced from 2048 to account for chat template tokens
4490
+ client_spec:
4491
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4492
+ args:
4493
+ device_map: auto
4494
+ trust_remote_code: true
4495
+ revision: 683805efe6126c6536feb4aa23317e70222ac94c
4496
+
4497
+ - name: huggingface/jais-family-13b-chat
4498
+ model_name: inceptionai/jais-family-13b-chat
4499
+ tokenizer_name: inceptionai/jais-family-590m-chat
4500
+ max_sequence_length: 2000 # Reduced from 2048 to account for chat template tokens
4501
+ client_spec:
4502
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4503
+ args:
4504
+ device_map: auto
4505
+ trust_remote_code: true
4506
+ revision: 0ef8b4f80429609890816d912b331d3b95864707
4507
+
4508
+ - name: huggingface/jais-family-30b-8k-chat
4509
+ model_name: inceptionai/jais-family-30b-8k-chat
4510
+ tokenizer_name: inceptionai/jais-family-590m-chat
4511
+ max_sequence_length: 8192
4512
+ client_spec:
4513
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4514
+ args:
4515
+ device_map: auto
4516
+ trust_remote_code: true
4517
+ revision: dab185164dd3b79ec9201d7f4cf878ce91ae7e14
4518
+
4519
+ - name: huggingface/jais-family-30b-16k-chat
4520
+ model_name: inceptionai/jais-family-30b-16k-chat
4521
+ tokenizer_name: inceptionai/jais-family-590m-chat
4522
+ max_sequence_length: 16384
4523
+ client_spec:
4524
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4525
+ args:
4526
+ device_map: auto
4527
+ trust_remote_code: true
4528
+ revision: 369f88eeee4d313155f1b1dca4ebec90f9f9f2a4
4529
+
4530
+ # Jais Adapter
4531
+ - name: huggingface/jais-adapted-7b-chat
4532
+ model_name: inceptionai/jais-adapted-7b-chat
4533
+ tokenizer_name: inceptionai/jais-adapted-7b-chat
4534
+ max_sequence_length: 4096
3775
4535
  client_spec:
3776
- class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3777
- args:
3778
- pretrained_model_name_or_path: ibm-granite/granite-3.1-2b-base
4536
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4537
+ args:
4538
+ device_map: auto
3779
4539
 
3780
- # Granite-3.1-3b-a800m-instruct
3781
- - name: huggingface/granite-3.1-3b-a800m-instruct
3782
- model_name: ibm-granite/granite-3.1-3b-a800m-instruct
3783
- tokenizer_name: ibm-granite/granite-3.1-3b-a800m-instruct
3784
- max_sequence_length: 128000
4540
+ - name: huggingface/jais-adapted-13b-chat
4541
+ model_name: inceptionai/jais-adapted-13b-chat
4542
+ tokenizer_name: inceptionai/jais-adapted-13b-chat
4543
+ max_sequence_length: 4096
3785
4544
  client_spec:
3786
- class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3787
- args:
3788
- pretrained_model_name_or_path: ibm-granite/granite-3.1-3b-a800m-instruct
4545
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4546
+ args:
4547
+ device_map: auto
3789
4548
 
3790
- # Granite-3.1-3b-a800m-base
3791
- - name: huggingface/granite-3.1-3b-a800m-base
3792
- model_name: ibm-granite/granite-3.1-3b-a800m-base
3793
- tokenizer_name: ibm-granite/granite-3.1-3b-a800m-base
3794
- max_sequence_length: 128000
4549
+ - name: huggingface/jais-adapted-70b-chat
4550
+ model_name: inceptionai/jais-adapted-70b-chat
4551
+ tokenizer_name: inceptionai/jais-adapted-7b-chat
4552
+ max_sequence_length: 4096
3795
4553
  client_spec:
3796
- class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3797
- args:
3798
- pretrained_model_name_or_path: ibm-granite/granite-3.1-3b-a800m-base
4554
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4555
+ args:
4556
+ device_map: auto
3799
4557
 
3800
- # Granite-3.1-1b-a400m-instruct
3801
- - name: huggingface/granite-3.1-1b-a400m-instruct
3802
- model_name: ibm-granite/granite-3.1-1b-a400m-instruct
3803
- tokenizer_name: ibm-granite/granite-3.1-1b-a400m-instruct
3804
- max_sequence_length: 128000
4558
+ - name: huggingface/falcon3-1b-instruct
4559
+ model_name: tiiuae/falcon3-1b-instruct
4560
+ tokenizer_name: tiiuae/falcon3-1b-instruct
4561
+ max_sequence_length: 8192
3805
4562
  client_spec:
3806
- class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3807
- args:
3808
- pretrained_model_name_or_path: ibm-granite/granite-3.1-1b-a400m-instruct
4563
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3809
4564
 
3810
- # Granite-3.1-1b-a400m-base
3811
- - name: huggingface/granite-3.1-1b-a400m-base
3812
- model_name: ibm-granite/granite-3.1-1b-a400m-base
3813
- tokenizer_name: ibm-granite/granite-3.1-1b-a400m-base
3814
- max_sequence_length: 128000
4565
+ - name: huggingface/falcon3-3b-instruct
4566
+ model_name: tiiuae/falcon3-3b-instruct
4567
+ tokenizer_name: tiiuae/falcon3-1b-instruct
4568
+ max_sequence_length: 32768
3815
4569
  client_spec:
3816
- class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3817
- args:
3818
- pretrained_model_name_or_path: ibm-granite/granite-3.1-1b-a400m-base
4570
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3819
4571
 
3820
- # DeepSeek-R1-Distill-Llama-3.1-8b
3821
- - name: huggingface/DeepSeek-R1-Distill-Llama-8B
3822
- model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
3823
- tokenizer_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
3824
- max_sequence_length: 128000
4572
+ - name: huggingface/falcon3-7b-instruct
4573
+ model_name: tiiuae/falcon3-7b-instruct
4574
+ tokenizer_name: tiiuae/falcon3-1b-instruct
4575
+ max_sequence_length: 32768
3825
4576
  client_spec:
3826
- class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3827
- args:
3828
- pretrained_model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
4577
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4578
+ args:
4579
+ device_map: auto
3829
4580
 
3830
- # deepseek-ai/deepseek-coder-6.7b-instruct
3831
- - name: huggingface/deepseek-coder-6.7b-instruct
3832
- model_name: deepseek-ai/deepseek-coder-6.7b-instruct
3833
- tokenizer_name: deepseek-ai/deepseek-coder-6.7b-instruct
3834
- max_sequence_length: 128000
4581
+ - name: huggingface/falcon3-10b-instruct
4582
+ model_name: tiiuae/falcon3-10b-instruct
4583
+ tokenizer_name: tiiuae/falcon3-1b-instruct
4584
+ max_sequence_length: 32768
3835
4585
  client_spec:
3836
- class_name: "helm.clients.huggingface_client.HuggingFaceClient"
3837
- args:
3838
- pretrained_model_name_or_path: deepseek-ai/deepseek-coder-6.7b-instruct
4586
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4587
+ args:
4588
+ device_map: auto
3839
4589
 
3840
4590
  # IBM WatsonX
3841
4591
  - name: ibm/llama-3.3-70b-instruct
@@ -3918,16 +4668,6 @@ model_deployments:
3918
4668
  watsonx_model_name: ibm/granite-8b-code-instruct
3919
4669
  region: Dallas
3920
4670
 
3921
- - name: ibm/mixtral-8x7b-instruct-v0:1
3922
- model_name: mistralai/mixtral-8x7b-instruct-v0:1
3923
- tokenizer_name: huggingface/gpt2
3924
- max_sequence_length: 4000
3925
- client_spec:
3926
- class_name: "helm.clients.ibm_client.IbmChatClient"
3927
- args:
3928
- watsonx_model_name: mistralai/mixtral-8x7b-instruct-v01
3929
- region: Dallas
3930
-
3931
4671
  - name: ibm/granite-3.3-8b-instruct
3932
4672
  model_name: ibm/granite-3.3-8b-instruct
3933
4673
  tokenizer_name: ibm/granite-3.3-8b-instruct
@@ -4128,186 +4868,260 @@ model_deployments:
4128
4868
  args:
4129
4869
  pretrained_model_name_or_path: vinai/PhoGPT-4B-Chat
4130
4870
 
4131
- # Stanford Health Care
4132
- # Placed later in the file to make them non-default
4133
- - name: stanfordhealthcare/claude-3-5-sonnet-20241022
4134
- model_name: anthropic/claude-3-5-sonnet-20241022
4135
- tokenizer_name: anthropic/claude
4136
- max_sequence_length: 200000
4871
+ - name: huggingface/Gemma-3-Gaia-PT-BR-4b-it
4872
+ model_name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
4873
+ tokenizer_name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
4874
+ max_sequence_length: 128000
4137
4875
  client_spec:
4138
- class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
4876
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4139
4877
  args:
4140
- model: anthropic.claude-3-5-sonnet-20241022-v2:0
4141
- deployment: Claude35Sonnetv2/awssig4fa
4142
-
4143
- - name: stanfordhealthcare/claude-3-7-sonnet-20250219
4144
- model_name: anthropic/claude-3-7-sonnet-20250219
4145
- tokenizer_name: anthropic/claude
4146
- max_sequence_length: 200000
4878
+ pretrained_model_name_or_path: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
4879
+
4880
+ - name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
4881
+ model_name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
4882
+ tokenizer_name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
4883
+ max_sequence_length: 4094
4147
4884
  client_spec:
4148
- class_name: "helm.clients.stanfordhealthcare_claude_client.StanfordHealthCareClaudeClient"
4885
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4149
4886
  args:
4150
- model: arn:aws:bedrock:us-west-2:679683451337:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0
4151
- deployment: awssig4claude37/aswsig4claude37
4887
+ pretrained_model_name_or_path: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
4152
4888
 
4153
- - name: stanfordhealthcare/gemini-1.5-pro-001
4154
- model_name: google/gemini-1.5-pro-001
4155
- tokenizer_name: google/gemma-2b
4156
- max_sequence_length: 1000000
4889
+ - name: 22h/cabrita_7b_pt_850000
4890
+ model_name: 22h/cabrita_7b_pt_850000
4891
+ tokenizer_name: 22h/cabrita_7b_pt_850000
4892
+ max_sequence_length: 4094
4157
4893
  client_spec:
4158
- class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
4894
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4159
4895
  args:
4160
- deployment: gcpgemini/apim-gcp-oauth-fa
4896
+ pretrained_model_name_or_path: 22h/cabrita_7b_pt_850000
4161
4897
 
4162
- - name: stanfordhealthcare/gemini-2.0-flash-001
4163
- model_name: google/gemini-2.0-flash-001
4164
- tokenizer_name: google/gemma-2b
4165
- max_sequence_length: 1000000
4898
+ - name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
4899
+ model_name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
4900
+ tokenizer_name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
4901
+ max_sequence_length: 4096
4166
4902
  client_spec:
4167
- class_name: "helm.clients.stanfordhealthcare_google_client.StanfordHealthCareGoogleClient"
4903
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4168
4904
  args:
4169
- deployment: gcp-gem20flash-fa/apim-gcp-gem20flash-fa
4905
+ pretrained_model_name_or_path: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
4170
4906
 
4171
- - name: stanfordhealthcare/gpt-4o-mini-2024-07-18
4172
- model_name: openai/gpt-4o-mini-2024-07-18
4907
+ - name: TucanoBR/Tucano-2b4
4908
+ model_name: TucanoBR/Tucano-2b4
4909
+ tokenizer_name: TucanoBR/Tucano-2b4
4910
+ max_sequence_length: 4096
4911
+ client_spec:
4912
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4913
+ args:
4914
+ pretrained_model_name_or_path: TucanoBR/Tucano-2b4
4915
+
4916
+ - name: nicholasKluge/TeenyTinyLlama-460m
4917
+ model_name: nicholasKluge/TeenyTinyLlama-460m
4918
+ tokenizer_name: nicholasKluge/TeenyTinyLlama-460m
4919
+ max_sequence_length: 2048
4920
+ client_spec:
4921
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4922
+ args:
4923
+ pretrained_model_name_or_path: nicholasKluge/TeenyTinyLlama-460m
4924
+
4925
+ - name: openrouter/mistral-medium-3.1
4926
+ model_name: mistralai/mistral-medium-3.1
4927
+ tokenizer_name: mistralai/Mistral-7B-v0.1
4928
+ max_sequence_length: 128000
4929
+ client_spec:
4930
+ class_name: "helm.clients.openrouter_client.OpenRouterClient"
4931
+ args:
4932
+ model_name: mistralai/mistral-medium-3.1
4933
+
4934
+ # DSPy Models (EXPERIMENTAL)
4935
+ # The following model configurations use the DSPyClient for inference with DSPy modules.
4936
+
4937
+ - name: anthropic/claude-3-7-sonnet-20250219-dspy-zs-predict
4938
+ model_name: anthropic/claude-3-7-sonnet-20250219-dspy-zs-predict
4173
4939
  tokenizer_name: openai/o200k_base
4174
4940
  max_sequence_length: 128000
4175
4941
  client_spec:
4176
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
4942
+ class_name: "helm.clients.dspy_client.DSPyClient"
4177
4943
  args:
4178
- openai_model_name: gpt-4o-mini
4179
- api_version: 2023-05-15
4944
+ dspy_agent_url: null
4945
+ dspy_module: Predict
4946
+ dspy_api_model: anthropic/claude-3-7-sonnet-20250219
4947
+ dspy_api_base: null
4180
4948
 
4181
- - name: stanfordhealthcare/gpt-4o-2024-05-13
4182
- model_name: openai/gpt-4o-2024-05-13
4949
+ - name: google/gemini-2.0-flash-001-dspy-zs-predict
4950
+ model_name: google/gemini-2.0-flash-001-dspy-zs-predict
4183
4951
  tokenizer_name: openai/o200k_base
4184
4952
  max_sequence_length: 128000
4185
4953
  client_spec:
4186
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
4954
+ class_name: "helm.clients.dspy_client.DSPyClient"
4187
4955
  args:
4188
- openai_model_name: gpt-4o
4189
- api_version: 2023-05-15
4190
-
4191
- - name: stanfordhealthcare/gpt-4-0613
4192
- model_name: openai/gpt-4-0613
4956
+ dspy_agent_url: null
4957
+ dspy_module: Predict
4958
+ dspy_api_model: google/gemini-2.0-flash-001
4959
+ dspy_api_base: null
4960
+
4961
+ - name: openai/gpt-4o-2024-05-13-dspy-zs-predict
4962
+ model_name: openai/gpt-4o-2024-05-13-dspy-zs-predict
4193
4963
  tokenizer_name: openai/o200k_base
4194
- max_sequence_length: 8192
4964
+ max_sequence_length: 128000
4195
4965
  client_spec:
4196
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
4966
+ class_name: "helm.clients.dspy_client.DSPyClient"
4197
4967
  args:
4198
- openai_model_name: gpt-4
4199
- api_version: 2023-05-15
4968
+ dspy_agent_url: null
4969
+ dspy_module: Predict
4970
+ dspy_api_model: openai/gpt-4o-2024-05-13
4971
+ dspy_api_base: null
4200
4972
 
4201
- - name: stanfordhealthcare/gpt-4-turbo-2024-04-09
4202
- model_name: openai/gpt-4-turbo-2024-04-09
4203
- tokenizer_name: openai/cl100k_base
4973
+ - name: openai/o3-mini-2025-01-31-dspy-zs-predict
4974
+ model_name: openai/o3-mini-2025-01-31-dspy-zs-predict
4975
+ tokenizer_name: openai/o200k_base
4204
4976
  max_sequence_length: 128000
4205
4977
  client_spec:
4206
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
4978
+ class_name: "helm.clients.dspy_client.DSPyClient"
4207
4979
  args:
4208
- openai_model_name: gpt-4-turbo
4209
- api_version: 2023-05-15
4980
+ dspy_agent_url: null
4981
+ dspy_module: Predict
4982
+ dspy_api_model: openai/o3-mini-2025-01-31
4983
+ dspy_api_base: null
4210
4984
 
4211
- - name: stanfordhealthcare/gpt-4.1-2025-04-14
4212
- model_name: openai/gpt-4.1-2025-04-14
4985
+ - name: anthropic/claude-3-7-sonnet-20250219-dspy-zs-cot
4986
+ model_name: anthropic/claude-3-7-sonnet-20250219-dspy-zs-cot
4213
4987
  tokenizer_name: openai/o200k_base
4214
- max_sequence_length: 1047576
4988
+ max_sequence_length: 128000
4215
4989
  client_spec:
4216
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
4990
+ class_name: "helm.clients.dspy_client.DSPyClient"
4217
4991
  args:
4218
- openai_model_name: gpt-4.1
4219
- api_version: 2025-01-01-preview
4220
- base_url: "{endpoint}/openai-eastus2"
4992
+ dspy_agent_url: null
4993
+ dspy_module: ChainOfThought
4994
+ dspy_api_model: anthropic/claude-3-7-sonnet-20250219
4995
+ dspy_api_base: null
4221
4996
 
4222
- - name: stanfordhealthcare/o3-mini-2025-01-31
4223
- model_name: openai/o3-mini-2025-01-31
4224
- tokenizer_name: openai/cl100k_base
4225
- max_sequence_length: 200000
4997
+ - name: google/gemini-2.0-flash-001-dspy-zs-cot
4998
+ model_name: google/gemini-2.0-flash-001-dspy-zs-cot
4999
+ tokenizer_name: openai/o200k_base
5000
+ max_sequence_length: 128000
4226
5001
  client_spec:
4227
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
5002
+ class_name: "helm.clients.dspy_client.DSPyClient"
4228
5003
  args:
4229
- openai_model_name: o3-mini
4230
- api_version: 2024-12-01-preview
4231
- base_url: "{endpoint}/openai-eastus2"
5004
+ dspy_agent_url: null
5005
+ dspy_module: ChainOfThought
5006
+ dspy_api_model: google/gemini-2.0-flash-001
5007
+ dspy_api_base: null
4232
5008
 
4233
- - name: stanfordhealthcare/o1-2024-12-17
4234
- model_name: openai/o1-2024-12-17
4235
- tokenizer_name: openai/cl100k_base
5009
+ - name: openai/gpt-4o-2024-05-13-dspy-zs-cot
5010
+ model_name: openai/gpt-4o-2024-05-13-dspy-zs-cot
5011
+ tokenizer_name: openai/o200k_base
4236
5012
  max_sequence_length: 128000
4237
5013
  client_spec:
4238
- class_name: "helm.clients.stanfordhealthcare_azure_openai_client.StanfordHealthCareAzureOpenAIClient"
5014
+ class_name: "helm.clients.dspy_client.DSPyClient"
4239
5015
  args:
4240
- openai_model_name: o1
4241
- api_version: 2024-12-01-preview
4242
- base_url: "{endpoint}/openai-eastus2"
5016
+ dspy_agent_url: null
5017
+ dspy_module: ChainOfThought
5018
+ dspy_api_model: openai/gpt-4o-2024-05-13
5019
+ dspy_api_base: null
4243
5020
 
4244
- - name: stanfordhealthcare/deepseek-r1
4245
- model_name: deepseek-ai/deepseek-r1
4246
- tokenizer_name: deepseek-ai/deepseek-r1
5021
+ - name: openai/o3-mini-2025-01-31-dspy-zs-cot
5022
+ model_name: openai/o3-mini-2025-01-31-dspy-zs-cot
5023
+ tokenizer_name: openai/o200k_base
4247
5024
  max_sequence_length: 128000
4248
5025
  client_spec:
4249
- class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
5026
+ class_name: "helm.clients.dspy_client.DSPyClient"
4250
5027
  args:
4251
- openai_model_name: deepseek-chat
4252
- output_processor: helm.benchmark.metrics.output_processors.remove_deepseek_r1_thinking
4253
- base_url: "{endpoint}/deepseekr1/v1"
5028
+ dspy_agent_url: null
5029
+ dspy_module: ChainOfThought
5030
+ dspy_api_model: openai/o3-mini-2025-01-31
5031
+ dspy_api_base: null
4254
5032
 
4255
- - name: stanfordhealthcare/llama-3.3-70b-instruct
4256
- model_name: meta/llama-3.3-70b-instruct
4257
- tokenizer_name: meta/llama-3.3-70b-instruct
5033
+ - name: anthropic/claude-3-7-sonnet-20250219-dspy-fs-bfrs
5034
+ model_name: anthropic/claude-3-7-sonnet-20250219-dspy-fs-bfrs
5035
+ tokenizer_name: openai/o200k_base
4258
5036
  max_sequence_length: 128000
4259
5037
  client_spec:
4260
- class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
5038
+ class_name: "helm.clients.dspy_client.DSPyClient"
4261
5039
  args:
4262
- base_url: "{endpoint}/llama3370b/v1"
5040
+ dspy_agent_url: https://raw.githubusercontent.com/StanfordMIMI/dspy-helm/cecc4e742ec0d342d6349c80625afdd238df7fd4/agents/{scenario}/claude-3.7-sonnet/BootstrapFewShotWithRandomSearch.json
5041
+ dspy_module: ChainOfThought
5042
+ dspy_api_model: anthropic/claude-3-7-sonnet-20250219
5043
+ dspy_api_base: null
4263
5044
 
4264
- - name: stanfordhealthcare/llama-4-scout-17b-16e-instruct
4265
- model_name: meta/llama-4-scout-17b-16e-instruct
4266
- tokenizer_name: meta/llama-4-scout-17b-16e-instruct
4267
- max_sequence_length: 327680
5045
+ - name: google/gemini-2.0-flash-001-dspy-fs-bfrs
5046
+ model_name: google/gemini-2.0-flash-001-dspy-fs-bfrs
5047
+ tokenizer_name: openai/o200k_base
5048
+ max_sequence_length: 128000
4268
5049
  client_spec:
4269
- class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
5050
+ class_name: "helm.clients.dspy_client.DSPyClient"
4270
5051
  args:
4271
- base_url: "{endpoint}/llama4-scout/v1"
4272
-
4273
- - name: stanfordhealthcare/llama-4-maverick-17b-128e-instruct-fp8
4274
- model_name: meta/llama-4-maverick-17b-128e-instruct-fp8
4275
- tokenizer_name: meta/llama-4-scout-17b-16e-instruct
4276
- max_sequence_length: 524288
5052
+ dspy_agent_url: https://raw.githubusercontent.com/StanfordMIMI/dspy-helm/cecc4e742ec0d342d6349c80625afdd238df7fd4/agents/{scenario}/gemini-2.0-flash/BootstrapFewShotWithRandomSearch.json
5053
+ dspy_module: ChainOfThought
5054
+ dspy_api_model: google/gemini-2.0-flash-001
5055
+ dspy_api_base: null
5056
+
5057
+ - name: openai/gpt-4o-2024-05-13-dspy-fs-bfrs
5058
+ model_name: openai/gpt-4o-2024-05-13-dspy-fs-bfrs
5059
+ tokenizer_name: openai/o200k_base
5060
+ max_sequence_length: 128000
4277
5061
  client_spec:
4278
- class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
5062
+ class_name: "helm.clients.dspy_client.DSPyClient"
4279
5063
  args:
4280
- base_url: "{endpoint}/llama4-maverick/v1"
5064
+ dspy_agent_url: https://raw.githubusercontent.com/StanfordMIMI/dspy-helm/cecc4e742ec0d342d6349c80625afdd238df7fd4/agents/{scenario}/gpt-4o/BootstrapFewShotWithRandomSearch.json
5065
+ dspy_module: ChainOfThought
5066
+ dspy_api_model: openai/gpt-4o-2024-05-13
5067
+ dspy_api_base: null
4281
5068
 
4282
- - name: stanfordhealthcare/phi-3.5-mini-instruct
4283
- model_name: microsoft/phi-3.5-mini-instruct
4284
- tokenizer_name: microsoft/phi-3.5-mini-instruct
4285
- max_sequence_length: 131072
5069
+ - name: openai/o3-mini-2025-01-31-dspy-fs-bfrs
5070
+ model_name: openai/o3-mini-2025-01-31-dspy-fs-bfrs
5071
+ tokenizer_name: openai/o200k_base
5072
+ max_sequence_length: 128000
4286
5073
  client_spec:
4287
- class_name: "helm.clients.stanfordhealthcare_openai_client.StanfordHealthCareOpenAIClient"
5074
+ class_name: "helm.clients.dspy_client.DSPyClient"
4288
5075
  args:
4289
- base_url: "{endpoint}/phi35mi/v1"
5076
+ dspy_agent_url: https://raw.githubusercontent.com/StanfordMIMI/dspy-helm/cecc4e742ec0d342d6349c80625afdd238df7fd4/agents/{scenario}/o3-mini/BootstrapFewShotWithRandomSearch.json
5077
+ dspy_module: ChainOfThought
5078
+ dspy_api_model: openai/o3-mini-2025-01-31
5079
+ dspy_api_base: null
4290
5080
 
4291
- - name: stanfordhealthcare_shc/gpt-4o-2024-05-13
4292
- model_name: openai/gpt-4o-2024-05-13
5081
+ - name: anthropic/claude-3-7-sonnet-20250219-dspy-fs-miprov2
5082
+ model_name: anthropic/claude-3-7-sonnet-20250219-dspy-fs-miprov2
4293
5083
  tokenizer_name: openai/o200k_base
4294
5084
  max_sequence_length: 128000
4295
5085
  client_spec:
4296
- class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
4297
- deployment: gpt-4o
5086
+ class_name: "helm.clients.dspy_client.DSPyClient"
5087
+ args:
5088
+ dspy_agent_url: https://raw.githubusercontent.com/StanfordMIMI/dspy-helm/cecc4e742ec0d342d6349c80625afdd238df7fd4/agents/{scenario}/claude-3.7-sonnet/MIPROv2.json
5089
+ dspy_module: ChainOfThought
5090
+ dspy_api_model: anthropic/claude-3-7-sonnet-20250219
5091
+ dspy_api_base: null
4298
5092
 
4299
- - name: stanfordhealthcare_shc/gpt-4o-mini-2024-07-18
4300
- model_name: openai/gpt-4o-mini-2024-07-18
5093
+ - name: google/gemini-2.0-flash-001-dspy-fs-miprov2
5094
+ model_name: google/gemini-2.0-flash-001-dspy-fs-miprov2
4301
5095
  tokenizer_name: openai/o200k_base
4302
5096
  max_sequence_length: 128000
4303
5097
  client_spec:
4304
- class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
4305
- deployment: gpt-4o-mini
4306
-
4307
- - name: stanfordhealthcare_shc/gpt-4-turbo-2024-04-09
4308
- model_name: openai/gpt-4-turbo-2024-04-09
4309
- tokenizer_name: openai/cl100k_base
5098
+ class_name: "helm.clients.dspy_client.DSPyClient"
5099
+ args:
5100
+ dspy_agent_url: https://raw.githubusercontent.com/StanfordMIMI/dspy-helm/cecc4e742ec0d342d6349c80625afdd238df7fd4/agents/{scenario}/gemini-2.0-flash/MIPROv2.json
5101
+ dspy_module: ChainOfThought
5102
+ dspy_api_model: google/gemini-2.0-flash-001
5103
+ dspy_api_base: null
5104
+
5105
+ - name: openai/gpt-4o-2024-05-13-dspy-fs-miprov2
5106
+ model_name: openai/gpt-4o-2024-05-13-dspy-fs-miprov2
5107
+ tokenizer_name: openai/o200k_base
4310
5108
  max_sequence_length: 128000
4311
5109
  client_spec:
4312
- class_name: "helm.clients.stanfordhealthcare_shc_openai_client.StanfordHealthCareSHCOpenAIClient"
4313
- deployment: gpt-4-turbo-2024-04-09
5110
+ class_name: "helm.clients.dspy_client.DSPyClient"
5111
+ args:
5112
+ dspy_agent_url: https://raw.githubusercontent.com/StanfordMIMI/dspy-helm/cecc4e742ec0d342d6349c80625afdd238df7fd4/agents/{scenario}/gpt-4o/MIPROv2.json
5113
+ dspy_module: ChainOfThought
5114
+ dspy_api_model: openai/gpt-4o-2024-05-13
5115
+ dspy_api_base: null
5116
+
5117
+ - name: openai/o3-mini-2025-01-31-dspy-fs-miprov2
5118
+ model_name: openai/o3-mini-2025-01-31-dspy-fs-miprov2
5119
+ tokenizer_name: openai/o200k_base
5120
+ max_sequence_length: 128000
5121
+ client_spec:
5122
+ class_name: "helm.clients.dspy_client.DSPyClient"
5123
+ args:
5124
+ dspy_agent_url: https://raw.githubusercontent.com/StanfordMIMI/dspy-helm/cecc4e742ec0d342d6349c80625afdd238df7fd4/agents/{scenario}/o3-mini/MIPROv2.json
5125
+ dspy_module: ChainOfThought
5126
+ dspy_api_model: openai/o3-mini-2025-01-31
5127
+ dspy_api_base: null