crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (333) hide show
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/bbq_metrics.py +12 -0
  25. helm/benchmark/metrics/classification_metrics.py +19 -1
  26. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  27. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  28. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  29. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  30. helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
  31. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  32. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  33. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  35. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  36. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  37. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  38. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  39. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  40. helm/benchmark/metrics/medec_metrics.py +25 -2
  41. helm/benchmark/metrics/metric.py +25 -0
  42. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  43. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  44. helm/benchmark/metrics/safety_metrics.py +13 -1
  45. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  46. helm/benchmark/metrics/summac/model_summac.py +2 -2
  47. helm/benchmark/metrics/summarization_metrics.py +129 -1
  48. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  49. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  50. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  51. helm/benchmark/presentation/run_display.py +13 -3
  52. helm/benchmark/presentation/run_entry.py +2 -2
  53. helm/benchmark/presentation/schema.py +5 -22
  54. helm/benchmark/presentation/summarize.py +180 -11
  55. helm/benchmark/presentation/taxonomy_info.py +20 -0
  56. helm/benchmark/run.py +1 -1
  57. helm/benchmark/run_expander.py +4 -0
  58. helm/benchmark/run_specs/arabic_run_specs.py +140 -16
  59. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  60. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  61. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  62. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  63. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  64. helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
  65. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
  66. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  67. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  68. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  69. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  70. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  71. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  72. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  73. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  74. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
  75. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
  76. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
  77. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
  78. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
  79. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  80. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  81. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  82. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  83. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  84. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  85. helm/benchmark/scenarios/bold_scenario.py +15 -0
  86. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  87. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  88. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  89. helm/benchmark/scenarios/clear_scenario.py +23 -0
  90. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  91. helm/benchmark/scenarios/code_scenario.py +28 -0
  92. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  93. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  94. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  95. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  96. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  97. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  98. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  99. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  100. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  101. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  102. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  103. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  104. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  105. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  106. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  107. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  108. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  109. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  110. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  111. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  112. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  113. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  114. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  115. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  116. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  117. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  118. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  119. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  120. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  121. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  122. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  123. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  124. helm/benchmark/scenarios/ice_scenario.py +21 -1
  125. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  126. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  127. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
  128. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  129. helm/benchmark/scenarios/koala_scenario.py +21 -1
  130. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  131. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  132. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  133. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  134. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  135. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  136. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  137. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  138. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  139. helm/benchmark/scenarios/math_scenario.py +33 -0
  140. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  141. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  142. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  143. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  144. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  145. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  146. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  147. helm/benchmark/scenarios/medec_scenario.py +23 -0
  148. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  149. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  150. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  151. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  152. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  153. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  154. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  155. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  156. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  157. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  158. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  159. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  160. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  161. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  162. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  163. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  164. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  165. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  166. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  167. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  168. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  169. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  170. helm/benchmark/scenarios/quac_scenario.py +14 -0
  171. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  172. helm/benchmark/scenarios/raft_scenario.py +15 -0
  173. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  174. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  175. helm/benchmark/scenarios/scenario.py +31 -0
  176. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  177. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  178. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  179. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  180. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  181. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  182. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  183. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  184. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  185. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  186. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  187. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  188. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  189. helm/benchmark/scenarios/spider_scenario.py +18 -0
  190. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  191. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  192. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  193. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  194. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  195. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  196. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  197. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  198. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  199. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  200. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  201. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  202. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  203. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  204. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  205. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  206. helm/benchmark/static/schema_arabic.yaml +55 -12
  207. helm/benchmark/static/schema_long_context.yaml +11 -30
  208. helm/benchmark/static/schema_medhelm.yaml +36 -0
  209. helm/benchmark/static/schema_slp.yaml +219 -0
  210. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  211. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  212. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  213. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  214. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  215. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  216. helm/benchmark/static_build/index.html +5 -6
  217. helm/clients/ai21_client.py +2 -0
  218. helm/clients/aleph_alpha_client.py +2 -0
  219. helm/clients/anthropic_client.py +7 -1
  220. helm/clients/audio_language/diva_llama_client.py +2 -0
  221. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  222. helm/clients/audio_language/llama_omni/constants.py +9 -0
  223. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  224. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  225. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  226. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  227. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  228. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  229. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  230. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  231. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  232. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  233. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  234. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  235. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  236. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  237. helm/clients/audio_language/llama_omni/utils.py +202 -0
  238. helm/clients/audio_language/llama_omni_client.py +2 -1
  239. helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
  240. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  241. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  242. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  243. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  244. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  245. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  246. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  247. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  248. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  249. helm/clients/bedrock_client.py +2 -0
  250. helm/clients/cohere_client.py +3 -0
  251. helm/clients/google_client.py +2 -0
  252. helm/clients/http_model_client.py +2 -0
  253. helm/clients/huggingface_client.py +2 -1
  254. helm/clients/ibm_client.py +3 -1
  255. helm/clients/image_generation/adobe_vision_client.py +2 -0
  256. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  257. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  258. helm/clients/image_generation/cogview2_client.py +2 -1
  259. helm/clients/image_generation/dalle2_client.py +2 -0
  260. helm/clients/image_generation/dalle_mini_client.py +2 -1
  261. helm/clients/image_generation/deep_floyd_client.py +2 -0
  262. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  263. helm/clients/image_generation/lexica_client.py +2 -0
  264. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  265. helm/clients/image_generation/mindalle_client.py +2 -1
  266. helm/clients/image_generation/together_image_generation_client.py +2 -0
  267. helm/clients/megatron_client.py +2 -0
  268. helm/clients/mistral_client.py +2 -0
  269. helm/clients/moderation_api_client.py +2 -0
  270. helm/clients/openai_client.py +36 -20
  271. helm/clients/openai_responses_client.py +27 -3
  272. helm/clients/openrouter_client.py +31 -0
  273. helm/clients/palmyra_client.py +2 -1
  274. helm/clients/reka_client.py +2 -1
  275. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  276. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  277. helm/clients/test_openrouter_client.py +69 -0
  278. helm/clients/together_client.py +52 -11
  279. helm/clients/vertexai_client.py +12 -2
  280. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  281. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  282. helm/clients/vision_language/idefics_client.py +2 -1
  283. helm/clients/vision_language/open_flamingo_client.py +2 -1
  284. helm/clients/vision_language/paligemma_client.py +2 -1
  285. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  286. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  287. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  288. helm/clients/writer_client.py +2 -0
  289. helm/common/hierarchical_logger.py +20 -0
  290. helm/common/optional_dependencies.py +1 -1
  291. helm/common/test_general.py +4 -0
  292. helm/config/model_deployments.yaml +300 -1
  293. helm/config/model_metadata.yaml +302 -9
  294. helm/config/tokenizer_configs.yaml +92 -4
  295. helm/proxy/example_queries.py +8 -8
  296. helm/proxy/server.py +2 -1
  297. helm/proxy/static/index.css +4 -0
  298. helm/proxy/static/index.js +7 -1
  299. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  300. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  301. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  302. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  303. helm/benchmark/metrics/medalign_metrics.py +0 -14
  304. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  305. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  306. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  307. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  308. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  309. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  310. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  311. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  312. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  313. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  314. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  315. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  316. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  317. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
  318. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
  319. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
  320. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
  321. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  322. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  323. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  324. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  325. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  326. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  327. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  328. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  329. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  330. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  331. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  332. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  333. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
@@ -8,7 +8,7 @@ from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
8
8
  from helm.common.cache import CacheConfig
9
9
  from helm.common.images_utils import open_image
10
10
  from helm.common.gpu_utils import get_torch_device_name
11
- from helm.common.hierarchical_logger import hlog, htrack_block
11
+ from helm.common.hierarchical_logger import hexception, hlog, htrack_block
12
12
  from helm.common.media_object import TEXT_TYPE
13
13
  from helm.common.optional_dependencies import handle_module_not_found_error
14
14
  from helm.common.request import Request, RequestResult, GeneratedOutput, Token
@@ -126,6 +126,7 @@ class PaliGemmaClient(CachingClient):
126
126
  result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
127
127
  concat_results.append(result)
128
128
  except RuntimeError as model_error:
129
+ hexception(model_error)
129
130
  return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])
130
131
 
131
132
  for result in concat_results:
@@ -5,6 +5,7 @@ import requests
5
5
 
6
6
  from helm.common.cache import CacheConfig
7
7
  from helm.common.images_utils import encode_base64
8
+ from helm.common.hierarchical_logger import hexception
8
9
  from helm.common.media_object import TEXT_TYPE
9
10
  from helm.common.request import Request, RequestResult, GeneratedOutput, ErrorFlags
10
11
  from helm.common.request import wrap_request_time
@@ -76,6 +77,7 @@ class PalmyraVisionClient(CachingClient):
76
77
  )
77
78
  result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
78
79
  except PalmyraVisionContentBlockedError as ex:
80
+ hexception(ex)
79
81
  return RequestResult(
80
82
  success=False,
81
83
  cached=False,
@@ -8,7 +8,7 @@ import torch
8
8
 
9
9
  from helm.common.cache import CacheConfig
10
10
  from helm.common.gpu_utils import get_torch_device_name
11
- from helm.common.hierarchical_logger import hlog, htrack_block
11
+ from helm.common.hierarchical_logger import hexception, hlog, htrack_block
12
12
  from helm.common.media_object import TEXT_TYPE
13
13
  from helm.common.request import Request, RequestResult, GeneratedOutput, Token
14
14
  from helm.common.request import wrap_request_time
@@ -157,6 +157,7 @@ class Qwen2VLMClient(CachingClient):
157
157
  )
158
158
  result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
159
159
  except RuntimeError as model_error:
160
+ hexception(model_error)
160
161
  return RequestResult(
161
162
  success=False,
162
163
  cached=False,
@@ -7,7 +7,7 @@ from transformers.generation import GenerationConfig
7
7
 
8
8
  from helm.common.cache import CacheConfig
9
9
  from helm.common.gpu_utils import get_torch_device_name
10
- from helm.common.hierarchical_logger import hlog, htrack_block
10
+ from helm.common.hierarchical_logger import hexception, hlog, htrack_block
11
11
  from helm.common.media_object import TEXT_TYPE
12
12
  from helm.common.request import Request, RequestResult, GeneratedOutput, Token
13
13
  from helm.common.request import wrap_request_time
@@ -139,6 +139,7 @@ class QwenVLMClient(CachingClient):
139
139
  )
140
140
  result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
141
141
  except RuntimeError as model_error:
142
+ hexception(model_error)
142
143
  return RequestResult(
143
144
  success=False, cached=False, error=str(model_error), completions=[], embedding=[]
144
145
  )
@@ -2,6 +2,7 @@ from typing import Any, Dict, List, Mapping, Optional
2
2
 
3
3
  from helm.clients.client import CachingClient
4
4
  from helm.common.cache import CacheConfig
5
+ from helm.common.hierarchical_logger import hexception
5
6
  from helm.common.optional_dependencies import handle_module_not_found_error
6
7
  from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
7
8
 
@@ -82,6 +83,7 @@ class WriterClient(CachingClient):
82
83
  raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
83
84
  chat_completion: ChatCompletion = ChatCompletion.model_validate(raw_response)
84
85
  except Exception as error:
86
+ hexception(error)
85
87
  return RequestResult(
86
88
  success=False,
87
89
  cached=False,
@@ -64,6 +64,16 @@ class HierarchicalLogger(object):
64
64
  self.logger.warning(self.indent() + str(x), **kwargs)
65
65
  sys.stdout.flush()
66
66
 
67
+ def error(self, x: Any, **kwargs) -> None:
68
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
69
+ self.logger.error(self.indent() + str(x), **kwargs)
70
+ sys.stdout.flush()
71
+
72
+ def exception(self, x: Any, **kwargs) -> None:
73
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
74
+ self.logger.exception(self.indent() + str(x), **kwargs)
75
+ sys.stdout.flush()
76
+
67
77
 
68
78
  def format_time(s: float) -> str:
69
79
  """Return a nice string representation of `s` seconds."""
@@ -96,6 +106,16 @@ def hwarn(x: Any, **kwargs) -> None:
96
106
  singleton.warn(x, **kwargs)
97
107
 
98
108
 
109
+ def herror(x: Any, **kwargs) -> None:
110
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
111
+ singleton.error(x, **kwargs)
112
+
113
+
114
+ def hexception(x: Any, **kwargs) -> None:
115
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
116
+ singleton.exception(x, **kwargs)
117
+
118
+
99
119
  class htrack_block:
100
120
  def __init__(self, x: Any, stacklevel=1) -> None:
101
121
  self._stacklevel = stacklevel + 1
@@ -9,7 +9,7 @@ def handle_module_not_found_error(e: ModuleNotFoundError, suggestions: Optional[
9
9
  # TODO: Ask user to install more specific optional dependencies
10
10
  # e.g. crfm-helm[plots] or crfm-helm[server]
11
11
  suggested_commands = " or ".join(
12
- [f"`pip install crfm-helm[{suggestion}]`" for suggestion in (suggestions or []) + ["all"]]
12
+ [f'`pip install "crfm-helm[{suggestion}]"`' for suggestion in (suggestions or []) + ["all"]]
13
13
  )
14
14
  raise OptionalDependencyNotInstalled(
15
15
  f"Optional dependency {e.name} is not installed. Please run {suggested_commands} to install it."
@@ -1,5 +1,8 @@
1
1
  import shutil
2
2
  import os
3
+
4
+ import pytest
5
+
3
6
  from helm.common.general import (
4
7
  ensure_file_downloaded,
5
8
  format_tags,
@@ -12,6 +15,7 @@ from helm.common.general import (
12
15
 
13
16
 
14
17
  def test_ensure_file_downloaded():
18
+ pytest.skip("Skipping download tests because these downloads are not reliable and may be throttled")
15
19
  ensure_file_downloaded("https://ftp.gnu.org/gnu/tar/tar-1.34.tar.gz", "test-tar", unpack=True, unpack_type="untar")
16
20
  assert os.path.isdir("test-tar")
17
21
  shutil.rmtree("test-tar")
@@ -730,6 +730,13 @@ model_deployments:
730
730
  thinking_budget_tokens: 10000
731
731
  stream: true
732
732
 
733
+ - name: anthropic/claude-sonnet-4-5-20250929
734
+ model_name: anthropic/claude-sonnet-4-5-20250929
735
+ tokenizer_name: anthropic/claude
736
+ max_sequence_length: 200000
737
+ client_spec:
738
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
739
+
733
740
  - name: anthropic/stanford-online-all-v4-s3
734
741
  deprecated: true # Closed model, not accessible via API
735
742
  model_name: anthropic/stanford-online-all-v4-s3
@@ -861,6 +868,20 @@ model_deployments:
861
868
  parse_thinking: true
862
869
  disable_logprobs: True
863
870
 
871
+ - name: together/deepseek-r1-distill-llama-70b
872
+ model_name: deepseek-ai/deepseek-r1-distill-llama-70b
873
+ tokenizer_name: deepseek-ai/deepseek-r1-distill-llama-70b
874
+ max_sequence_length: 131072
875
+ client_spec:
876
+ class_name: "helm.clients.together_client.TogetherChatClient"
877
+
878
+ - name: together/deepseek-r1-distill-qwen-14b
879
+ model_name: deepseek-ai/deepseek-r1-distill-qwen-14b
880
+ tokenizer_name: deepseek-ai/deepseek-r1-distill-qwen-14b
881
+ max_sequence_length: 131072
882
+ client_spec:
883
+ class_name: "helm.clients.together_client.TogetherChatClient"
884
+
864
885
  # Gooseai
865
886
 
866
887
  # TODO: Migrate these models to use OpenAIClient
@@ -1088,6 +1109,14 @@ model_deployments:
1088
1109
  # - https://cloud.google.com/vertex-ai/generative-ai/docs/learn/locations#global-endpoint
1089
1110
  location: global
1090
1111
 
1112
+ - name: google/gemini-2.5-flash-lite
1113
+ model_name: google/gemini-2.5-flash-lite
1114
+ tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
1115
+ max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash
1116
+ # TODO: Max output tokens: 65536
1117
+ client_spec:
1118
+ class_name: "helm.clients.vertexai_client.VertexAIChatClient"
1119
+
1091
1120
  - name: google/gemini-2.5-flash-preview-04-17
1092
1121
  model_name: google/gemini-2.5-flash-preview-04-17
1093
1122
  tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
@@ -2616,6 +2645,27 @@ model_deployments:
2616
2645
  client_spec:
2617
2646
  class_name: "helm.clients.openai_client.OpenAIClient"
2618
2647
 
2648
+ - name: openai/gpt-5-2025-08-07
2649
+ model_name: openai/gpt-5-2025-08-07
2650
+ tokenizer_name: openai/o200k_base
2651
+ max_sequence_length: 400000
2652
+ client_spec:
2653
+ class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
2654
+
2655
+ - name: openai/gpt-5-mini-2025-08-07
2656
+ model_name: openai/gpt-5-mini-2025-08-07
2657
+ tokenizer_name: openai/o200k_base
2658
+ max_sequence_length: 400000
2659
+ client_spec:
2660
+ class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
2661
+
2662
+ - name: openai/gpt-5-nano-2025-08-07
2663
+ model_name: openai/gpt-5-nano-2025-08-07
2664
+ tokenizer_name: openai/o200k_base
2665
+ max_sequence_length: 400000
2666
+ client_spec:
2667
+ class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
2668
+
2619
2669
  - name: openai/whisper-1_gpt-4o-2024-11-20
2620
2670
  model_name: openai/whisper-1_gpt-4o-2024-11-20
2621
2671
  tokenizer_name: openai/o200k_base
@@ -2860,6 +2910,23 @@ model_deployments:
2860
2910
  openai_model_name: o3-pro-2025-06-10
2861
2911
  reasoning_effort: high
2862
2912
 
2913
+ ## GPT-OSS
2914
+ - name: together/gpt-oss-20b
2915
+ model_name: openai/gpt-oss-20b
2916
+ tokenizer_name: openai/o200k_harmony
2917
+ # Source: https://platform.openai.com/docs/models/gpt-oss-20b
2918
+ max_sequence_length: 131072
2919
+ client_spec:
2920
+ class_name: "helm.clients.together_client.TogetherChatClient"
2921
+
2922
+ - name: together/gpt-oss-120b
2923
+ model_name: openai/gpt-oss-120b
2924
+ tokenizer_name: openai/o200k_harmony
2925
+ # Source: https://platform.openai.com/docs/models/gpt-oss-120b
2926
+ max_sequence_length: 131072
2927
+ client_spec:
2928
+ class_name: "helm.clients.together_client.TogetherChatClient"
2929
+
2863
2930
  ## Text Similarity Models
2864
2931
  # OpenAI similarity embedding models: https://beta.openai.com/docs/guides/embeddings
2865
2932
  # The number of parameters is guessed based on the number of parameters of the
@@ -3541,6 +3608,16 @@ model_deployments:
3541
3608
  args:
3542
3609
  together_model: togethercomputer/RedPajama-INCITE-7B-Instruct
3543
3610
 
3611
+ ## Z.ai
3612
+ - name: together/glm-4.5-air-fp8
3613
+ model_name: zai-org/glm-4.5-air-fp8
3614
+ tokenizer_name: zai-org/glm-4.5-air-fp8
3615
+ max_sequence_length: 131072
3616
+ client_spec:
3617
+ class_name: "helm.clients.together_client.TogetherChatClient"
3618
+ args:
3619
+ parse_thinking: true
3620
+
3544
3621
  - name: thudm/cogview2
3545
3622
  model_name: thudm/cogview2
3546
3623
  tokenizer_name: openai/clip-vit-large-patch14
@@ -3816,7 +3893,25 @@ model_deployments:
3816
3893
  class_name: "helm.clients.together_client.TogetherChatClient"
3817
3894
  args:
3818
3895
  parse_thinking: true
3819
-
3896
+
3897
+ - name: together/qwen3-next-80b-a3b-thinking
3898
+ model_name: qwen/qwen3-next-80b-a3b-thinking
3899
+ tokenizer_name: qwen/qwen3-next-80b-a3b-thinking
3900
+ max_sequence_length: 262144
3901
+ client_spec:
3902
+ class_name: "helm.clients.together_client.TogetherChatClient"
3903
+ args:
3904
+ parse_thinking: true
3905
+
3906
+ - name: together/qwen3-235b-a22b-instruct-2507-fp8
3907
+ model_name: qwen/qwen3-235b-a22b-instruct-2507-fp8
3908
+ tokenizer_name: qwen/qwen3-235b-a22b-instruct-2507-fp8
3909
+ max_sequence_length: 262144
3910
+ client_spec:
3911
+ class_name: "helm.clients.together_client.TogetherChatClient"
3912
+ args:
3913
+ together_model: Qwen/Qwen3-235B-A22B-Instruct-2507-tput
3914
+
3820
3915
  - name: huggingface/qwen2.5-7b-instruct-4bit
3821
3916
  model_name: qwen/qwen2.5-7b-instruct
3822
3917
  tokenizer_name: qwen/qwen2.5-7b-instruct
@@ -4256,6 +4351,201 @@ model_deployments:
4256
4351
  args:
4257
4352
  pretrained_model_name_or_path: deepseek-ai/deepseek-coder-6.7b-instruct
4258
4353
 
4354
+ # AceGPT-v2
4355
+ - name: huggingface/acegpt-v2-8b-chat
4356
+ model_name: freedomintelligence/acegpt-v2-8b-chat
4357
+ tokenizer_name: freedomintelligence/acegpt-v2-8b-chat
4358
+ max_sequence_length: 8192
4359
+ client_spec:
4360
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4361
+ args:
4362
+ device_map: auto
4363
+
4364
+ - name: huggingface/acegpt-v2-32b-chat
4365
+ model_name: freedomintelligence/acegpt-v2-32b-chat
4366
+ tokenizer_name: freedomintelligence/acegpt-v2-32b-chat
4367
+ max_sequence_length: 32768
4368
+ client_spec:
4369
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4370
+ args:
4371
+ device_map: auto
4372
+
4373
+ - name: huggingface/acegpt-v2-70b-chat
4374
+ model_name: freedomintelligence/acegpt-v2-70b-chat
4375
+ tokenizer_name: freedomintelligence/acegpt-v2-70b-chat
4376
+ max_sequence_length: 8192
4377
+ client_spec:
4378
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4379
+ args:
4380
+ device_map: auto
4381
+
4382
+ # ALLaM
4383
+ - name: huggingface/allam-7b-instruct-preview
4384
+ model_name: allam-ai/allam-7b-instruct-preview
4385
+ tokenizer_name: allam-ai/allam-7b-instruct-preview
4386
+ max_sequence_length: 4096
4387
+ client_spec:
4388
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4389
+ args:
4390
+ device_map: auto
4391
+
4392
+ # SILMA
4393
+ - name: huggingface/silma-9b-instruct-v1.0
4394
+ model_name: silma-ai/silma-9b-instruct-v1.0
4395
+ tokenizer_name: silma-ai/silma-9b-instruct-v1.0
4396
+ max_sequence_length: 8192
4397
+ client_spec:
4398
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4399
+ args:
4400
+ device_map: auto
4401
+
4402
+ # Jais Family
4403
+ #
4404
+ # NOTE: Jais Family models require `transformers<=4.52.3`.
4405
+ # On more recent versions of transformers, one of the following errors might occur:
4406
+ #
4407
+ # File "/path/to/site-packages/transformers/models/gemma3n/configuration_gemma3n.py", line 31, in <module>
4408
+ # from timm.data import ImageNetInfo, infer_imagenet_subset
4409
+ # ImportError: cannot import name 'ImageNetInfo' from 'timm.data' (/path/to/site-packages/timm/data/__init__.py)
4410
+ #
4411
+ # File "/path/to/.cache/huggingface/modules/transformers_modules/inceptionai/jais-family-590m-chat/90ac4769212b4964c6e81e183140224628228365/modeling_jais.py", line 899, in forward
4412
+ # past_length = past_key_values[0][0].size(-2)
4413
+ # AttributeError: 'NoneType' object has no attribute 'size'
4414
+
4415
+ - name: huggingface/jais-family-590m-chat
4416
+ model_name: inceptionai/jais-family-590m-chat
4417
+ tokenizer_name: inceptionai/jais-family-590m-chat
4418
+ max_sequence_length: 2048
4419
+ client_spec:
4420
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4421
+ args:
4422
+ trust_remote_code: true
4423
+ revision: 90ac4769212b4964c6e81e183140224628228365
4424
+
4425
+ - name: huggingface/jais-family-1p3b-chat
4426
+ model_name: inceptionai/jais-family-1p3b-chat
4427
+ tokenizer_name: inceptionai/jais-family-590m-chat
4428
+ max_sequence_length: 2048
4429
+ client_spec:
4430
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4431
+ args:
4432
+ trust_remote_code: true
4433
+ revision: 4b93176e2cb00f369b3bc0a8786e4cf16260c804
4434
+
4435
+ - name: huggingface/jais-family-2p7b-chat
4436
+ model_name: inceptionai/jais-family-2p7b-chat
4437
+ tokenizer_name: inceptionai/jais-family-590m-chat
4438
+ max_sequence_length: 2048
4439
+ client_spec:
4440
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4441
+ args:
4442
+ trust_remote_code: true
4443
+ revision: b2bf5d1bcd969ce868f66fb1ad8c3480289ea6b2
4444
+
4445
+ - name: huggingface/jais-family-6p7b-chat
4446
+ model_name: inceptionai/jais-family-6p7b-chat
4447
+ tokenizer_name: inceptionai/jais-family-590m-chat
4448
+ max_sequence_length: 2048
4449
+ client_spec:
4450
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4451
+ args:
4452
+ device_map: auto
4453
+ trust_remote_code: true
4454
+ revision: 683805efe6126c6536feb4aa23317e70222ac94c
4455
+
4456
+ - name: huggingface/jais-family-13b-chat
4457
+ model_name: inceptionai/jais-family-13b-chat
4458
+ tokenizer_name: inceptionai/jais-family-590m-chat
4459
+ max_sequence_length: 2048
4460
+ client_spec:
4461
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4462
+ args:
4463
+ device_map: auto
4464
+ trust_remote_code: true
4465
+ revision: 0ef8b4f80429609890816d912b331d3b95864707
4466
+
4467
+ - name: huggingface/jais-family-30b-8k-chat
4468
+ model_name: inceptionai/jais-family-30b-8k-chat
4469
+ tokenizer_name: inceptionai/jais-family-590m-chat
4470
+ max_sequence_length: 8192
4471
+ client_spec:
4472
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4473
+ args:
4474
+ device_map: auto
4475
+ trust_remote_code: true
4476
+ revision: dab185164dd3b79ec9201d7f4cf878ce91ae7e14
4477
+
4478
+ - name: huggingface/jais-family-30b-16k-chat
4479
+ model_name: inceptionai/jais-family-30b-16k-chat
4480
+ tokenizer_name: inceptionai/jais-family-590m-chat
4481
+ max_sequence_length: 16384
4482
+ client_spec:
4483
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4484
+ args:
4485
+ device_map: auto
4486
+ trust_remote_code: true
4487
+ revision: 369f88eeee4d313155f1b1dca4ebec90f9f9f2a4
4488
+
4489
+ # Jais Adapted
4490
+ - name: huggingface/jais-adapted-7b-chat
4491
+ model_name: inceptionai/jais-adapted-7b-chat
4492
+ tokenizer_name: inceptionai/jais-adapted-7b-chat
4493
+ max_sequence_length: 4096
4494
+ client_spec:
4495
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4496
+ args:
4497
+ device_map: auto
4498
+
4499
+ - name: huggingface/jais-adapted-13b-chat
4500
+ model_name: inceptionai/jais-adapted-13b-chat
4501
+ tokenizer_name: inceptionai/jais-adapted-7b-chat
4502
+ max_sequence_length: 4096
4503
+ client_spec:
4504
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4505
+ args:
4506
+ device_map: auto
4507
+
4508
+ - name: huggingface/jais-adapted-70b-chat
4509
+ model_name: inceptionai/jais-adapted-70b-chat
4510
+ tokenizer_name: inceptionai/jais-adapted-7b-chat
4511
+ max_sequence_length: 4096
4512
+ client_spec:
4513
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4514
+ args:
4515
+ device_map: auto
4516
+
4517
+ - name: huggingface/falcon3-1b-instruct
4518
+ model_name: tiiuae/falcon3-1b-instruct
4519
+ tokenizer_name: tiiuae/falcon3-1b-instruct
4520
+ max_sequence_length: 8192
4521
+ client_spec:
4522
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4523
+
4524
+ - name: huggingface/falcon3-3b-instruct
4525
+ model_name: tiiuae/falcon3-3b-instruct
4526
+ tokenizer_name: tiiuae/falcon3-1b-instruct
4527
+ max_sequence_length: 32768
4528
+ client_spec:
4529
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4530
+
4531
+ - name: huggingface/falcon3-7b-instruct
4532
+ model_name: tiiuae/falcon3-7b-instruct
4533
+ tokenizer_name: tiiuae/falcon3-7b-instruct
4534
+ max_sequence_length: 32768
4535
+ client_spec:
4536
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4537
+ args:
4538
+ device_map: auto
4539
+
4540
+ - name: huggingface/falcon3-10b-instruct
4541
+ model_name: tiiuae/falcon3-10b-instruct
4542
+ tokenizer_name: tiiuae/falcon3-1b-instruct
4543
+ max_sequence_length: 32768
4544
+ client_spec:
4545
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4546
+ args:
4547
+ device_map: auto
4548
+
4259
4549
  # IBM WatsonX
4260
4550
  - name: ibm/llama-3.3-70b-instruct
4261
4551
  model_name: meta/llama-3.3-70b-instruct
@@ -4590,3 +4880,12 @@ model_deployments:
4590
4880
  class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4591
4881
  args:
4592
4882
  pretrained_model_name_or_path: nicholasKluge/TeenyTinyLlama-460m
4883
+
4884
+ - name: openrouter/mistral-medium-3.1
4885
+ model_name: mistralai/mistral-medium-3.1
4886
+ tokenizer_name: mistralai/Mistral-7B-v0.1
4887
+ max_sequence_length: 128000
4888
+ client_spec:
4889
+ class_name: "helm.clients.openrouter_client.OpenRouterClient"
4890
+ args:
4891
+ model_name: mistralai/mistral-medium-3.1