crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (333) hide show
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/bbq_metrics.py +12 -0
  25. helm/benchmark/metrics/classification_metrics.py +19 -1
  26. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  27. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  28. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  29. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  30. helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
  31. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  32. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  33. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  35. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  36. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  37. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  38. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  39. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  40. helm/benchmark/metrics/medec_metrics.py +25 -2
  41. helm/benchmark/metrics/metric.py +25 -0
  42. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  43. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  44. helm/benchmark/metrics/safety_metrics.py +13 -1
  45. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  46. helm/benchmark/metrics/summac/model_summac.py +2 -2
  47. helm/benchmark/metrics/summarization_metrics.py +129 -1
  48. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  49. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  50. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  51. helm/benchmark/presentation/run_display.py +13 -3
  52. helm/benchmark/presentation/run_entry.py +2 -2
  53. helm/benchmark/presentation/schema.py +5 -22
  54. helm/benchmark/presentation/summarize.py +180 -11
  55. helm/benchmark/presentation/taxonomy_info.py +20 -0
  56. helm/benchmark/run.py +1 -1
  57. helm/benchmark/run_expander.py +4 -0
  58. helm/benchmark/run_specs/arabic_run_specs.py +140 -16
  59. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  60. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  61. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  62. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  63. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  64. helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
  65. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
  66. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  67. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  68. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  69. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  70. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  71. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  72. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  73. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  74. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
  75. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
  76. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
  77. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
  78. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
  79. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  80. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  81. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  82. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  83. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  84. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  85. helm/benchmark/scenarios/bold_scenario.py +15 -0
  86. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  87. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  88. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  89. helm/benchmark/scenarios/clear_scenario.py +23 -0
  90. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  91. helm/benchmark/scenarios/code_scenario.py +28 -0
  92. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  93. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  94. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  95. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  96. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  97. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  98. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  99. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  100. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  101. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  102. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  103. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  104. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  105. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  106. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  107. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  108. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  109. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  110. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  111. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  112. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  113. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  114. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  115. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  116. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  117. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  118. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  119. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  120. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  121. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  122. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  123. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  124. helm/benchmark/scenarios/ice_scenario.py +21 -1
  125. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  126. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  127. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
  128. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  129. helm/benchmark/scenarios/koala_scenario.py +21 -1
  130. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  131. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  132. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  133. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  134. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  135. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  136. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  137. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  138. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  139. helm/benchmark/scenarios/math_scenario.py +33 -0
  140. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  141. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  142. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  143. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  144. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  145. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  146. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  147. helm/benchmark/scenarios/medec_scenario.py +23 -0
  148. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  149. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  150. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  151. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  152. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  153. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  154. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  155. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  156. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  157. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  158. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  159. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  160. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  161. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  162. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  163. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  164. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  165. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  166. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  167. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  168. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  169. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  170. helm/benchmark/scenarios/quac_scenario.py +14 -0
  171. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  172. helm/benchmark/scenarios/raft_scenario.py +15 -0
  173. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  174. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  175. helm/benchmark/scenarios/scenario.py +31 -0
  176. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  177. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  178. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  179. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  180. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  181. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  182. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  183. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  184. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  185. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  186. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  187. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  188. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  189. helm/benchmark/scenarios/spider_scenario.py +18 -0
  190. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  191. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  192. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  193. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  194. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  195. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  196. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  197. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  198. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  199. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  200. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  201. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  202. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  203. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  204. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  205. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  206. helm/benchmark/static/schema_arabic.yaml +55 -12
  207. helm/benchmark/static/schema_long_context.yaml +11 -30
  208. helm/benchmark/static/schema_medhelm.yaml +36 -0
  209. helm/benchmark/static/schema_slp.yaml +219 -0
  210. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  211. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  212. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  213. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  214. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  215. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  216. helm/benchmark/static_build/index.html +5 -6
  217. helm/clients/ai21_client.py +2 -0
  218. helm/clients/aleph_alpha_client.py +2 -0
  219. helm/clients/anthropic_client.py +7 -1
  220. helm/clients/audio_language/diva_llama_client.py +2 -0
  221. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  222. helm/clients/audio_language/llama_omni/constants.py +9 -0
  223. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  224. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  225. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  226. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  227. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  228. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  229. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  230. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  231. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  232. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  233. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  234. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  235. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  236. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  237. helm/clients/audio_language/llama_omni/utils.py +202 -0
  238. helm/clients/audio_language/llama_omni_client.py +2 -1
  239. helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
  240. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  241. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  242. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  243. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  244. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  245. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  246. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  247. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  248. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  249. helm/clients/bedrock_client.py +2 -0
  250. helm/clients/cohere_client.py +3 -0
  251. helm/clients/google_client.py +2 -0
  252. helm/clients/http_model_client.py +2 -0
  253. helm/clients/huggingface_client.py +2 -1
  254. helm/clients/ibm_client.py +3 -1
  255. helm/clients/image_generation/adobe_vision_client.py +2 -0
  256. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  257. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  258. helm/clients/image_generation/cogview2_client.py +2 -1
  259. helm/clients/image_generation/dalle2_client.py +2 -0
  260. helm/clients/image_generation/dalle_mini_client.py +2 -1
  261. helm/clients/image_generation/deep_floyd_client.py +2 -0
  262. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  263. helm/clients/image_generation/lexica_client.py +2 -0
  264. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  265. helm/clients/image_generation/mindalle_client.py +2 -1
  266. helm/clients/image_generation/together_image_generation_client.py +2 -0
  267. helm/clients/megatron_client.py +2 -0
  268. helm/clients/mistral_client.py +2 -0
  269. helm/clients/moderation_api_client.py +2 -0
  270. helm/clients/openai_client.py +36 -20
  271. helm/clients/openai_responses_client.py +27 -3
  272. helm/clients/openrouter_client.py +31 -0
  273. helm/clients/palmyra_client.py +2 -1
  274. helm/clients/reka_client.py +2 -1
  275. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  276. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  277. helm/clients/test_openrouter_client.py +69 -0
  278. helm/clients/together_client.py +52 -11
  279. helm/clients/vertexai_client.py +12 -2
  280. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  281. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  282. helm/clients/vision_language/idefics_client.py +2 -1
  283. helm/clients/vision_language/open_flamingo_client.py +2 -1
  284. helm/clients/vision_language/paligemma_client.py +2 -1
  285. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  286. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  287. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  288. helm/clients/writer_client.py +2 -0
  289. helm/common/hierarchical_logger.py +20 -0
  290. helm/common/optional_dependencies.py +1 -1
  291. helm/common/test_general.py +4 -0
  292. helm/config/model_deployments.yaml +300 -1
  293. helm/config/model_metadata.yaml +302 -9
  294. helm/config/tokenizer_configs.yaml +92 -4
  295. helm/proxy/example_queries.py +8 -8
  296. helm/proxy/server.py +2 -1
  297. helm/proxy/static/index.css +4 -0
  298. helm/proxy/static/index.js +7 -1
  299. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  300. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  301. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  302. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  303. helm/benchmark/metrics/medalign_metrics.py +0 -14
  304. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  305. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  306. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  307. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  308. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  309. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  310. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  311. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  312. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  313. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  314. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  315. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  316. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  317. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
  318. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
  319. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
  320. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
  321. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  322. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  323. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  324. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  325. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  326. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  327. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  328. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  329. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  330. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  331. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  332. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  333. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
@@ -278,7 +278,7 @@ models:
278
278
  # https://aws.amazon.com/ai/generative-ai/nova/
279
279
  - name: amazon/nova-premier-v1:0
280
280
  display_name: Amazon Nova Premier
281
- description: Amazon Nova Premier is the most capable model in the Nova family of foundation models. ([blog](https://aws.amazon.com/blogs/aws/amazon-nova-premier-our-most-capable-model-for-complex-tasks-and-teacher-for-model-distillation/))
281
+ description: Amazon Nova Premier is a capable multimodal foundation model and teacher for model distillation that processes text, images, and videos with a one-million token context window. ([model card](https://www.amazon.science/publications/amazon-nova-premier-technical-report-and-model-card), [blog](https://aws.amazon.com/blogs/aws/amazon-nova-premier-our-most-capable-model-for-complex-tasks-and-teacher-for-model-distillation/))
282
282
  creator_organization_name: Amazon
283
283
  access: limited
284
284
  release_date: 2025-04-30
@@ -286,7 +286,7 @@ models:
286
286
 
287
287
  - name: amazon/nova-pro-v1:0
288
288
  display_name: Amazon Nova Pro
289
- description: Amazon Nova Pro Model
289
+ description: Amazon Nova Pro is a highly capable multimodal model that balances accuracy, speed, and cost for a wide range of tasks. ([model card](https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card))
290
290
  creator_organization_name: Amazon
291
291
  access: limited
292
292
  release_date: 2024-12-03
@@ -294,7 +294,7 @@ models:
294
294
 
295
295
  - name: amazon/nova-lite-v1:0
296
296
  display_name: Amazon Nova Lite
297
- description: Amazon Nova Lite Model
297
+ description: Amazon Nova Lite is a low-cost multimodal model that is fast for processing images, video, documents and text. ([model card](https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card))
298
298
  creator_organization_name: Amazon
299
299
  access: limited
300
300
  release_date: 2024-12-03
@@ -302,7 +302,7 @@ models:
302
302
 
303
303
  - name: amazon/nova-micro-v1:0
304
304
  display_name: Amazon Nova Micro
305
- description: Amazon Nova Micro Model
305
+ description: Amazon Nova Micro is a text-only model that delivers low-latency responses at low cost. ([model card](https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card))
306
306
  creator_organization_name: Amazon
307
307
  access: limited
308
308
  release_date: 2024-12-03
@@ -555,6 +555,14 @@ models:
555
555
  release_date: 2025-05-14
556
556
  tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
557
557
 
558
+ - name: anthropic/claude-sonnet-4-5-20250929
559
+ display_name: Claude 4.5 Sonnet (20250929)
560
+ description: Claude 4.5 Sonnet is a model from Anthropic that shows particular strengths in software coding, in agentic tasks where it runs in a loop and uses tools, and in using computers. ([blog](https://www.anthropic.com/news/claude-sonnet-4-5), [system card](https://assets.anthropic.com/m/12f214efcc2f457a/original/Claude-Sonnet-4-5-System-Card.pdf))
561
+ creator_organization_name: Anthropic
562
+ access: limited
563
+ release_date: 2025-09-29
564
+ tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
565
+
558
566
  - name: anthropic/stanford-online-all-v4-s3
559
567
  display_name: Anthropic-LM v4-s3 (52B)
560
568
  description: A 52B parameter language model, trained using reinforcement learning from human feedback [paper](https://arxiv.org/pdf/2204.05862.pdf).
@@ -946,6 +954,24 @@ models:
946
954
  release_date: 2025-01-20
947
955
  tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
948
956
 
957
+ - name: deepseek-ai/deepseek-r1-distill-llama-70b
958
+ display_name: DeepSeek-R1-Distill-Llama-70B
959
+ description: DeepSeek-R1-Distill-Llama-70B is a fine-tuned open-source model based on Llama-3.3-70B-Instruct using samples generated by DeepSeek-R1.
960
+ creator_organization_name: DeepSeek
961
+ access: open
962
+ num_parameters: 70600000000
963
+ release_date: 2025-01-20
964
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
965
+
966
+ - name: deepseek-ai/deepseek-r1-distill-qwen-14b
967
+ display_name: DeepSeek-R1-Distill-Qwen-14B
968
+ description: DeepSeek-R1-Distill-Qwen-14B is a fine-tuned open-source model based on Qwen2.5-14B using samples generated by DeepSeek-R1.
969
+ creator_organization_name: DeepSeek
970
+ access: open
971
+ num_parameters: 14800000000
972
+ release_date: 2025-01-20
973
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
974
+
949
975
  - name: deepseek-ai/deepseek-coder-6.7b-instruct
950
976
  display_name: DeepSeek-Coder-6.7b-Instruct
951
977
  description: DeepSeek-Coder-6.7b-Instruct is a model that is fine-tuned from the LLaMA 6.7B model for the DeepSeek-Coder task.
@@ -1207,7 +1233,7 @@ models:
1207
1233
 
1208
1234
  - name: google/gemini-2.0-flash-001
1209
1235
  display_name: Gemini 2.0 Flash
1210
- description: Gemini 2.0 Flash ([documentation](https://ai.google.dev/gemini-api/docs/models/gemini))
1236
+ description: Gemini 2.0 Flash is a member of the Gemini 2.0 series of models, a suite of highly-capable, natively multimodal models designed to power agentic systems. ([model card](https://storage.googleapis.com/model-cards/documents/gemini-2-flash.pdf), [documentation](https://ai.google.dev/gemini-api/docs/models/gemini))
1211
1237
  creator_organization_name: Google
1212
1238
  access: limited
1213
1239
  release_date: 2025-02-01
@@ -1215,7 +1241,7 @@ models:
1215
1241
 
1216
1242
  - name: google/gemini-2.0-flash-lite-preview-02-05
1217
1243
  display_name: Gemini 2.0 Flash Lite (02-05 preview)
1218
- description: Gemini 2.0 Flash Lite (02-05 preview) ([documentation](https://ai.google.dev/gemini-api/docs/models/gemini))
1244
+ description: Gemini 2.0 Flash Lite (02-05 preview) ([model card](https://storage.googleapis.com/model-cards/documents/gemini-2-flash.pdf), [documentation](https://ai.google.dev/gemini-api/docs/models/gemini))
1219
1245
  creator_organization_name: Google
1220
1246
  access: limited
1221
1247
  release_date: 2025-02-05
@@ -1223,7 +1249,7 @@ models:
1223
1249
 
1224
1250
  - name: google/gemini-2.0-flash-lite-001
1225
1251
  display_name: Gemini 2.0 Flash Lite
1226
- description: Gemini 2.0 Flash Lite ([documentation](https://ai.google.dev/gemini-api/docs/models/gemini))
1252
+ description: Gemini 2.0 Flash Lite is the fastest and most cost efficient Flash model in the Gemini 2.0 series of models, a suite of highly-capable, natively multimodal models designed to power agentic systems. ([model card](https://storage.googleapis.com/model-cards/documents/gemini-2-flash.pdf), [documentation](https://ai.google.dev/gemini-api/docs/models/gemini))
1227
1253
  creator_organization_name: Google
1228
1254
  access: limited
1229
1255
  release_date: 2025-03-25
@@ -1253,6 +1279,14 @@ models:
1253
1279
  release_date: 2025-06-17
1254
1280
  tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, AUDIO_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
1255
1281
 
1282
+ - name: google/gemini-2.5-flash-lite
1283
+ display_name: Gemini 2.5 Flash-Lite
1284
+ description: Gemini 2.5 Flash-Lite ([blog](https://blog.google/products/gemini/gemini-2-5-model-family-expands/))
1285
+ creator_organization_name: Google
1286
+ access: limited
1287
+ release_date: 2025-07-22
1288
+ tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, AUDIO_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
1289
+
1256
1290
  - name: google/gemini-2.5-flash-preview-04-17
1257
1291
  display_name: Gemini 2.5 Flash (04-17 preview)
1258
1292
  description: Gemini 2.5 Flash (04-17 preview) ([documentation](https://ai.google.dev/gemini-api/docs/models/gemini))
@@ -2573,6 +2607,14 @@ models:
2573
2607
  release_date: 2025-05-07
2574
2608
  tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
2575
2609
 
2610
+ - name: mistralai/mistral-medium-3.1
2611
+ display_name: Mistral Medium 3.1
2612
+ description: Mistral Medium 3.1 is a language model that is intended to deliver state-of-the-art performance at lower cost. ([blog](https://mistral.ai/news/mistral-medium-3))
2613
+ creator_organization_name: Mistral AI
2614
+ access: limited
2615
+ release_date: 2025-05-07
2616
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
2617
+
2576
2618
  - name: mistralai/mistral-large-2402
2577
2619
  display_name: Mistral Large (2402)
2578
2620
  description: Mistral Large is a multilingual model with a 32K tokens context window and function-calling capabilities. ([blog](https://mistral.ai/news/mistral-large/))
@@ -3052,6 +3094,30 @@ models:
3052
3094
  release_date: 2025-04-14
3053
3095
  tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
3054
3096
 
3097
+ - name: openai/gpt-5-2025-08-07
3098
+ display_name: GPT-5 (2025-08-07)
3099
+ description: GPT-5 (2025-08-07) is a multimodal model trained for real-world coding tasks and long-running agentic tasks. ([blog](https://openai.com/index/introducing-gpt-5-for-developers/), [system card](https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf))
3100
+ creator_organization_name: OpenAI
3101
+ access: limited
3102
+ release_date: 2025-08-07
3103
+ tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
3104
+
3105
+ - name: openai/gpt-5-mini-2025-08-07
3106
+ display_name: GPT-5 mini (2025-08-07)
3107
+ description: GPT-5 mini (2025-08-07) is a multimodal model trained for real-world coding tasks and long-running agentic tasks. ([blog](https://openai.com/index/introducing-gpt-5-for-developers/), [system card](https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf))
3108
+ creator_organization_name: OpenAI
3109
+ access: limited
3110
+ release_date: 2025-08-07
3111
+ tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
3112
+
3113
+ - name: openai/gpt-5-nano-2025-08-07
3114
+ display_name: GPT-5 nano (2025-08-07)
3115
+ description: GPT-5 nano (2025-08-07) is a multimodal model trained for real-world coding tasks and long-running agentic tasks. ([blog](https://openai.com/index/introducing-gpt-5-for-developers/), [system card](https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf))
3116
+ creator_organization_name: OpenAI
3117
+ access: limited
3118
+ release_date: 2025-08-07
3119
+ tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
3120
+
3055
3121
  - name: openai/whisper-1_gpt-4o-2024-11-20
3056
3122
  display_name: Whisper-1 + GPT-4o (2024-11-20)
3057
3123
  description: Transcribes the text with Whisper-1 and then uses GPT-4o to generate a response.
@@ -3273,6 +3339,23 @@ models:
3273
3339
  release_date: 2025-06-10
3274
3340
  tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
3275
3341
 
3342
+ ## GPT-OSS
3343
+ - name: openai/gpt-oss-20b
3344
+ display_name: gpt-oss-20b
3345
+ description: gpt-oss-20b is an open-weight language model that was trained using a mix of reinforcement learning and other techniques informed by OpenAI's internal models. It uses a mixture-of-experts architecture and activates 3.6B parameters per token. ([blog](https://openai.com/index/introducing-gpt-oss/))
3346
+ creator_organization_name: OpenAI
3347
+ access: open
3348
+ release_date: 2025-08-05
3349
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
3350
+
3351
+ - name: openai/gpt-oss-120b
3352
+ display_name: gpt-oss-120b
3353
+ description: gpt-oss-120b is an open-weight language model that was trained using a mix of reinforcement learning and other techniques informed by OpenAI's internal models. It uses a mixture-of-experts architecture and activates 5.1B parameters per token. ([blog](https://openai.com/index/introducing-gpt-oss/))
3354
+ creator_organization_name: OpenAI
3355
+ access: open
3356
+ release_date: 2025-08-05
3357
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
3358
+
3276
3359
  ## Codex Models
3277
3360
  # DEPRECATED: Codex models have been shut down on March 23 2023.
3278
3361
 
@@ -3549,6 +3632,22 @@ models:
3549
3632
  release_date: 2025-04-29
3550
3633
  tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
3551
3634
 
3635
+ - name: qwen/qwen3-next-80b-a3b-thinking
3636
+ display_name: Qwen3-Next 80B A3B Thinking
3637
+ description: Qwen3-Next is a new model architecture for improving training and inference efficiency under long-context and large-parameter settings. Compared to the MoE structure of Qwen3, Qwen3-Next introduces a hybrid attention mechanism, a highly sparse Mixture-of-Experts (MoE) structure, training-stability-friendly optimizations, and a multi-token prediction mechanism for faster inference. ([blog](https://qwen.ai/blog?id=4074cca80393150c248e508aa62983f9cb7d27cd&from=research.latest-advancements-list))
3638
+ creator_organization_name: Qwen
3639
+ access: open
3640
+ release_date: 2025-07-21 # https://x.com/Alibaba_Qwen/status/1947344511988076547
3641
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
3642
+
3643
+ - name: qwen/qwen3-235b-a22b-instruct-2507-fp8
3644
+ display_name: Qwen3 235B A22B Instruct 2507 FP8
3645
+ description: Qwen3 235B A22B Instruct 2507 FP8 is an updated version of the non-thinking mode of Qwen3 235B A22B FP8.
3646
+ creator_organization_name: Qwen
3647
+ access: open
3648
+ release_date: 2025-07-21 # https://x.com/Alibaba_Qwen/status/1947344511988076547
3649
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
3650
+
3552
3651
  - name: qwen/qwq-32b-preview
3553
3652
  display_name: QwQ (32B Preview)
3554
3653
  description: QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities. ([blog post](https://qwenlm.github.io/blog/qwq-32b-preview/)).
@@ -3892,7 +3991,190 @@ models:
3892
3991
  release_date: 2023-05-25
3893
3992
  tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
3894
3993
 
3994
+ - name: tiiuae/falcon3-1b-instruct
3995
+ display_name: Falcon3-1B-Instruct
3996
+ description: Falcon3-1B-Instruct is an open-weights foundation model that supports 4 languages (English, French, Spanish, Portuguese) that was trained on 14T tokens.
3997
+ creator_organization_name: TII UAE
3998
+ access: open
3999
+ num_parameters: 1670000000
4000
+ release_date: 2024-12-17 # https://huggingface.co/docs/transformers/main/en/model_doc/falcon3
4001
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4002
+
4003
+ - name: tiiuae/falcon3-3b-instruct
4004
+ display_name: Falcon3-3B-Instruct
4005
+ description: Falcon3-3B-Instruct is an open-weights foundation model that supports 4 languages (English, French, Spanish, Portuguese) that was trained on 14T tokens.
4006
+ creator_organization_name: TII UAE
4007
+ access: open
4008
+ num_parameters: 3230000000
4009
+ release_date: 2024-12-17 # https://huggingface.co/docs/transformers/main/en/model_doc/falcon3
4010
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4011
+
4012
+ - name: tiiuae/falcon3-7b-instruct
4013
+ display_name: Falcon3-7B-Instruct
4014
+ description: Falcon3-7B-Instruct is an open-weights foundation model that supports 4 languages (English, French, Spanish, Portuguese) that was trained on 14T tokens.
4015
+ creator_organization_name: TII UAE
4016
+ access: open
4017
+ num_parameters: 7460000000
4018
+ release_date: 2024-12-17 # https://huggingface.co/docs/transformers/main/en/model_doc/falcon3
4019
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4020
+
4021
+ - name: tiiuae/falcon3-10b-instruct
4022
+ display_name: Falcon3-10B-Instruct
4023
+ description: Falcon3-10B-Instruct is an open-weights foundation model that supports 4 languages (English, French, Spanish, Portuguese) that was trained on 14T tokens.
4024
+ creator_organization_name: TII UAE
4025
+ access: open
4026
+ num_parameters: 10300000000
4027
+ release_date: 2024-12-17 # https://huggingface.co/docs/transformers/main/en/model_doc/falcon3
4028
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4029
+
4030
+ # AceGPT-v2
4031
+ - name: freedomintelligence/acegpt-v2-8b-chat
4032
+ display_name: AceGPT-v2-8B-Chat
4033
+ description: AceGPT is a fully fine-tuned generative text model collection, particularly focused on the Arabic language domain. AceGPT-v2-8B-Chat is based on Meta-Llama-3-8B. ([paper](https://arxiv.org/abs/2412.12310))
4034
+ creator_organization_name: FreedomAI
4035
+ access: open
4036
+ num_parameters: 8030000000
4037
+ release_date: 2024-10-20
4038
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4039
+
4040
+ - name: freedomintelligence/acegpt-v2-32b-chat
4041
+ display_name: AceGPT-v2-32B-Chat
4042
+ description: AceGPT is a fully fine-tuned generative text model collection, particularly focused on the Arabic language domain. AceGPT-v2-32B-Chat is based on Qwen1.5-32B. ([paper](https://arxiv.org/abs/2412.12310))
4043
+ creator_organization_name: FreedomAI
4044
+ access: open
4045
+ num_parameters: 32500000000
4046
+ release_date: 2024-10-20
4047
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4048
+
4049
+ - name: freedomintelligence/acegpt-v2-70b-chat
4050
+ display_name: AceGPT-v2-70B-Chat
4051
+ description: AceGPT is a fully fine-tuned generative text model collection, particularly focused on the Arabic language domain. AceGPT-v2-70B-Chat is based on Meta-Llama-3-70B. ([paper](https://arxiv.org/abs/2412.12310))
4052
+ creator_organization_name: FreedomAI
4053
+ access: open
4054
+ num_parameters: 70600000000
4055
+ release_date: 2024-10-20
4056
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4057
+
4058
+ # ALLaM
4059
+ - name: allam-ai/allam-7b-instruct-preview
4060
+ display_name: ALLaM-7B-Instruct-preview
4061
+ description: ALLaM-7B-Instruct-preview is a model designed to advance Arabic language technology, which used a recipe of training on 4T English tokens followed by training on 1.2T mixed Arabic/English tokens. ([paper](https://arxiv.org/abs/2407.15390v1))
4062
+ creator_organization_name: NCAI & SDAIA
4063
+ access: open
4064
+ num_parameters: 7000000000
4065
+ release_date: 2024-07-22
4066
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4067
+
4068
+ # SILMA
4069
+ - name: silma-ai/silma-9b-instruct-v1.0
4070
+ display_name: SILMA 9B
4071
+ description: SILMA 9B is a compact Arabic language model based on Google Gemma. ([model card](https://huggingface.co/silma-ai/SILMA-9B-Instruct-v1.0))
4072
+ creator_organization_name: SILMA AI
4073
+ access: open
4074
+ num_parameters: 9240000000
4075
+ release_date: 2024-08-17
4076
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4077
+
4078
+ # Jais Family
4079
+
4080
+ - name: inceptionai/jais-family-590m-chat
4081
+ display_name: Jais-family-590m-chat
4082
+ description: The Jais family of models is a series of bilingual English-Arabic large language models (LLMs) that are trained from scratch and optimized to excel in Arabic while having strong English capabilities. ([website](https://inceptionai.ai/jaisfamily/index.html), [blog](https://mbzuai.ac.ae/news/meet-jais-the-worlds-most-advanced-arabic-large-language-model-open-sourced-by-g42s-inception/))
4083
+ creator_organization_name: Inception
4084
+ access: open
4085
+ num_parameters: 771000000
4086
+ release_date: 2023-08-30
4087
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4088
+
4089
+ - name: inceptionai/jais-family-1p3b-chat
4090
+ display_name: Jais-family-1p3b-chat
4091
+ description: The Jais family of models is a series of bilingual English-Arabic large language models (LLMs) that are trained from scratch and optimized to excel in Arabic while having strong English capabilities. ([website](https://inceptionai.ai/jaisfamily/index.html), [blog](https://mbzuai.ac.ae/news/meet-jais-the-worlds-most-advanced-arabic-large-language-model-open-sourced-by-g42s-inception/))
4092
+ creator_organization_name: Inception
4093
+ access: open
4094
+ num_parameters: 1560000000
4095
+ release_date: 2023-08-30
4096
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4097
+
4098
+ - name: inceptionai/jais-family-2p7b-chat
4099
+ display_name: Jais-family-2p7b-chat
4100
+ description: The Jais family of models is a series of bilingual English-Arabic large language models (LLMs) that are trained from scratch and optimized to excel in Arabic while having strong English capabilities. ([website](https://inceptionai.ai/jaisfamily/index.html), [blog](https://mbzuai.ac.ae/news/meet-jais-the-worlds-most-advanced-arabic-large-language-model-open-sourced-by-g42s-inception/))
4101
+ creator_organization_name: Inception
4102
+ access: open
4103
+ num_parameters: 2950000000
4104
+ release_date: 2023-08-30
4105
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4106
+
4107
+ - name: inceptionai/jais-family-6p7b-chat
4108
+ display_name: Jais-family-6p7b-chat
4109
+ description: The Jais family of models is a series of bilingual English-Arabic large language models (LLMs) that are trained from scratch and optimized to excel in Arabic while having strong English capabilities. ([website](https://inceptionai.ai/jaisfamily/index.html), [blog](https://mbzuai.ac.ae/news/meet-jais-the-worlds-most-advanced-arabic-large-language-model-open-sourced-by-g42s-inception/))
4110
+ creator_organization_name: Inception
4111
+ access: open
4112
+ num_parameters: 7140000000
4113
+ release_date: 2023-08-30
4114
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4115
+
4116
+ - name: inceptionai/jais-family-6p7b-chat
4117
+ display_name: Jais-family-6p7b-chat
4118
+ description: The Jais family of models is a series of bilingual English-Arabic large language models (LLMs) that are trained from scratch and optimized to excel in Arabic while having strong English capabilities. ([website](https://inceptionai.ai/jaisfamily/index.html), [blog](https://mbzuai.ac.ae/news/meet-jais-the-worlds-most-advanced-arabic-large-language-model-open-sourced-by-g42s-inception/))
4119
+ creator_organization_name: Inception
4120
+ access: open
4121
+ num_parameters: 7140000000
4122
+ release_date: 2023-08-30
4123
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4124
+
4125
+ - name: inceptionai/jais-family-13b-chat
4126
+ display_name: Jais-family-13b-chat
4127
+ description: The Jais family of models is a series of bilingual English-Arabic large language models (LLMs) that are trained from scratch and optimized to excel in Arabic while having strong English capabilities. ([website](https://inceptionai.ai/jaisfamily/index.html), [blog](https://mbzuai.ac.ae/news/meet-jais-the-worlds-most-advanced-arabic-large-language-model-open-sourced-by-g42s-inception/))
4128
+ creator_organization_name: Inception
4129
+ access: open
4130
+ num_parameters: 13500000000
4131
+ release_date: 2023-08-30
4132
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
3895
4133
 
4134
+ - name: inceptionai/jais-family-30b-8k-chat
4135
+ display_name: Jais-family-30b-8k-chat
4136
+ description: The Jais family of models is a series of bilingual English-Arabic large language models (LLMs) that are trained from scratch and optimized to excel in Arabic while having strong English capabilities. ([website](https://inceptionai.ai/jaisfamily/index.html), [blog](https://mbzuai.ac.ae/news/meet-jais-the-worlds-most-advanced-arabic-large-language-model-open-sourced-by-g42s-inception/))
4137
+ creator_organization_name: Inception
4138
+ access: open
4139
+ num_parameters: 30800000000
4140
+ release_date: 2023-08-30
4141
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4142
+
4143
+ - name: inceptionai/jais-family-30b-16k-chat
4144
+ display_name: Jais-family-30b-16k-chat
4145
+ description: The Jais family of models is a series of bilingual English-Arabic large language models (LLMs) that are trained from scratch and optimized to excel in Arabic while having strong English capabilities. ([website](https://inceptionai.ai/jaisfamily/index.html), [blog](https://mbzuai.ac.ae/news/meet-jais-the-worlds-most-advanced-arabic-large-language-model-open-sourced-by-g42s-inception/))
4146
+ creator_organization_name: Inception
4147
+ access: open
4148
+ num_parameters: 30800000000
4149
+ release_date: 2023-08-30
4150
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4151
+
4152
+ - name: inceptionai/jais-adapted-7b-chat
4153
+ display_name: Jais-adapted-7b-chat
4154
+ description: The Jais adapted models are bilingual English-Arabic large language models (LLMs) that are trained adaptively from Llama-2 and optimized to excel in Arabic while having strong English capabilities. ([website](https://inceptionai.ai/jaisfamily/index.html), [blog](https://mbzuai.ac.ae/news/meet-jais-the-worlds-most-advanced-arabic-large-language-model-open-sourced-by-g42s-inception/))
4155
+ creator_organization_name: Inception
4156
+ access: open
4157
+ num_parameters: 7000000000
4158
+ release_date: 2023-08-30
4159
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4160
+
4161
+ - name: inceptionai/jais-adapted-13b-chat
4162
+ display_name: Jais-adapted-13b-chat
4163
+ description: The Jais adapted models are bilingual English-Arabic large language models (LLMs) that are trained adaptively from Llama-2 and optimized to excel in Arabic while having strong English capabilities. ([website](https://inceptionai.ai/jaisfamily/index.html), [blog](https://mbzuai.ac.ae/news/meet-jais-the-worlds-most-advanced-arabic-large-language-model-open-sourced-by-g42s-inception/))
4164
+ creator_organization_name: Inception
4165
+ access: open
4166
+ num_parameters: 13300000000
4167
+ release_date: 2023-08-30
4168
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4169
+
4170
+ - name: inceptionai/jais-adapted-70b-chat
4171
+ display_name: Jais-adapted-70b-chat
4172
+ description: The Jais adapted models are bilingual English-Arabic large language models (LLMs) that are trained adaptively from Llama-2 and optimized to excel in Arabic while having strong English capabilities. ([website](https://inceptionai.ai/jaisfamily/index.html), [blog](https://mbzuai.ac.ae/news/meet-jais-the-worlds-most-advanced-arabic-large-language-model-open-sourced-by-g42s-inception/))
4173
+ creator_organization_name: Inception
4174
+ access: open
4175
+ num_parameters: 69500000000
4176
+ release_date: 2023-08-30
4177
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
3896
4178
 
3897
4179
  # Together
3898
4180
  - name: together/gpt-jt-6b-v1
@@ -4315,6 +4597,17 @@ models:
4315
4597
  release_date: 2025-05-08
4316
4598
  tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4317
4599
 
4600
+ # Z.ai
4601
+
4602
+ - name: zai-org/glm-4.5-air-fp8
4603
+ display_name: GLM-4.5-Air-FP8
4604
+ description: GLM-4.5-Air-FP8 is a hybrid reasoning model designed to unify reasoning, coding, and agentic capabilities into a single model. It has 106 billion total parameters and 12 billion active parameters. The thinking mode is enabled by default. ([blog](https://z.ai/blog/glm-4.5))
4605
+ creator_organization_name: Z.ai
4606
+ access: open
4607
+ num_parameters: 110000000000
4608
+ release_date: 2025-07-28
4609
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4610
+
4318
4611
 
4319
4612
  # Granite - IBM
4320
4613
  # https://www.ibm.com/granite
@@ -4530,7 +4823,7 @@ models:
4530
4823
 
4531
4824
  - name: ibm/granite-3.3-8b-instruct
4532
4825
  display_name: IBM Granite 3.3 8B Instruct
4533
- description: IBM Granite 3.3 8B Instruct is a 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
4826
+ description: IBM Granite 3.3 8B Instruct is an 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
4534
4827
  creator_organization_name: IBM
4535
4828
  access: open
4536
4829
  num_parameters: 8170000000
@@ -4539,7 +4832,7 @@ models:
4539
4832
 
4540
4833
  - name: ibm/granite-3.3-8b-instruct-with-guardian
4541
4834
  display_name: IBM Granite 3.3 8B Instruct (with guardian)
4542
- description: IBM Granite 3.3 8B Instruct is a 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct)) This model was run with an additional safety filter using [Granite Guardian 3.2](https://www.ibm.com/granite/docs/models/guardian/).
4835
+ description: IBM Granite 3.3 8B Instruct is an 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. All prompts were first evaluated for risk by [IBM Granite Guardian 3.2 5B](https://www.ibm.com/granite/docs/models/guardian/) and prompts that were deemed risky (with a risk threshold of 0.8) received the response "I'm very sorry, but I can't assist with that.". ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
4543
4836
  creator_organization_name: IBM
4544
4837
  access: open
4545
4838
  num_parameters: 8170000000
@@ -460,7 +460,7 @@ tokenizer_configs:
460
460
 
461
461
  # Allen Institute for AI
462
462
  # The allenai/olmo-7b requires Python 3.9 or newer.
463
- # To use the allenai/olmo-7b tokenizer, run `pip install crfm-helm[allenai]` first.
463
+ # To use the allenai/olmo-7b tokenizer, run `pip install "crfm-helm[allenai]"` first.
464
464
  - name: allenai/olmo-7b
465
465
  tokenizer_spec:
466
466
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -650,6 +650,12 @@ tokenizer_configs:
650
650
  end_of_text_token: "<|endoftext|>"
651
651
  prefix_token: "<|endoftext|>"
652
652
 
653
+ - name: openai/o200k_harmony
654
+ tokenizer_spec:
655
+ class_name: "helm.tokenizers.tiktoken_tokenizer.TiktokenTokenizer"
656
+ end_of_text_token: "<|endoftext|>"
657
+ prefix_token: "<|startoftext|>"
658
+
653
659
  - name: openai/clip-vit-large-patch14
654
660
  tokenizer_spec:
655
661
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -705,6 +711,18 @@ tokenizer_configs:
705
711
  end_of_text_token: "<|im_end|>"
706
712
  prefix_token: "<|im_start|>"
707
713
 
714
+ - name: qwen/qwen3-235b-a22b-instruct-2507-fp8
715
+ tokenizer_spec:
716
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
717
+ end_of_text_token: "<|im_end|>"
718
+ prefix_token: ""
719
+
720
+ - name: qwen/qwen3-next-80b-a3b-thinking
721
+ tokenizer_spec:
722
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
723
+ end_of_text_token: "<|im_end|>"
724
+ prefix_token: ""
725
+
708
726
  - name: qwen/qwq-32b-preview
709
727
  tokenizer_spec:
710
728
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -785,6 +803,12 @@ tokenizer_configs:
785
803
  end_of_text_token: "<|endoftext|>"
786
804
  prefix_token: ""
787
805
 
806
+ - name: tiiuae/falcon3-1b-instruct
807
+ tokenizer_spec:
808
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
809
+ end_of_text_token: "<|endoftext|>"
810
+ prefix_token: ""
811
+
788
812
  # TsinghuaKEG
789
813
  - name: TsinghuaKEG/ice
790
814
  tokenizer_spec:
@@ -1048,7 +1072,6 @@ tokenizer_configs:
1048
1072
  end_of_text_token: ""
1049
1073
 
1050
1074
  # IBM Granite 3.3
1051
-
1052
1075
  - name: ibm/granite-3.3-8b-instruct
1053
1076
  tokenizer_spec:
1054
1077
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -1057,7 +1080,12 @@ tokenizer_configs:
1057
1080
  end_of_text_token: "<|end_of_text|>"
1058
1081
  prefix_token: "<|end_of_text|>"
1059
1082
 
1060
-
1083
+ # Z.ai GLM-4.5-AIR-FP8
1084
+ - name: zai-org/glm-4.5-air-fp8
1085
+ tokenizer_spec:
1086
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1087
+ end_of_text_token: "<|endoftext|>"
1088
+ prefix_token: ""
1061
1089
 
1062
1090
  # DeepSeek-R1-Distill-Llama-3.1-8b
1063
1091
  - name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
@@ -1068,6 +1096,20 @@ tokenizer_configs:
1068
1096
  end_of_text_token: "<|end▁of▁sentence|>"
1069
1097
  prefix_token: "<|begin▁of▁sentence|>"
1070
1098
 
1099
+ # DeepSeek-R1-Distill-Llama-70B
1100
+ - name: deepseek-ai/deepseek-r1-distill-llama-70b
1101
+ tokenizer_spec:
1102
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1103
+ end_of_text_token: "<|end▁of▁sentence|>"
1104
+ prefix_token: "<|begin▁of▁sentence|>"
1105
+
1106
+ # DeepSeek-R1-Distill-Qwen-14B
1107
+ - name: deepseek-ai/deepseek-r1-distill-qwen-14b
1108
+ tokenizer_spec:
1109
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1110
+ end_of_text_token: "<|end▁of▁sentence|>"
1111
+ prefix_token: "<|begin▁of▁sentence|>"
1112
+
1071
1113
  # deepseek-ai/deepseek-coder-6.7b-instruct
1072
1114
  - name: deepseek-ai/deepseek-coder-6.7b-instruct
1073
1115
  tokenizer_spec:
@@ -1077,7 +1119,6 @@ tokenizer_configs:
1077
1119
  end_of_text_token: "<|end▁of▁sentence|>"
1078
1120
  prefix_token: "<|begin▁of▁sentence|>"
1079
1121
 
1080
-
1081
1122
  # vilm/vinallama-2.7b-chat
1082
1123
  - name: vilm/vinallama-2.7b-chat
1083
1124
  tokenizer_spec:
@@ -1185,3 +1226,50 @@ tokenizer_configs:
1185
1226
  pretrained_model_name_or_path: nicholasKluge/TeenyTinyLlama-460m
1186
1227
  end_of_text_token: "</s>"
1187
1228
  prefix_token: "<s>"
1229
+
1230
+ # AceGPT-v2
1231
+ - name: freedomintelligence/acegpt-v2-8b-chat
1232
+ tokenizer_spec:
1233
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1234
+ end_of_text_token: "<|end_of_text|>"
1235
+ prefix_token: "<|begin_of_text|>"
1236
+
1237
+ - name: freedomintelligence/acegpt-v2-32b-chat
1238
+ tokenizer_spec:
1239
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1240
+ end_of_text_token: "<|endoftext|>"
1241
+ prefix_token: ""
1242
+
1243
+ - name: freedomintelligence/acegpt-v2-70b-chat
1244
+ tokenizer_spec:
1245
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1246
+ end_of_text_token: "<|end_of_text|>"
1247
+ prefix_token: "<|begin_of_text|>"
1248
+
1249
+ # ALLaM
1250
+ - name: allam-ai/allam-7b-instruct-preview
1251
+ tokenizer_spec:
1252
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1253
+ end_of_text_token: "</s>"
1254
+ prefix_token: "<s>"
1255
+
1256
+ # SILMA
1257
+ - name: silma-ai/silma-9b-instruct-v1.0
1258
+ tokenizer_spec:
1259
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1260
+ end_of_text_token: "<eos>"
1261
+ prefix_token: "<bos>"
1262
+
1263
+ # Jais Family
1264
+ - name: inceptionai/jais-family-590m-chat
1265
+ tokenizer_spec:
1266
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1267
+ end_of_text_token: "<|endoftext|>"
1268
+ prefix_token: "<|endoftext|>"
1269
+
1270
+ # Jais Adapted
1271
+ - name: inceptionai/jais-adapted-7b-chat
1272
+ tokenizer_spec:
1273
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1274
+ end_of_text_token: "</s>"
1275
+ prefix_token: "<s>"
@@ -21,7 +21,7 @@ example_queries = [
21
21
  """
22
22
  temperature: 0.5 # Medium amount of randomness
23
23
  stop_sequences: [.] # Stop when you hit a period
24
- model: openai/gpt-3.5-turbo-0613
24
+ model: openai/gpt-4.1-nano-2025-04-14
25
25
  """
26
26
  ),
27
27
  environments="",
@@ -33,7 +33,7 @@ example_queries = [
33
33
  temperature: 0.5 # Medium amount of randomness
34
34
  stop_sequences: [\\n] # Stop when you hit a newline
35
35
  num_completions: 5 # Generate many samples
36
- model: openai/gpt-3.5-turbo-0613
36
+ model: openai/gpt-4.1-nano-2025-04-14
37
37
  """
38
38
  ),
39
39
  environments="",
@@ -58,7 +58,7 @@ example_queries = [
58
58
  """
59
59
  temperature: 0 # Deterministic
60
60
  max_tokens: 50
61
- model: openai/gpt-3.5-turbo-0613
61
+ model: openai/gpt-4.1-nano-2025-04-14
62
62
  """
63
63
  ),
64
64
  environments="",
@@ -76,7 +76,7 @@ example_queries = [
76
76
  environments=dedent(
77
77
  """
78
78
  occupation: [mathematician, lawyer, doctor]
79
- model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
79
+ model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
80
80
  """
81
81
  ),
82
82
  ),
@@ -101,7 +101,7 @@ example_queries = [
101
101
  ),
102
102
  environments=dedent(
103
103
  """
104
- model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
104
+ model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
105
105
  """
106
106
  ),
107
107
  ),
@@ -136,7 +136,7 @@ example_queries = [
136
136
  ),
137
137
  environments=dedent(
138
138
  """
139
- model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
139
+ model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
140
140
  """
141
141
  ),
142
142
  ),
@@ -144,7 +144,7 @@ example_queries = [
144
144
  prompt="Write a Python function that takes two vectors a and b and returns their Euclidean distance.",
145
145
  settings=dedent(
146
146
  """
147
- model: openai/gpt-3.5-turbo-0613
147
+ model: openai/gpt-4.1-nano-2025-04-14
148
148
  """
149
149
  ),
150
150
  environments="",
@@ -161,7 +161,7 @@ example_queries = [
161
161
  ),
162
162
  environments=dedent(
163
163
  """
164
- model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
164
+ model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
165
165
  """
166
166
  ),
167
167
  ),