crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (311)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/classification_metrics.py +19 -1
  27. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  28. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  29. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  30. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  31. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  32. helm/benchmark/metrics/comet_metric.py +1 -1
  33. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  34. helm/benchmark/metrics/copyright_metrics.py +1 -1
  35. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  36. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  37. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  38. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  39. helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
  40. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  41. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  42. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  43. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  44. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  45. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  46. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  47. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  48. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  49. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  50. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  51. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  52. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  53. helm/benchmark/metrics/medec_metrics.py +25 -2
  54. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  55. helm/benchmark/metrics/metric.py +25 -0
  56. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  57. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  58. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  59. helm/benchmark/metrics/summac/model_summac.py +3 -3
  60. helm/benchmark/metrics/summarization_metrics.py +129 -1
  61. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  62. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  63. helm/benchmark/model_deployment_registry.py +11 -19
  64. helm/benchmark/presentation/create_plots.py +11 -2
  65. helm/benchmark/presentation/schema.py +10 -22
  66. helm/benchmark/presentation/summarize.py +189 -14
  67. helm/benchmark/presentation/taxonomy_info.py +20 -0
  68. helm/benchmark/presentation/test_create_plots.py +4 -1
  69. helm/benchmark/run.py +7 -1
  70. helm/benchmark/run_expander.py +4 -0
  71. helm/benchmark/run_specs/arabic_run_specs.py +191 -0
  72. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  73. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  74. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  75. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  76. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  77. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  78. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  79. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  80. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  81. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  82. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  83. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  84. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  85. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  86. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  87. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  88. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  89. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  90. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  91. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  92. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  93. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  94. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  95. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  96. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  97. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  98. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  99. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  100. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  101. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  102. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  103. helm/benchmark/scenarios/bold_scenario.py +15 -0
  104. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  105. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  106. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  107. helm/benchmark/scenarios/clear_scenario.py +23 -0
  108. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  109. helm/benchmark/scenarios/code_scenario.py +28 -0
  110. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  111. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  112. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  113. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  114. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  115. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  116. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  117. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  118. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  119. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  120. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  121. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  122. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  123. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  124. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  125. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  126. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  127. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  128. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  129. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  130. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  131. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  132. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  133. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  134. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  135. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  136. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  137. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  138. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  139. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  140. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  141. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  142. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  143. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  144. helm/benchmark/scenarios/ice_scenario.py +21 -1
  145. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  146. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  147. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  148. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  149. helm/benchmark/scenarios/koala_scenario.py +21 -1
  150. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  151. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  152. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  153. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  154. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  155. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  156. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  157. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  158. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  159. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  160. helm/benchmark/scenarios/math_scenario.py +47 -20
  161. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  162. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  163. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  164. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  165. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  166. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  167. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  168. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  169. helm/benchmark/scenarios/medec_scenario.py +23 -0
  170. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  171. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  172. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  173. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  174. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  175. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  176. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  177. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  178. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  179. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  180. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  181. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  182. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  183. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  184. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  185. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  186. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  187. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  188. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  189. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  190. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  191. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  192. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  193. helm/benchmark/scenarios/quac_scenario.py +14 -0
  194. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  195. helm/benchmark/scenarios/raft_scenario.py +15 -0
  196. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  197. helm/benchmark/scenarios/scenario.py +31 -0
  198. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  199. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  200. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  201. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  202. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  203. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  204. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  205. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  206. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  207. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  208. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  209. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  210. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  211. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  212. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  213. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  214. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  215. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  216. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  217. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  218. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  219. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  220. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  221. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  222. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  223. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  224. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  225. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  226. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  227. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  228. helm/benchmark/slurm_jobs.py +1 -2
  229. helm/benchmark/slurm_runner.py +8 -1
  230. helm/benchmark/static/schema_arabic.yaml +271 -0
  231. helm/benchmark/static/schema_classic.yaml +0 -17
  232. helm/benchmark/static/schema_long_context.yaml +24 -6
  233. helm/benchmark/static/schema_medhelm.yaml +36 -0
  234. helm/benchmark/static/schema_slp.yaml +219 -0
  235. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  236. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  237. helm/benchmark/static_build/index.html +2 -2
  238. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  239. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  240. helm/clients/audio_language/llama_omni/constants.py +9 -0
  241. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  242. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  243. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  244. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  245. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  246. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  247. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  248. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  249. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  250. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  251. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  252. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  253. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  254. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  255. helm/clients/audio_language/llama_omni/utils.py +202 -0
  256. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  257. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  258. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  259. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  260. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  261. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  262. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  263. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  264. helm/clients/huggingface_client.py +2 -2
  265. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  266. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  267. helm/clients/openai_client.py +33 -20
  268. helm/clients/openai_responses_client.py +34 -8
  269. helm/clients/openrouter_client.py +31 -0
  270. helm/clients/test_huggingface_client.py +3 -3
  271. helm/clients/test_openrouter_client.py +69 -0
  272. helm/clients/together_client.py +48 -13
  273. helm/clients/vertexai_client.py +19 -11
  274. helm/clients/vllm_client.py +43 -7
  275. helm/clients/vllm_granite_thinking_client.py +56 -0
  276. helm/common/critique_request.py +0 -1
  277. helm/common/hierarchical_logger.py +83 -34
  278. helm/common/object_spec.py +23 -8
  279. helm/common/test_logging.py +94 -0
  280. helm/config/model_deployments.yaml +525 -172
  281. helm/config/model_metadata.yaml +185 -10
  282. helm/config/tokenizer_configs.yaml +100 -2
  283. helm/proxy/cli.py +1 -1
  284. helm/proxy/example_queries.py +8 -8
  285. helm/proxy/retry.py +5 -0
  286. helm/proxy/server.py +2 -1
  287. helm/proxy/static/index.css +4 -0
  288. helm/proxy/static/index.js +7 -1
  289. helm/tokenizers/grok_tokenizer.py +2 -0
  290. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  291. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  292. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  293. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  294. helm/benchmark/metrics/medalign_metrics.py +0 -14
  295. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  296. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  297. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  298. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  299. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  300. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  301. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  302. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  303. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  304. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  305. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  306. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  307. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  308. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  309. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  310. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  311. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/config/model_metadata.yaml CHANGED
@@ -1253,6 +1253,14 @@ models:
    release_date: 2025-06-17
    tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, AUDIO_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
+ - name: google/gemini-2.5-flash-lite
+   display_name: Gemini 2.5 Flash-Lite
+   description: Gemini 2.5 Flash-Lite ([blog](https://blog.google/products/gemini/gemini-2-5-model-family-expands/))
+   creator_organization_name: Google
+   access: limited
+   release_date: 2025-07-22
+   tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, AUDIO_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
  - name: google/gemini-2.5-flash-preview-04-17
    display_name: Gemini 2.5 Flash (04-17 preview)
    description: Gemini 2.5 Flash (04-17 preview) ([documentation](https://ai.google.dev/gemini-api/docs/models/gemini))
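The new entries in this file all follow the same schema as the existing models. A quick way to sanity-check a hand-written entry before running HELM is to load it with PyYAML and verify the fields; this is an illustrative sketch only, with the required-field list read off the entries in this diff rather than from HELM's own validation code:

```python
# Hedged sketch: validate a model_metadata.yaml entry like the ones added in
# this diff. Assumes only PyYAML; the field list mirrors the entries above.
import yaml

ENTRY = """
- name: google/gemini-2.5-flash-lite
  display_name: Gemini 2.5 Flash-Lite
  creator_organization_name: Google
  access: limited
  release_date: 2025-07-22
  tags: [TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
"""

REQUIRED = {"name", "display_name", "creator_organization_name", "access", "release_date", "tags"}

for model in yaml.safe_load(ENTRY):
    missing = REQUIRED - model.keys()
    if missing:
        raise ValueError(f"{model.get('name', '?')} is missing {sorted(missing)}")
    # "open" and "limited" are the access values used throughout this diff.
    assert model["access"] in {"open", "limited", "closed"}
print("entry looks well-formed")
```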
@@ -2624,6 +2632,15 @@ models:
    release_date: 2024-11-18
    tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
+ # Moonshot AI
+ - name: moonshotai/kimi-k2-instruct
+   display_name: Kimi K2 Instruct
+   description: Kimi K2 Instruct is a mixture-of-experts (MoE) language model with 32 billion activated parameters and 1 trillion total parameters trained with the Muon optimizer on 15.5T tokens. ([blog](https://moonshotai.github.io/Kimi-K2/))
+   creator_organization_name: Moonshot AI
+   access: open
+   num_parameters: 1029173256720
+   release_date: 2025-07-14 # Blog post has no date, so use the date from this news article: https://www.cnbc.com/2025/07/14/alibaba-backed-moonshot-releases-kimi-k2-ai-rivaling-chatgpt-claude.html
+   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
  # MosaicML
  - name: mosaicml/mpt-7b
@@ -3043,6 +3060,30 @@ models:
    release_date: 2025-04-14
    tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
+ - name: openai/gpt-5-2025-08-07
+   display_name: GPT-5 (2025-08-07)
+   description: GPT-5 (2025-08-07) is a multimodal model trained for real-world coding tasks and long-running agentic tasks. ([blog](https://openai.com/index/introducing-gpt-5-for-developers/), [system card](https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf))
+   creator_organization_name: OpenAI
+   access: limited
+   release_date: 2025-08-07
+   tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ - name: openai/gpt-5-mini-2025-08-07
+   display_name: GPT-5 mini (2025-08-07)
+   description: GPT-5 mini (2025-08-07) is a multimodal model trained for real-world coding tasks and long-running agentic tasks. ([blog](https://openai.com/index/introducing-gpt-5-for-developers/), [system card](https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf))
+   creator_organization_name: OpenAI
+   access: limited
+   release_date: 2025-08-07
+   tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ - name: openai/gpt-5-nano-2025-08-07
+   display_name: GPT-5 nano (2025-08-07)
+   description: GPT-5 nano (2025-08-07) is a multimodal model trained for real-world coding tasks and long-running agentic tasks. ([blog](https://openai.com/index/introducing-gpt-5-for-developers/), [system card](https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf))
+   creator_organization_name: OpenAI
+   access: limited
+   release_date: 2025-08-07
+   tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
  - name: openai/whisper-1_gpt-4o-2024-11-20
    display_name: Whisper-1 + GPT-4o (2024-11-20)
    description: Transcribes the text with Whisper-1 and then uses GPT-4o to generate a response.
@@ -3256,6 +3297,31 @@ models:
    release_date: 2025-04-16
    tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
+ - name: openai/o3-pro-2025-06-10-high-reasoning-effort
+   display_name: o3-pro (2025-06-10, high reasoning effort)
+   description: o3-pro is an o-series model designed to think longer and provide the most reliable responses. ([blog post](https://help.openai.com/en/articles/9624314-model-release-notes))
+   creator_organization_name: OpenAI
+   access: limited
+   release_date: 2025-06-10
+   tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ ## GPT-OSS
+ - name: openai/gpt-oss-20b
+   display_name: gpt-oss-20b
+   description: gpt-oss-20b is an open-weight language model that was trained using a mix of reinforcement learning and other techniques informed by OpenAI's internal models. It uses a mixture-of-experts architecture and activates 3.6B parameters per token. ([blog](https://openai.com/index/introducing-gpt-oss/))
+   creator_organization_name: OpenAI
+   access: open
+   release_date: 2025-08-05
+   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ - name: openai/gpt-oss-120b
+   display_name: gpt-oss-120b
+   description: gpt-oss-120b is an open-weight language model that was trained using a mix of reinforcement learning and other techniques informed by OpenAI's internal models. It uses a mixture-of-experts architecture and activates 5.1B parameters per token. ([blog](https://openai.com/index/introducing-gpt-oss/))
+   creator_organization_name: OpenAI
+   access: open
+   release_date: 2025-08-05
+   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
  ## Codex Models
  # DEPRECATED: Codex models have been shut down on March 23 2023.
 
@@ -3532,6 +3598,14 @@ models:
    release_date: 2025-04-29
    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
+ - name: qwen/qwen3-235b-a22b-instruct-2507-fp8
+   display_name: Qwen3 235B A22B Instruct 2507 FP8
+   description: Qwen3 235B A22B Instruct 2507 FP8 is an updated version of the non-thinking mode of Qwen3 235B A22B FP8.
+   creator_organization_name: Qwen
+   access: open
+   release_date: 2025-07-21 # https://x.com/Alibaba_Qwen/status/1947344511988076547
+   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
  - name: qwen/qwq-32b-preview
    display_name: QwQ (32B Preview)
    description: QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities. ([blog post](https://qwenlm.github.io/blog/qwq-32b-preview/)).
@@ -4163,6 +4237,14 @@ models:
    release_date: 2025-04-03 # https://docs.x.ai/docs/release-notes#april-2025
    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
+ - name: xai/grok-4-0709
+   display_name: Grok 4 (0709)
+   description: Grok 4 (0709) is a model that includes native tool use and real-time search integration. ([blog](https://x.ai/news/grok-4))
+   creator_organization_name: xAI
+   access: limited
+   release_date: 2025-07-09
+   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
  # Yandex
  - name: yandex/yalm
    display_name: YaLM (100B)
@@ -4266,6 +4348,42 @@ models:
    release_date: 2023-11-08
    tags: [TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
+ - name: maritaca-ai/sabiazinho-3
+   display_name: Sabiazinho 3
+   description: Sabiazinho-3 is a decoder-only language model designed for Portuguese text generation and understanding tasks. It supports a long context window of up to 128,000 tokens and is offered via API with scalable rate limits. The model is trained on diverse Portuguese corpora with knowledge up to July 2023.
+   creator_organization_name: Maritaca AI
+   access: limited
+   release_date: 2025-02-06
+   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ - name: maritaca-ai/sabia-3
+   display_name: Sabiá 3
+   description: Sabiá-3 is a decoder-only language model designed for Portuguese text generation and understanding tasks. It supports a long context window of up to 128,000 tokens and is offered via API with scalable rate limits. The model is trained on diverse Portuguese corpora with knowledge up to July 2023.
+   creator_organization_name: Maritaca AI
+   access: limited
+   release_date: 2024-12-11
+   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ - name: maritaca-ai/sabia-3.1-2025-05-08
+   display_name: Sabiá 3.1
+   description: Sabiá-3.1 is a decoder-only language model designed for Portuguese text generation and understanding tasks. It supports a long context window of up to 128,000 tokens and is offered via API with scalable rate limits. The model is trained on diverse Portuguese corpora with knowledge up to August 2024.
+   creator_organization_name: Maritaca AI
+   access: limited
+   release_date: 2025-05-08
+   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ # Z.ai
+
+ - name: zai-org/glm-4.5-air-fp8
+   display_name: GLM-4.5-Air-FP8
+   description: GLM-4.5-Air-FP8 is a hybrid reasoning model designed to unify reasoning, coding, and agentic capabilities into a single model. It has 106 billion total parameters and 12 billion active parameters. The thinking mode is enabled by default. ([blog](https://z.ai/blog/glm-4.5))
+   creator_organization_name: Z.ai
+   access: open
+   num_parameters: 110000000000
+   release_date: 2025-07-28
+   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+
  # Granite - IBM
  # https://www.ibm.com/granite
  # https://github.com/ibm-granite/granite-3.0-language-models
@@ -4479,21 +4597,23 @@ models:
    tags: [ TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG ]
 
  - name: ibm/granite-3.3-8b-instruct
-   display_name: Granite 3.3 8B Instruct
-   description: Granite 3.3 8B Instruct is a 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
+   display_name: IBM Granite 3.3 8B Instruct
+   description: IBM Granite 3.3 8B Instruct is an 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
    creator_organization_name: IBM
    access: open
    num_parameters: 8170000000
    release_date: 2025-04-16
    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
- - name: mistralai/mixtral-8x7b-instruct-v0:1
-   display_name: Mixtral 8x7B Instruct on IBM WatsonX
-   description: A 7B sparse Mixture-of-Experts model with stronger capabilities than Mistral 7B. Uses 12B active parameters out of 45B total. Supports multiple languages, code and 32k context window.
-   creator_organization_name: Mistral
-   access: limited
-   release_date: 2023-12-11
-   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+ - name: ibm/granite-3.3-8b-instruct-with-guardian
+   display_name: IBM Granite 3.3 8B Instruct (with guardian)
+   description: IBM Granite 3.3 8B Instruct is an 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. All prompts were first evaluated for risk by [IBM Granite Guardian 3.2 5B](https://www.ibm.com/granite/docs/models/guardian/) and prompts that were deemed risky (with a risk threshold of 0.8) received the response "I'm very sorry, but I can't assist with that.". ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
+   creator_organization_name: IBM
+   access: open
+   num_parameters: 8170000000
+   release_date: 2025-04-16
+   # Unfortunately this setup is not easily reproducible, so we mark it with DEPRECATED_MODEL_TAG
+   tags: [DEPRECATED_MODEL_TAG, TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
  - name: ura-hcmut/ura-llama-2.1-8b
    display_name: URA-Llama 2.1 (8B)
@@ -4682,4 +4802,59 @@ models:
    access: open
    num_parameters: 4000000000
    release_date: 2024-04-02
-   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ - name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
+   display_name: Gemma-3 Gaia PT-BR 4b Instruct
+   description: Gemma-3 Gaia PT-BR 4b Instruct is a model trained by CEIA-UFG for understanding and generating Brazilian Portuguese text.
+   creator_organization_name: CEIA-UFG
+   access: open
+   num_parameters: 4000000000
+   release_date: 2025-06-01
+   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ - name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
+   display_name: Bode 13B Alpaca PT-BR
+   description: Bode is a language model (LLM) for Portuguese, based on LLaMA 2 and fine-tuned with the Alpaca dataset translated into Portuguese. Suitable for instruction following, text generation, and translation tasks in Portuguese.
+   creator_organization_name: Recogna NLP
+   access: open
+   num_parameters: 13000000000
+   release_date: 2024-01-05
+   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ - name: 22h/cabrita_7b_pt_850000
+   display_name: Cabrita PT-BR 7B
+   description: Cabrita is an OpenLLaMA-based model, continuously trained in Portuguese (mC4-pt subset) for 850000 steps with efficient tokenization adapted to the language.
+   creator_organization_name: 22h
+   access: open
+   num_parameters: 7000000000
+   release_date: 2023-08-23
+   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ - name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
+   display_name: Gervásio PT-BR/PT-PT 7B Decoder
+   description: Gervásio PT* is a 7B parameter decoder model, adapted from LLaMA 2 7B, trained for both Brazilian and European Portuguese. Fine-tuned with translated data from benchmarks such as GLUE and SuperGLUE.
+   creator_organization_name: PORTULAN (University of Lisbon NLX)
+   access: open
+   num_parameters: 6740000000
+   release_date: 2024-02-29
+   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ - name: TucanoBR/Tucano-2b4
+   display_name: Tucano PT-BR 2b4
+   description: Tucano is a series of decoder models based on LLaMA 2, natively pre-trained in Portuguese using the GigaVerbo dataset (200B tokens), with the 2B model trained for 1.96M steps over 845h (515B tokens, 4 epochs).
+   creator_organization_name: TucanoBR (University of Bonn)
+   access: open
+   num_parameters: 2444618240
+   release_date: 2024-12-11
+   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ - name: nicholasKluge/TeenyTinyLlama-460m
+   display_name: TeenyTinyLlama 460M PT-BR
+   description: TeenyTinyLlama-460m is a lightweight and efficient model based on LLaMA 2, trained exclusively on Brazilian Portuguese. It uses RoPE embeddings and SwiGLU activations, with a refined SentencePiece tokenizer and a low-resource optimized architecture.
+   creator_organization_name: Nicholas Kluge
+   access: open
+   num_parameters: 460000000
+   release_date: 2024-01-30
+   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
helm/config/tokenizer_configs.yaml CHANGED
@@ -265,6 +265,12 @@ tokenizer_configs:
    end_of_text_token: ""
    prefix_token: ""
 
+ - name: xai/grok-4-0709
+   tokenizer_spec:
+     class_name: "helm.tokenizers.grok_tokenizer.GrokAPITokenizer"
+   end_of_text_token: ""
+   prefix_token: ""
+
  # Hf-internal-testing
 
  # Tokenizer name hf-internal-testing/llama-tokenizer is taken from:
@@ -582,6 +588,17 @@ tokenizer_configs:
    end_of_text_token: "</s>"
    prefix_token: "<s>"
 
+ # Moonshot AI
+ - name: moonshotai/kimi-k2-instruct
+   tokenizer_spec:
+     class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+     args:
+       pretrained_model_name_or_path: moonshotai/Kimi-K2-Instruct
+       trust_remote_code: true
+       revision: 4f239503ad9d1a042f0a4bacac457931ab972cfc
+   end_of_text_token: "[EOS]"
+   prefix_token: "[BOS]"
+
  # Nectec
  - name: nectec/OpenThaiLLM-Prebuilt-7B
    tokenizer_spec:
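Each tokenizer_configs.yaml entry maps a tokenizer name to a class_name plus constructor args. For HuggingFaceTokenizer-backed entries like the Kimi K2 one above, the args line up with transformers.AutoTokenizer.from_pretrained; the sketch below shows that mapping directly (it is not HELM's actual loading code, which adds caching and its own interface):

```python
# Illustrative only: how the Kimi K2 tokenizer_spec args above map onto
# transformers.AutoTokenizer.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "moonshotai/Kimi-K2-Instruct",
    trust_remote_code=True,  # the repo ships custom tokenizer code
    revision="4f239503ad9d1a042f0a4bacac457931ab972cfc",  # pinned for reproducibility
)
print(tokenizer.tokenize("Hello, world!"))
```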
@@ -633,6 +650,12 @@ tokenizer_configs:
    end_of_text_token: "<|endoftext|>"
    prefix_token: "<|endoftext|>"
 
+ - name: openai/o200k_harmony
+   tokenizer_spec:
+     class_name: "helm.tokenizers.tiktoken_tokenizer.TiktokenTokenizer"
+   end_of_text_token: "<|endoftext|>"
+   prefix_token: "<|startoftext|>"
+
  - name: openai/clip-vit-large-patch14
    tokenizer_spec:
      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -688,6 +711,12 @@ tokenizer_configs:
    end_of_text_token: "<|im_end|>"
    prefix_token: "<|im_start|>"
 
+ - name: qwen/qwen3-235b-a22b-instruct-2507-fp8
+   tokenizer_spec:
+     class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+   end_of_text_token: "<|im_end|>"
+   prefix_token: ""
+
  - name: qwen/qwq-32b-preview
    tokenizer_spec:
      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -892,6 +921,7 @@ tokenizer_configs:
    end_of_text_token: ""
    prefix_token: ""
 
+ # Maritaca AI
  - name: maritaca-ai/sabia-7b
    tokenizer_spec:
      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -900,6 +930,14 @@ tokenizer_configs:
    end_of_text_token: "</s>"
    prefix_token: "<s>"
 
+ - name: maritaca-ai/sabia-2-tokenizer-medium
+   tokenizer_spec:
+     class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+     args:
+       pretrained_model_name_or_path: maritaca-ai/sabia-2-tokenizer-medium
+   end_of_text_token: "</s>"
+   prefix_token: "<s>"
+
  # Granite-3.1-8b-base
  - name: ibm-granite/granite-3.1-8b-base
    tokenizer_spec:
@@ -1022,7 +1060,6 @@ tokenizer_configs:
    end_of_text_token: ""
 
  # IBM Granite 3.3
-
  - name: ibm/granite-3.3-8b-instruct
    tokenizer_spec:
      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -1031,6 +1068,13 @@ tokenizer_configs:
    end_of_text_token: "<|end_of_text|>"
    prefix_token: "<|end_of_text|>"
 
+ # Z.ai GLM-4.5-AIR-FP8
+ - name: zai-org/glm-4.5-air-fp8
+   tokenizer_spec:
+     class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+   end_of_text_token: "<|endoftext|>"
+   prefix_token: ""
+
 
 
  # DeepSeek-R1-Distill-Llama-3.1-8b
@@ -1104,4 +1148,58 @@ tokenizer_configs:
      args:
        pretrained_model_name_or_path: vinai/PhoGPT-4B-Chat
    end_of_text_token: "</s>"
-   prefix_token: "<s>"
+   prefix_token: "<s>"
+
+ # Gemma-3-Gaia-PT-BR-4b-it
+ - name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
+   tokenizer_spec:
+     class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+     args:
+       pretrained_model_name_or_path: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
+   end_of_text_token: "<eos>"
+   prefix_token: "<bos>"
+
+ # Bode 13B Alpaca PT-BR
+ - name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
+   tokenizer_spec:
+     class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+     args:
+       pretrained_model_name_or_path: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
+   end_of_text_token: "</s>"
+   prefix_token: "<s>"
+
+ # Cabrita 7B PT-BR tokenizer
+ - name: 22h/cabrita_7b_pt_850000
+   tokenizer_spec:
+     class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+     args:
+       pretrained_model_name_or_path: 22h/cabrita_7b_pt_850000
+   end_of_text_token: "</s>"
+   prefix_token: "<s>"
+
+ # Gervásio 7B PT-BR/PT-PT tokenizer
+ - name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
+   tokenizer_spec:
+     class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+     args:
+       pretrained_model_name_or_path: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
+   end_of_text_token: "</s>"
+   prefix_token: "<s>"
+
+ # Tucano 2b4 PT-BR tokenizer
+ - name: TucanoBR/Tucano-2b4
+   tokenizer_spec:
+     class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+     args:
+       pretrained_model_name_or_path: TucanoBR/Tucano-2b4
+   end_of_text_token: "</s>"
+   prefix_token: "<s>"
+
+ # TeenyTinyLlama 460M PT-BR tokenizer
+ - name: nicholasKluge/TeenyTinyLlama-460m
+   tokenizer_spec:
+     class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+     args:
+       pretrained_model_name_or_path: nicholasKluge/TeenyTinyLlama-460m
+   end_of_text_token: "</s>"
+   prefix_token: "<s>"
helm/proxy/cli.py CHANGED
@@ -123,7 +123,7 @@ def do_create_update_command(service: RemoteService, auth: Authentication, args)
 
      # Update quotas
      for quota_str in args.quotas:
-         m = re.match(f"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})", quota_str)
+         m = re.match(rf"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})", quota_str)
          if not m:
              raise Exception(
                  f"Invalid format: {quota_str}, expect <model_group>.<granularity>=<quota> "
helm/proxy/example_queries.py CHANGED
@@ -21,7 +21,7 @@ example_queries = [
          """
          temperature: 0.5 # Medium amount of randomness
          stop_sequences: [.] # Stop when you hit a period
-         model: openai/gpt-3.5-turbo-0613
+         model: openai/gpt-4.1-nano-2025-04-14
          """
      ),
      environments="",
@@ -33,7 +33,7 @@ example_queries = [
          temperature: 0.5 # Medium amount of randomness
          stop_sequences: [\\n] # Stop when you hit a newline
          num_completions: 5 # Generate many samples
-         model: openai/gpt-3.5-turbo-0613
+         model: openai/gpt-4.1-nano-2025-04-14
          """
      ),
      environments="",
@@ -58,7 +58,7 @@ example_queries = [
          """
          temperature: 0 # Deterministic
          max_tokens: 50
-         model: openai/gpt-3.5-turbo-0613
+         model: openai/gpt-4.1-nano-2025-04-14
          """
      ),
      environments="",
@@ -76,7 +76,7 @@ example_queries = [
      environments=dedent(
          """
          occupation: [mathematician, lawyer, doctor]
-         model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
+         model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
          """
      ),
  ),
@@ -101,7 +101,7 @@ example_queries = [
      ),
      environments=dedent(
          """
-         model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
+         model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
          """
      ),
  ),
@@ -136,7 +136,7 @@ example_queries = [
      ),
      environments=dedent(
          """
-         model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
+         model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
          """
      ),
  ),
@@ -144,7 +144,7 @@ example_queries = [
      prompt="Write a Python function that takes two vectors a and b and returns their Euclidean distance.",
      settings=dedent(
          """
-         model: openai/gpt-3.5-turbo-0613
+         model: openai/gpt-4.1-nano-2025-04-14
          """
      ),
      environments="",
@@ -161,7 +161,7 @@ example_queries = [
      ),
      environments=dedent(
          """
-         model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
+         model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
          """
      ),
  ),
helm/proxy/retry.py CHANGED
@@ -5,6 +5,7 @@ from retrying import Retrying
  from helm.common.request import RequestResult
  from helm.common.tokenization_request import TokenizationRequestResult
  from helm.common.hierarchical_logger import hlog
+ import os
  import traceback
  import threading
 
@@ -19,6 +20,10 @@
      ...
  """
 
+ # TODO: make these configurable at a config / cli level
+ HELM_RETRIES = int(os.environ.get("HELM_RETRIES", "5"))
+ HELM_TOKENIZER_RETRIES = int(os.environ.get("HELM_TOKENIZER_RETRIES", HELM_RETRIES))
+
  # The lock is used to prevent multiple threads from printing at the same time.
  # This can cause issues when printing the stack trace.
  # (The stack traces can get mixed up and become unreadable.)
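With this change the retry budgets become environment-configurable instead of hard-coded, with the tokenizer count defaulting to the general one. The sketch below reproduces the pattern with the same retrying library that retry.py imports; flaky_request is a stand-in for HELM's request calls, not HELM code:

```python
# Same pattern as the retry.py change: read retry budgets from the
# environment, then drive a retrying.Retrying instance with them.
import os
import random

from retrying import Retrying

HELM_RETRIES = int(os.environ.get("HELM_RETRIES", "5"))
HELM_TOKENIZER_RETRIES = int(os.environ.get("HELM_TOKENIZER_RETRIES", HELM_RETRIES))


def flaky_request() -> str:
    if random.random() < 0.3:
        raise ConnectionError("transient failure")
    return "ok"


retryer = Retrying(
    stop_max_attempt_number=HELM_RETRIES,
    wait_exponential_multiplier=1000,  # back off 1s, 2s, 4s, ... between attempts
)
# Re-raises the last exception if all HELM_RETRIES attempts fail.
print(retryer.call(flaky_request))
```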
helm/proxy/server.py CHANGED
@@ -23,7 +23,7 @@ from helm.benchmark.model_deployment_registry import get_default_model_deploymen
  from helm.common.authentication import Authentication
  from helm.common.cache_backend_config import CacheBackendConfig, MongoCacheBackendConfig, SqliteCacheBackendConfig
  from helm.common.general import ensure_directory_exists
- from helm.common.hierarchical_logger import hlog
+ from helm.common.hierarchical_logger import hlog, setup_default_logging
  from helm.common.optional_dependencies import handle_module_not_found_error
  from helm.common.request import Request
  from helm.common.perspective_api_request import PerspectiveAPIRequest
@@ -273,6 +273,7 @@ def main():
          default="",
      )
      args = parser.parse_args()
+     setup_default_logging()
 
      register_builtin_configs_from_helm_package()
      register_configs_from_directory(args.base_path)
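server.py now installs logging before registering configs, so startup messages are captured. The diff only shows the call site; as a rough stand-in, a setup_default_logging()-style helper usually just configures the root logger once, early in main(). This sketch uses the standard logging module to illustrate the ordering and is an assumption, not HELM's actual implementation:

```python
# Hypothetical stand-in for helm.common.hierarchical_logger.setup_default_logging;
# only the call site appears in this diff.
import logging


def setup_default_logging(level: int = logging.INFO) -> None:
    logging.basicConfig(level=level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")


def main() -> None:
    setup_default_logging()  # install handlers first...
    logging.getLogger(__name__).info("registering configs")  # ...so this line is captured


if __name__ == "__main__":
    main()
```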
helm/proxy/static/index.css CHANGED
@@ -35,6 +35,10 @@
      font-style: italic;
  }
 
+ .thinking {
+     font-style: italic;
+ }
+
  .token:hover {
      background-color: lightgreen;
  }
helm/proxy/static/index.js CHANGED
@@ -282,7 +282,13 @@ $(function () {
      requestResult.completions.forEach((completion) => {
        const $contents = $("<span>", {
          title: `logprob: ${completion.logprob}`,
-       }).append(renderTokens(completion.tokens));
+       });
+       if (completion.thinking) {
+         const $thinking = $("<span>", { class: "thinking" }).append(completion.thinking.text);
+         $contents.append($thinking);
+       }
+       const $resultText = completion.tokens.length > 0 ? renderTokens(completion.tokens) : $("<div>").append(completion.text);
+       $contents.append($resultText);
        const $metadata = $("<span>", { class: "metadata" });
        $metadata.append(
          $("<span>", { title: "Log probability" }).append(
helm/tokenizers/grok_tokenizer.py CHANGED
@@ -34,6 +34,8 @@ class GrokAPITokenizer(CachingTokenizer):
              "Set grokApiKey in credentials.conf or set the GROK_API_KEY environment variable"
          )
      text = request["text"]
+     if not text:
+         return {"token_ids": []}
      model = request["tokenizer"].split("/")[-1]
      response = requests.post(
          url="https://api.x.ai/v1/tokenize-text",
helm/benchmark/metrics/aci_bench_metrics.py DELETED
@@ -1,14 +0,0 @@
- from helm.benchmark.annotation.aci_bench_annotator import ANNOTATOR_MODELS
- from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
- class ACIBenchMetric(LLMJuryMetric):
-     """Score metrics for ACIBench."""
-
-     def __init__(self):
-         super().__init__(
-             metric_name="aci_bench_accuracy",
-             scenario_name="aci_bench",
-             annotator_models=ANNOTATOR_MODELS,
-             default_score=1.0,
-         )
helm/benchmark/metrics/chw_care_plan_metrics.py DELETED
@@ -1,14 +0,0 @@
- from helm.benchmark.annotation.chw_care_plan_annotator import ANNOTATOR_MODELS
- from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
- class CHWCarePlanMetric(LLMJuryMetric):
-     """Score metrics for CHWCarePlan."""
-
-     def __init__(self):
-         super().__init__(
-             metric_name="chw_care_plan_accuracy",
-             scenario_name="chw_care_plan",
-             annotator_models=ANNOTATOR_MODELS,
-             default_score=1.0,
-         )
helm/benchmark/metrics/dischargeme_metrics.py DELETED
@@ -1,14 +0,0 @@
- from helm.benchmark.annotation.dischargeme_annotator import ANNOTATOR_MODELS
- from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
- class DischargeMeMetric(LLMJuryMetric):
-     """Score metrics for DischargeMe."""
-
-     def __init__(self):
-         super().__init__(
-             metric_name="dischargeme_accuracy",
-             scenario_name="dischargeme",
-             annotator_models=ANNOTATOR_MODELS,
-             default_score=1.0,
-         )
helm/benchmark/metrics/med_dialog_metrics.py DELETED
@@ -1,14 +0,0 @@
- from helm.benchmark.annotation.med_dialog_annotator import ANNOTATOR_MODELS
- from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
- class MedDialogMetric(LLMJuryMetric):
-     """Score metrics for MedDialog."""
-
-     def __init__(self):
-         super().__init__(
-             metric_name="med_dialog_accuracy",
-             scenario_name="med_dialog",
-             annotator_models=ANNOTATOR_MODELS,
-             default_score=1.0,
-         )
helm/benchmark/metrics/medalign_metrics.py DELETED
@@ -1,14 +0,0 @@
- from helm.benchmark.annotation.medalign_annotator import ANNOTATOR_MODELS
- from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
- class MedalignMetric(LLMJuryMetric):
-     """Score metrics for Medalign."""
-
-     def __init__(self):
-         super().__init__(
-             metric_name="medalign_accuracy",
-             scenario_name="medalign",
-             annotator_models=ANNOTATOR_MODELS,
-             default_score=1.0,
-         )
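The deleted metric files shown above (and the other +0 -14 deletions in the file list) all had the same shape: a subclass of LLMJuryMetric that only froze four constructor arguments. Judging from the constructors visible in the deleted code, an equivalent metric can be built by instantiating LLMJuryMetric directly; whether 0.5.8 wires this up exactly this way (e.g., through metric specs) is not shown in this diff:

```python
# The deleted classes only bound constants, e.g. ACIBenchMetric. Based on the
# constructor visible in the deleted code, the same metric can be constructed
# directly; that 0.5.8 does exactly this is an assumption.
from helm.benchmark.annotation.aci_bench_annotator import ANNOTATOR_MODELS
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric

aci_bench_metric = LLMJuryMetric(
    metric_name="aci_bench_accuracy",
    scenario_name="aci_bench",
    annotator_models=ANNOTATOR_MODELS,
    default_score=1.0,
)
```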