crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (243)
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/classification_metrics.py +19 -1
  25. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  26. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  27. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  28. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  29. helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
  30. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  31. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  32. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  33. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  34. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  35. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  36. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  37. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  38. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  39. helm/benchmark/metrics/medec_metrics.py +25 -2
  40. helm/benchmark/metrics/metric.py +25 -0
  41. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  42. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  43. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  44. helm/benchmark/metrics/summac/model_summac.py +2 -2
  45. helm/benchmark/metrics/summarization_metrics.py +129 -1
  46. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  47. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  48. helm/benchmark/presentation/schema.py +5 -22
  49. helm/benchmark/presentation/summarize.py +180 -11
  50. helm/benchmark/presentation/taxonomy_info.py +20 -0
  51. helm/benchmark/run_expander.py +4 -0
  52. helm/benchmark/run_specs/arabic_run_specs.py +134 -16
  53. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  54. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  55. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  56. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  57. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  58. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  59. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  60. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  61. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  62. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  63. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  64. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  65. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  66. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  67. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  68. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  69. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  70. helm/benchmark/scenarios/bold_scenario.py +15 -0
  71. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  72. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  73. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  74. helm/benchmark/scenarios/clear_scenario.py +23 -0
  75. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  76. helm/benchmark/scenarios/code_scenario.py +28 -0
  77. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  78. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  79. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  80. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  81. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  82. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  83. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  84. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  85. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  86. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  87. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  88. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  89. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  90. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  91. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  92. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  93. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  94. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  95. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  96. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  97. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  98. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  99. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  100. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  101. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  102. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  103. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  104. helm/benchmark/scenarios/ice_scenario.py +21 -1
  105. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  106. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  107. helm/benchmark/scenarios/koala_scenario.py +21 -1
  108. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  109. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  110. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  111. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  112. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  113. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  114. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  115. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  116. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  117. helm/benchmark/scenarios/math_scenario.py +26 -0
  118. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  119. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  120. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  121. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  122. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  123. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  124. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  125. helm/benchmark/scenarios/medec_scenario.py +23 -0
  126. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  127. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  128. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  129. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  130. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  131. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  132. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  133. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  134. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  135. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  136. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  137. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  138. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  139. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  140. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  141. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  142. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  143. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  144. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  145. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  146. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  147. helm/benchmark/scenarios/quac_scenario.py +14 -0
  148. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  149. helm/benchmark/scenarios/raft_scenario.py +15 -0
  150. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  151. helm/benchmark/scenarios/scenario.py +31 -0
  152. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  153. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  154. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  155. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  156. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  157. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  158. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  159. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  160. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  161. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  162. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  163. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  164. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  165. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  166. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  167. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  168. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  169. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  170. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  171. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  172. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  173. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  175. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  176. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  177. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  178. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  179. helm/benchmark/static/schema_arabic.yaml +55 -12
  180. helm/benchmark/static/schema_long_context.yaml +17 -17
  181. helm/benchmark/static/schema_medhelm.yaml +36 -0
  182. helm/benchmark/static/schema_slp.yaml +219 -0
  183. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  184. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  185. helm/benchmark/static_build/index.html +2 -2
  186. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  187. helm/clients/audio_language/llama_omni/constants.py +9 -0
  188. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  189. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  190. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  191. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  192. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  193. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  194. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  195. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  196. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  197. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  198. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  199. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  200. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  201. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  202. helm/clients/audio_language/llama_omni/utils.py +202 -0
  203. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  204. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  205. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  206. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  207. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  208. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  209. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  210. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  211. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  212. helm/clients/openai_client.py +31 -19
  213. helm/clients/openai_responses_client.py +27 -3
  214. helm/clients/openrouter_client.py +31 -0
  215. helm/clients/test_openrouter_client.py +69 -0
  216. helm/clients/together_client.py +48 -11
  217. helm/clients/vertexai_client.py +8 -2
  218. helm/config/model_deployments.yaml +75 -1
  219. helm/config/model_metadata.yaml +70 -2
  220. helm/config/tokenizer_configs.yaml +19 -1
  221. helm/proxy/example_queries.py +8 -8
  222. helm/proxy/server.py +2 -1
  223. helm/proxy/static/index.css +4 -0
  224. helm/proxy/static/index.js +7 -1
  225. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  226. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  227. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  228. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  229. helm/benchmark/metrics/medalign_metrics.py +0 -14
  230. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  231. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  232. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  233. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  234. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  235. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  236. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  237. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  238. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  239. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  240. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  241. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  242. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  243. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/clients/together_client.py CHANGED
@@ -99,7 +99,7 @@ class JobNotFinishedError(TogetherClientError):
     pass


-def _parse_thinking(input: str) -> Tuple[str, str]:
+def _parse_thinking_deepseek_r1(input: str) -> Tuple[str, str]:
     """Return a tuple of thinking text and output text."""
     match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
     if match:
@@ -112,6 +112,44 @@ def _parse_thinking(input: str) -> Tuple[str, str]:
     return (input, "")


+def _parse_thinking_qwen3(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking_glm_4_5(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"\n<think>(.*)</think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"\n<think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking(input: str, model_name: str) -> Tuple[str, str]:
+    # TODO: Come up with a more sustainable extensible way of doing this.
+    if "deepseek-r1" in model_name:
+        return _parse_thinking_deepseek_r1(input)
+    elif "qwen3" in model_name:
+        return _parse_thinking_qwen3(input)
+    elif "glm-4.5" in model_name:
+        return _parse_thinking_glm_4_5(input)
+    else:
+        raise Exception(f"No thinking parser available for model {model_name}")
+
+
 class TogetherClient(CachingClient):
     """
     Client for the models where we evaluate offline. Since the queries are handled offline, the `TogetherClient` just
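
The per-model helpers differ only in how much whitespace they expect around the <think> tags, and the new _parse_thinking dispatches on substrings of the model name. A self-contained illustration of the two regex shapes (the sample strings are illustrative; the patterns are copied from the diff above):

import re

# Same pattern as _parse_thinking_deepseek_r1 / _parse_thinking_qwen3.
text = "<think>\nLet me reason about this.\n</think>\nThe answer is 42."
match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", text, re.DOTALL)
assert match and match.group(1) == "Let me reason about this."
assert match.group(2) == "The answer is 42."

# GLM-4.5 emits a leading newline before its <think> block, hence the
# separate _parse_thinking_glm_4_5 variant.
match = re.match(r"\n<think>(.*)</think>(.*)", "\n<think>step 1</think>done", re.DOTALL)
assert match and match.group(1) == "step 1" and match.group(2) == "done"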
@@ -346,9 +384,8 @@ class TogetherChatClient(CachingClient):
         self._client = Together(api_key=api_key)
         self._together_model = together_model
         self._disable_logprobs = bool(disable_logprobs)
-        # self.output_processor is actually a function, not a class
         self._parse_thinking = bool(parse_thinking)
-
+        # self.output_processor is actually a function, not a class
         self.output_processor: Optional[Callable[[str], str]] = (
             get_class_by_name(output_processor) if output_processor else None
         )
@@ -444,15 +481,15 @@ class TogetherChatClient(CachingClient):
             if self.output_processor:
                 output_text = self.output_processor(output_text)

+            thinking: Optional[Thinking] = None
             if self._parse_thinking:
-                thinking_text, output_text = _parse_thinking(output_text)
-                generated_outputs.append(
-                    GeneratedOutput(
-                        text=output_text, logprob=logprob, tokens=tokens, thinking=Thinking(text=thinking_text)
-                    )
-                )
-            else:
-                generated_outputs.append(GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens))
+                thinking_text, output_text = _parse_thinking(output_text, request.model)
+                thinking = Thinking(text=thinking_text)
+            elif hasattr(choice.message, "reasoning_content"):
+                thinking = Thinking(text=choice.message.reasoning_content)
+            generated_outputs.append(
+                GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens, thinking=thinking)
+            )
         return RequestResult(
             success=True,
             cached=cached,
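
The refactored loop now builds at most one Thinking value and always appends a single GeneratedOutput. A minimal sketch of the new fallback branch, using types.SimpleNamespace as a stand-in for the Together SDK's message object (the attribute probed by hasattr above is reasoning_content):

from types import SimpleNamespace

# Stand-in for choice.message; the real object comes from the Together SDK.
message = SimpleNamespace(content="The answer is 42.", reasoning_content="Let me check...")

# Mirrors the new elif branch: models that surface reasoning separately get
# their thinking captured even when parse_thinking is disabled for the deployment.
thinking_text = message.reasoning_content if hasattr(message, "reasoning_content") else None
assert thinking_text == "Let me check..."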
helm/clients/vertexai_client.py CHANGED
@@ -276,8 +276,14 @@ class VertexAIChatClient(VertexAIClient):
                 if not candidate.content:
                     raise VertexAIContentBlockedError(f"No content in candidate: {candidate}")
                 if not candidate.content.parts:
-                    raise VertexAIContentBlockedError(f"No content parts in candidate: {candidate}")
-                predictions.append({"text": candidate.content.text})
+                    if candidate.finish_reason == 2:  # MAX_TOKENS
+                        # This means that there is no text output because the maximum number of tokens were
+                        # reached during thinking.
+                        predictions.append({"text": ""})
+                    else:
+                        raise VertexAIContentBlockedError(f"No content parts in candidate: {candidate}")
+                else:
+                    predictions.append({"text": candidate.content.text})
             # TODO: Extract more information from the response
             return {"predictions": predictions}

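The literal 2 is the MAX_TOKENS value of the Vertex AI FinishReason enum. A more self-documenting equivalent (a sketch, assuming the vertexai.generative_models SDK, where FinishReason.MAX_TOKENS carries proto value 2):

from vertexai.generative_models import FinishReason

def hit_token_limit(finish_reason: int) -> bool:
    # True when generation stopped because the token budget ran out, which
    # can happen before any visible (non-thinking) text part is produced.
    return finish_reason == FinishReason.MAX_TOKENS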
helm/config/model_deployments.yaml CHANGED
@@ -1088,6 +1088,14 @@ model_deployments:
     # - https://cloud.google.com/vertex-ai/generative-ai/docs/learn/locations#global-endpoint
     location: global

+  - name: google/gemini-2.5-flash-lite
+    model_name: google/gemini-2.5-flash-lite
+    tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
+    max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash
+    # TODO: Max output tokens: 65536
+    client_spec:
+      class_name: "helm.clients.vertexai_client.VertexAIChatClient"
+
   - name: google/gemini-2.5-flash-preview-04-17
     model_name: google/gemini-2.5-flash-preview-04-17
     tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
@@ -2616,6 +2624,27 @@ model_deployments:
     client_spec:
       class_name: "helm.clients.openai_client.OpenAIClient"

+  - name: openai/gpt-5-2025-08-07
+    model_name: openai/gpt-5-2025-08-07
+    tokenizer_name: openai/o200k_base
+    max_sequence_length: 400000
+    client_spec:
+      class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
+
+  - name: openai/gpt-5-mini-2025-08-07
+    model_name: openai/gpt-5-mini-2025-08-07
+    tokenizer_name: openai/o200k_base
+    max_sequence_length: 400000
+    client_spec:
+      class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
+
+  - name: openai/gpt-5-nano-2025-08-07
+    model_name: openai/gpt-5-nano-2025-08-07
+    tokenizer_name: openai/o200k_base
+    max_sequence_length: 400000
+    client_spec:
+      class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
+
   - name: openai/whisper-1_gpt-4o-2024-11-20
     model_name: openai/whisper-1_gpt-4o-2024-11-20
     tokenizer_name: openai/o200k_base
@@ -2860,6 +2889,23 @@ model_deployments:
         openai_model_name: o3-pro-2025-06-10
         reasoning_effort: high

+  ## GPT-OSS
+  - name: together/gpt-oss-20b
+    model_name: openai/gpt-oss-20b
+    tokenizer_name: openai/o200k_harmony
+    # Source: https://platform.openai.com/docs/models/gpt-oss-20b
+    max_sequence_length: 131072
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+
+  - name: together/gpt-oss-120b
+    model_name: openai/gpt-oss-120b
+    tokenizer_name: openai/o200k_harmony
+    # Source: https://platform.openai.com/docs/models/gpt-oss-120b
+    max_sequence_length: 131072
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+
   ## Text Similarity Models
   # OpenAI similarity embedding models: https://beta.openai.com/docs/guides/embeddings
   # The number of parameters is guessed based on the number of parameters of the
@@ -3541,6 +3587,16 @@ model_deployments:
       args:
        together_model: togethercomputer/RedPajama-INCITE-7B-Instruct

+  ## Z.ai
+  - name: together/glm-4.5-air-fp8
+    model_name: zai-org/glm-4.5-air-fp8
+    tokenizer_name: zai-org/glm-4.5-air-fp8
+    max_sequence_length: 131072
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+      args:
+        parse_thinking: true
+
   - name: thudm/cogview2
     model_name: thudm/cogview2
     tokenizer_name: openai/clip-vit-large-patch14
@@ -3816,7 +3872,16 @@ model_deployments:
       class_name: "helm.clients.together_client.TogetherChatClient"
       args:
         parse_thinking: true
-
+
+  - name: together/qwen3-235b-a22b-instruct-2507-fp8
+    model_name: qwen/qwen3-235b-a22b-instruct-2507-fp8
+    tokenizer_name: qwen/qwen3-235b-a22b-instruct-2507-fp8
+    max_sequence_length: 262144
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+      args:
+        together_model: Qwen/Qwen3-235B-A22B-Instruct-2507-tput
+
   - name: huggingface/qwen2.5-7b-instruct-4bit
     model_name: qwen/qwen2.5-7b-instruct
     tokenizer_name: qwen/qwen2.5-7b-instruct
@@ -4590,3 +4655,12 @@ model_deployments:
       class_name: "helm.clients.huggingface_client.HuggingFaceClient"
       args:
         pretrained_model_name_or_path: nicholasKluge/TeenyTinyLlama-460m
+
+  - name: openrouter/mistral-medium-3.1
+    model_name: mistralai/mistral-medium-3.1
+    tokenizer_name: mistralai/Mistral-7B-v0.1
+    max_sequence_length: 128000
+    client_spec:
+      class_name: "helm.clients.openrouter_client.OpenRouterClient"
+      args:
+        model_name: mistralai/mistral-medium-3.1
helm/config/model_metadata.yaml CHANGED
@@ -1253,6 +1253,14 @@ models:
     release_date: 2025-06-17
     tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, AUDIO_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

+  - name: google/gemini-2.5-flash-lite
+    display_name: Gemini 2.5 Flash-Lite
+    description: Gemini 2.5 Flash-Lite ([blog](https://blog.google/products/gemini/gemini-2-5-model-family-expands/))
+    creator_organization_name: Google
+    access: limited
+    release_date: 2025-07-22
+    tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, AUDIO_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
   - name: google/gemini-2.5-flash-preview-04-17
     display_name: Gemini 2.5 Flash (04-17 preview)
     description: Gemini 2.5 Flash (04-17 preview) ([documentation](https://ai.google.dev/gemini-api/docs/models/gemini))
@@ -3052,6 +3060,30 @@ models:
     release_date: 2025-04-14
     tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

+  - name: openai/gpt-5-2025-08-07
+    display_name: GPT-5 (2025-08-07)
+    description: GPT-5 (2025-08-07) is a multimodal model trained for real-world coding tasks and long-running agentic tasks. ([blog](https://openai.com/index/introducing-gpt-5-for-developers/), [system card](https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf))
+    creator_organization_name: OpenAI
+    access: limited
+    release_date: 2025-08-07
+    tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+  - name: openai/gpt-5-mini-2025-08-07
+    display_name: GPT-5 mini (2025-08-07)
+    description: GPT-5 mini (2025-08-07) is a multimodal model trained for real-world coding tasks and long-running agentic tasks. ([blog](https://openai.com/index/introducing-gpt-5-for-developers/), [system card](https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf))
+    creator_organization_name: OpenAI
+    access: limited
+    release_date: 2025-08-07
+    tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+  - name: openai/gpt-5-nano-2025-08-07
+    display_name: GPT-5 nano (2025-08-07)
+    description: GPT-5 nano (2025-08-07) is a multimodal model trained for real-world coding tasks and long-running agentic tasks. ([blog](https://openai.com/index/introducing-gpt-5-for-developers/), [system card](https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf))
+    creator_organization_name: OpenAI
+    access: limited
+    release_date: 2025-08-07
+    tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
   - name: openai/whisper-1_gpt-4o-2024-11-20
     display_name: Whisper-1 + GPT-4o (2024-11-20)
     description: Transcribes the text with Whisper-1 and then uses GPT-4o to generate a response.
@@ -3273,6 +3305,23 @@ models:
     release_date: 2025-06-10
     tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

+  ## GPT-OSS
+  - name: openai/gpt-oss-20b
+    display_name: gpt-oss-20b
+    description: gpt-oss-20b is an open-weight language model that was trained using a mix of reinforcement learning and other techniques informed by OpenAI's internal models. It uses a mixture-of-experts architecture and activates 3.6B parameters per token. ([blog](https://openai.com/index/introducing-gpt-oss/))
+    creator_organization_name: OpenAI
+    access: open
+    release_date: 2025-08-05
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+  - name: openai/gpt-oss-120b
+    display_name: gpt-oss-120b
+    description: gpt-oss-120b is an open-weight language model that was trained using a mix of reinforcement learning and other techniques informed by OpenAI's internal models. It uses a mixture-of-experts architecture and activates 5.1B parameters per token. ([blog](https://openai.com/index/introducing-gpt-oss/))
+    creator_organization_name: OpenAI
+    access: open
+    release_date: 2025-08-05
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
   ## Codex Models
   # DEPRECATED: Codex models have been shut down on March 23 2023.

@@ -3549,6 +3598,14 @@ models:
     release_date: 2025-04-29
     tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

+  - name: qwen/qwen3-235b-a22b-instruct-2507-fp8
+    display_name: Qwen3 235B A22B Instruct 2507 FP8
+    description: Qwen3 235B A22B Instruct 2507 FP8 is an updated version of the non-thinking mode of Qwen3 235B A22B FP8.
+    creator_organization_name: Qwen
+    access: open
+    release_date: 2025-07-21 # https://x.com/Alibaba_Qwen/status/1947344511988076547
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
   - name: qwen/qwq-32b-preview
     display_name: QwQ (32B Preview)
     description: QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities. ([blog post](https://qwenlm.github.io/blog/qwq-32b-preview/)).
@@ -4315,6 +4372,17 @@ models:
     release_date: 2025-05-08
     tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

+  # Z.ai
+
+  - name: zai-org/glm-4.5-air-fp8
+    display_name: GLM-4.5-Air-FP8
+    description: GLM-4.5-Air-FP8 is a hybrid reasoning model designed to unify reasoning, coding, and agentic capabilities into a single model. It has 106 billion total parameters and 12 billion active parameters. The thinking mode is enabled by default. ([blog](https://z.ai/blog/glm-4.5))
+    creator_organization_name: Z.ai
+    access: open
+    num_parameters: 110000000000
+    release_date: 2025-07-28
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+

   # Granite - IBM
   # https://www.ibm.com/granite
@@ -4530,7 +4598,7 @@ models:

   - name: ibm/granite-3.3-8b-instruct
     display_name: IBM Granite 3.3 8B Instruct
-    description: IBM Granite 3.3 8B Instruct is a 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
+    description: IBM Granite 3.3 8B Instruct is an 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
     creator_organization_name: IBM
     access: open
     num_parameters: 8170000000
@@ -4539,7 +4607,7 @@ models:

   - name: ibm/granite-3.3-8b-instruct-with-guardian
     display_name: IBM Granite 3.3 8B Instruct (with guardian)
-    description: IBM Granite 3.3 8B Instruct is a 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct)) This model was run with an additional safety filter using [Granite Guardian 3.2](https://www.ibm.com/granite/docs/models/guardian/).
+    description: IBM Granite 3.3 8B Instruct is an 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. All prompts were first evaluated for risk by [IBM Granite Guardian 3.2 5B](https://www.ibm.com/granite/docs/models/guardian/) and prompts that were deemed risky (with a risk threshold of 0.8) received the response "I'm very sorry, but I can't assist with that.". ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
     creator_organization_name: IBM
     access: open
     num_parameters: 8170000000
helm/config/tokenizer_configs.yaml CHANGED
@@ -650,6 +650,12 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: "<|endoftext|>"

+  - name: openai/o200k_harmony
+    tokenizer_spec:
+      class_name: "helm.tokenizers.tiktoken_tokenizer.TiktokenTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|startoftext|>"
+
   - name: openai/clip-vit-large-patch14
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
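
The new o200k_harmony entry backs the GPT-OSS deployments above. A quick local sanity check (a sketch, assuming a tiktoken release recent enough to ship the o200k_harmony encoding, which extends o200k_base with the Harmony chat-format special tokens):

import tiktoken

# Load the encoding used by the gpt-oss models and round-trip a string.
enc = tiktoken.get_encoding("o200k_harmony")
tokens = enc.encode("Hello, world!")
assert enc.decode(tokens) == "Hello, world!"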
@@ -705,6 +711,12 @@ tokenizer_configs:
     end_of_text_token: "<|im_end|>"
     prefix_token: "<|im_start|>"

+  - name: qwen/qwen3-235b-a22b-instruct-2507-fp8
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|im_end|>"
+    prefix_token: ""
+
   - name: qwen/qwq-32b-preview
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -1048,7 +1060,6 @@ tokenizer_configs:
     end_of_text_token: ""

 # IBM Granite 3.3
-
   - name: ibm/granite-3.3-8b-instruct
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -1057,6 +1068,13 @@ tokenizer_configs:
     end_of_text_token: "<|end_of_text|>"
     prefix_token: "<|end_of_text|>"

+  # Z.ai GLM-4.5-AIR-FP8
+  - name: zai-org/glm-4.5-air-fp8
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: ""
+


 # DeepSeek-R1-Distill-Llama-3.1-8b
helm/proxy/example_queries.py CHANGED
@@ -21,7 +21,7 @@ example_queries = [
             """
             temperature: 0.5 # Medium amount of randomness
             stop_sequences: [.] # Stop when you hit a period
-            model: openai/gpt-3.5-turbo-0613
+            model: openai/gpt-4.1-nano-2025-04-14
             """
         ),
         environments="",
@@ -33,7 +33,7 @@ example_queries = [
             temperature: 0.5 # Medium amount of randomness
             stop_sequences: [\\n] # Stop when you hit a newline
             num_completions: 5 # Generate many samples
-            model: openai/gpt-3.5-turbo-0613
+            model: openai/gpt-4.1-nano-2025-04-14
             """
         ),
         environments="",
@@ -58,7 +58,7 @@ example_queries = [
             """
             temperature: 0 # Deterministic
             max_tokens: 50
-            model: openai/gpt-3.5-turbo-0613
+            model: openai/gpt-4.1-nano-2025-04-14
             """
         ),
         environments="",
@@ -76,7 +76,7 @@ example_queries = [
         environments=dedent(
             """
             occupation: [mathematician, lawyer, doctor]
-            model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
+            model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
             """
         ),
     ),
@@ -101,7 +101,7 @@ example_queries = [
         ),
         environments=dedent(
             """
-            model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
+            model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
             """
         ),
     ),
@@ -136,7 +136,7 @@ example_queries = [
         ),
         environments=dedent(
             """
-            model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
+            model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
             """
         ),
     ),
@@ -144,7 +144,7 @@ example_queries = [
         prompt="Write a Python function that takes two vectors a and b and returns their Euclidean distance.",
         settings=dedent(
             """
-            model: openai/gpt-3.5-turbo-0613
+            model: openai/gpt-4.1-nano-2025-04-14
             """
         ),
         environments="",
@@ -161,7 +161,7 @@ example_queries = [
         ),
         environments=dedent(
             """
-            model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
+            model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
             """
         ),
     ),
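
Every hunk in this file is the same mechanical substitution of retired gpt-3.5-turbo snapshots with GPT-4.1 snapshots. For reference, a standalone sketch of the shape of one entry (a plain dict stands in for the proxy's Query type so the snippet runs on its own):

from textwrap import dedent

# Illustrative reconstruction of one example query with the updated model name.
query = dict(
    prompt="Write a Python function that takes two vectors a and b and returns their Euclidean distance.",
    settings=dedent(
        """
        temperature: 0 # Deterministic
        max_tokens: 50
        model: openai/gpt-4.1-nano-2025-04-14
        """
    ),
    environments="",
)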
helm/proxy/server.py CHANGED
@@ -23,7 +23,7 @@ from helm.benchmark.model_deployment_registry import get_default_model_deploymen
 from helm.common.authentication import Authentication
 from helm.common.cache_backend_config import CacheBackendConfig, MongoCacheBackendConfig, SqliteCacheBackendConfig
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, setup_default_logging
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request
 from helm.common.perspective_api_request import PerspectiveAPIRequest
@@ -273,6 +273,7 @@ def main():
         default="",
     )
     args = parser.parse_args()
+    setup_default_logging()

     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.base_path)
helm/proxy/static/index.css CHANGED
@@ -35,6 +35,10 @@
   font-style: italic;
 }

+.thinking {
+  font-style: italic;
+}
+
 .token:hover {
   background-color: lightgreen;
 }
helm/proxy/static/index.js CHANGED
@@ -282,7 +282,13 @@ $(function () {
   requestResult.completions.forEach((completion) => {
     const $contents = $("<span>", {
       title: `logprob: ${completion.logprob}`,
-    }).append(renderTokens(completion.tokens));
+    });
+    if (completion.thinking) {
+      const $thinking = $("<span>", { class: "thinking" }).append(completion.thinking.text);
+      $contents.append($thinking);
+    }
+    const $resultText = completion.tokens.length > 0 ? renderTokens(completion.tokens) : $("<div>").append(completion.text);
+    $contents.append($resultText);
     const $metadata = $("<span>", { class: "metadata" });
     $metadata.append(
       $("<span>", { title: "Log probability" }).append(
helm/benchmark/metrics/aci_bench_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.aci_bench_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class ACIBenchMetric(LLMJuryMetric):
-    """Score metrics for ACIBench."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="aci_bench_accuracy",
-            scenario_name="aci_bench",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/chw_care_plan_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.chw_care_plan_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class CHWCarePlanMetric(LLMJuryMetric):
-    """Score metrics for CHWCarePlan."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="chw_care_plan_accuracy",
-            scenario_name="chw_care_plan",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/dischargeme_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.dischargeme_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class DischargeMeMetric(LLMJuryMetric):
-    """Score metrics for DischargeMe."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="dischargeme_accuracy",
-            scenario_name="dischargeme",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/med_dialog_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.med_dialog_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MedDialogMetric(LLMJuryMetric):
-    """Score metrics for MedDialog."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="med_dialog_accuracy",
-            scenario_name="med_dialog",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/medalign_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.medalign_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MedalignMetric(LLMJuryMetric):
-    """Score metrics for Medalign."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="medalign_accuracy",
-            scenario_name="medalign",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/medi_qa_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.medi_qa_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MediQAMetric(LLMJuryMetric):
-    """Score metrics for MediQA."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="medi_qa_accuracy",
-            scenario_name="medi_qa",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/medication_qa_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.medication_qa_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MedicationQAMetric(LLMJuryMetric):
-    """Score metrics for MedicationQA."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="medication_qa_accuracy",
-            scenario_name="medication_qa",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/mental_health_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.mental_health_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MentalHealthMetric(LLMJuryMetric):
-    """Score metrics for MentalHealth."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="mental_health_accuracy",
-            scenario_name="mental_health",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/mimic_bhc_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.mimic_bhc_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MIMICBHCMetric(LLMJuryMetric):
-    """Score metrics for MIMICBHC."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="mimic_bhc_accuracy",
-            scenario_name="mimic_bhc",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/mimic_rrs_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.mimic_rrs_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MIMICRRSMetric(LLMJuryMetric):
-    """Score metrics for MIMICRRS."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="mimic_rrs_accuracy",
-            scenario_name="mimic_rrs",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
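
All of the removed per-scenario metric modules follow the identical pattern shown above: a thin subclass of LLMJuryMetric that only fills in four constructor arguments. A hedged sketch of the direct construction this consolidation enables, using the 0.5.7 names from the deleted file (the actual 0.5.8 call sites, and whether these imports survive unchanged there, are not shown in this diff):

from helm.benchmark.annotation.aci_bench_annotator import ANNOTATOR_MODELS
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric

# Instead of a one-off ACIBenchMetric subclass, the same metric can be
# instantiated directly with the scenario-specific values from the deleted file.
aci_bench_metric = LLMJuryMetric(
    metric_name="aci_bench_accuracy",
    scenario_name="aci_bench",
    annotator_models=ANNOTATOR_MODELS,
    default_score=1.0,
)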