crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of crfm-helm might be problematic; see the registry page for more details.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +134 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +26 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +17 -17
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +31 -19
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -11
- helm/clients/vertexai_client.py +8 -2
- helm/config/model_deployments.yaml +75 -1
- helm/config/model_metadata.yaml +70 -2
- helm/config/tokenizer_configs.yaml +19 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/clients/together_client.py
CHANGED
@@ -99,7 +99,7 @@ class JobNotFinishedError(TogetherClientError):
     pass
 
 
-def _parse_thinking(input: str) -> Tuple[str, str]:
+def _parse_thinking_deepseek_r1(input: str) -> Tuple[str, str]:
     """Return a tuple of thinking text and output text."""
     match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
     if match:
@@ -112,6 +112,44 @@ def _parse_thinking(input: str) -> Tuple[str, str]:
     return (input, "")
 
 
+def _parse_thinking_qwen3(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking_glm_4_5(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"\n<think>(.*)</think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"\n<think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking(input: str, model_name: str) -> Tuple[str, str]:
+    # TODO: Come up with a more sustainable extensible way of doing this.
+    if "deepseek-r1" in model_name:
+        return _parse_thinking_deepseek_r1(input)
+    elif "qwen3" in model_name:
+        return _parse_thinking_qwen3(input)
+    elif "glm-4.5" in model_name:
+        return _parse_thinking_glm_4_5(input)
+    else:
+        raise Exception(f"No thinking parser available for model {model_name}")
+
+
 class TogetherClient(CachingClient):
     """
     Client for the models where we evaluate offline. Since the queries are handled offline, the `TogetherClient` just
@@ -346,9 +384,8 @@ class TogetherChatClient(CachingClient):
         self._client = Together(api_key=api_key)
         self._together_model = together_model
         self._disable_logprobs = bool(disable_logprobs)
-        # self.output_processor is actually a function, not a class
         self._parse_thinking = bool(parse_thinking)
-
+        # self.output_processor is actually a function, not a class
         self.output_processor: Optional[Callable[[str], str]] = (
             get_class_by_name(output_processor) if output_processor else None
         )
@@ -444,15 +481,15 @@ class TogetherChatClient(CachingClient):
             if self.output_processor:
                 output_text = self.output_processor(output_text)
 
+            thinking: Optional[Thinking] = None
             if self._parse_thinking:
-                thinking_text, output_text = _parse_thinking(output_text)
-
-
-
-
-                )
-
-            generated_outputs.append(GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens))
+                thinking_text, output_text = _parse_thinking(output_text, request.model)
+                thinking = Thinking(text=thinking_text)
+            elif hasattr(choice.message, "reasoning_content"):
+                thinking = Thinking(text=choice.message.reasoning_content)
+            generated_outputs.append(
+                GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens, thinking=thinking)
+            )
         return RequestResult(
             success=True,
             cached=cached,
helm/clients/vertexai_client.py
CHANGED
@@ -276,8 +276,14 @@ class VertexAIChatClient(VertexAIClient):
                 if not candidate.content:
                     raise VertexAIContentBlockedError(f"No content in candidate: {candidate}")
                 if not candidate.content.parts:
-
-
+                    if candidate.finish_reason == 2: # MAX_TOKENS
+                        # This means that there is no text output because the maximum number of tokens were
+                        # reached during thinking.
+                        predictions.append({"text": ""})
+                    else:
+                        raise VertexAIContentBlockedError(f"No content parts in candidate: {candidate}")
+                else:
+                    predictions.append({"text": candidate.content.text})
             # TODO: Extract more information from the response
             return {"predictions": predictions}
 
helm/config/model_deployments.yaml
CHANGED
@@ -1088,6 +1088,14 @@ model_deployments:
     # - https://cloud.google.com/vertex-ai/generative-ai/docs/learn/locations#global-endpoint
     location: global
 
+  - name: google/gemini-2.5-flash-lite
+    model_name: google/gemini-2.5-flash-lite
+    tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
+    max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash
+    # TODO: Max output tokens: 65536
+    client_spec:
+      class_name: "helm.clients.vertexai_client.VertexAIChatClient"
+
   - name: google/gemini-2.5-flash-preview-04-17
     model_name: google/gemini-2.5-flash-preview-04-17
     tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
@@ -2616,6 +2624,27 @@ model_deployments:
     client_spec:
       class_name: "helm.clients.openai_client.OpenAIClient"
 
+  - name: openai/gpt-5-2025-08-07
+    model_name: openai/gpt-5-2025-08-07
+    tokenizer_name: openai/o200k_base
+    max_sequence_length: 400000
+    client_spec:
+      class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
+
+  - name: openai/gpt-5-mini-2025-08-07
+    model_name: openai/gpt-5-mini-2025-08-07
+    tokenizer_name: openai/o200k_base
+    max_sequence_length: 400000
+    client_spec:
+      class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
+
+  - name: openai/gpt-5-nano-2025-08-07
+    model_name: openai/gpt-5-nano-2025-08-07
+    tokenizer_name: openai/o200k_base
+    max_sequence_length: 400000
+    client_spec:
+      class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
+
   - name: openai/whisper-1_gpt-4o-2024-11-20
     model_name: openai/whisper-1_gpt-4o-2024-11-20
     tokenizer_name: openai/o200k_base
@@ -2860,6 +2889,23 @@ model_deployments:
       openai_model_name: o3-pro-2025-06-10
       reasoning_effort: high
 
+  ## GPT-OSS
+  - name: together/gpt-oss-20b
+    model_name: openai/gpt-oss-20b
+    tokenizer_name: openai/o200k_harmony
+    # Source: https://platform.openai.com/docs/models/gpt-oss-20b
+    max_sequence_length: 131072
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+
+  - name: together/gpt-oss-120b
+    model_name: openai/gpt-oss-120b
+    tokenizer_name: openai/o200k_harmony
+    # Source: https://platform.openai.com/docs/models/gpt-oss-120b
+    max_sequence_length: 131072
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+
   ## Text Similarity Models
   # OpenAI similarity embedding models: https://beta.openai.com/docs/guides/embeddings
   # The number of parameters is guessed based on the number of parameters of the
@@ -3541,6 +3587,16 @@ model_deployments:
       args:
         together_model: togethercomputer/RedPajama-INCITE-7B-Instruct
 
+  ## Z.ai
+  - name: together/glm-4.5-air-fp8
+    model_name: zai-org/glm-4.5-air-fp8
+    tokenizer_name: zai-org/glm-4.5-air-fp8
+    max_sequence_length: 131072
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+      args:
+        parse_thinking: true
+
   - name: thudm/cogview2
     model_name: thudm/cogview2
     tokenizer_name: openai/clip-vit-large-patch14
@@ -3816,7 +3872,16 @@ model_deployments:
       class_name: "helm.clients.together_client.TogetherChatClient"
       args:
         parse_thinking: true
-
+
+  - name: together/qwen3-235b-a22b-instruct-2507-fp8
+    model_name: qwen/qwen3-235b-a22b-instruct-2507-fp8
+    tokenizer_name: qwen/qwen3-235b-a22b-instruct-2507-fp8
+    max_sequence_length: 262144
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+      args:
+        together_model: Qwen/Qwen3-235B-A22B-Instruct-2507-tput
+
   - name: huggingface/qwen2.5-7b-instruct-4bit
     model_name: qwen/qwen2.5-7b-instruct
     tokenizer_name: qwen/qwen2.5-7b-instruct
@@ -4590,3 +4655,12 @@ model_deployments:
       class_name: "helm.clients.huggingface_client.HuggingFaceClient"
       args:
         pretrained_model_name_or_path: nicholasKluge/TeenyTinyLlama-460m
+
+  - name: openrouter/mistral-medium-3.1
+    model_name: mistralai/mistral-medium-3.1
+    tokenizer_name: mistralai/Mistral-7B-v0.1
+    max_sequence_length: 128000
+    client_spec:
+      class_name: "helm.clients.openrouter_client.OpenRouterClient"
+      args:
+        model_name: mistralai/mistral-medium-3.1
helm/config/model_metadata.yaml
CHANGED
@@ -1253,6 +1253,14 @@ models:
     release_date: 2025-06-17
     tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, AUDIO_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
+  - name: google/gemini-2.5-flash-lite
+    display_name: Gemini 2.5 Flash-Lite
+    description: Gemini 2.5 Flash-Lite ([blog](https://blog.google/products/gemini/gemini-2-5-model-family-expands/))
+    creator_organization_name: Google
+    access: limited
+    release_date: 2025-07-22
+    tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, AUDIO_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
   - name: google/gemini-2.5-flash-preview-04-17
     display_name: Gemini 2.5 Flash (04-17 preview)
     description: Gemini 2.5 Flash (04-17 preview) ([documentation](https://ai.google.dev/gemini-api/docs/models/gemini))
@@ -3052,6 +3060,30 @@ models:
     release_date: 2025-04-14
     tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
+  - name: openai/gpt-5-2025-08-07
+    display_name: GPT-5 (2025-08-07)
+    description: GPT-5 (2025-08-07) is a multimdodal model trained for real-world coding tasks and long-running agentic tasks. ([blog](https://openai.com/index/introducing-gpt-5-for-developers/), [system card](https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf))
+    creator_organization_name: OpenAI
+    access: limited
+    release_date: 2025-08-07
+    tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+  - name: openai/gpt-5-mini-2025-08-07
+    display_name: GPT-5 mini (2025-08-07)
+    description: GPT-5 mini (2025-08-07) is a multimdodal model trained for real-world coding tasks and long-running agentic tasks. ([blog](https://openai.com/index/introducing-gpt-5-for-developers/), [system card](https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf))
+    creator_organization_name: OpenAI
+    access: limited
+    release_date: 2025-08-07
+    tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+  - name: openai/gpt-5-nano-2025-08-07
+    display_name: GPT-5 nano (2025-08-07)
+    description: GPT-5 nano (2025-08-07) is a multimdodal model trained for real-world coding tasks and long-running agentic tasks. ([blog](https://openai.com/index/introducing-gpt-5-for-developers/), [system card](https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf))
+    creator_organization_name: OpenAI
+    access: limited
+    release_date: 2025-08-07
+    tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
   - name: openai/whisper-1_gpt-4o-2024-11-20
     display_name: Whisper-1 + GPT-4o (2024-11-20)
     description: Transcribes the text with Whisper-1 and then uses GPT-4o to generate a response.
@@ -3273,6 +3305,23 @@ models:
     release_date: 2025-06-10
     tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
+  ## GPT-OSS
+  - name: openai/gpt-oss-20b
+    display_name: gpt-oss-20b
+    description: gpt-oss-20b is an open-weight language model that was trained using a mix of reinforcement learning and other techniques informed by OpenAI's internal models. It uses a mixture-of-experts architecture and activates 3.6B parameters per token. ([blog](https://openai.com/index/introducing-gpt-oss/))
+    creator_organization_name: OpenAI
+    access: open
+    release_date: 2025-08-05
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+  - name: openai/gpt-oss-120b
+    display_name: gpt-oss-120b
+    description: gpt-oss-120b is an open-weight language model that was trained using a mix of reinforcement learning and other techniques informed by OpenAI's internal models. It uses a mixture-of-experts architecture and activates 5.1B parameters per token. ([blog](https://openai.com/index/introducing-gpt-oss/))
+    creator_organization_name: OpenAI
+    access: open
+    release_date: 2025-08-05
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
   ## Codex Models
   # DEPRECATED: Codex models have been shut down on March 23 2023.
 
@@ -3549,6 +3598,14 @@ models:
     release_date: 2025-04-29
     tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
+  - name: qwen/qwen3-235b-a22b-instruct-2507-fp8
+    display_name: Qwen3 235B A22B Instruct 2507 FP8
+    description: Qwen3 235B A22B Instruct 2507 FP8 is an updated version of the non-thinking mode of Qwen3 235B A22B FP8.
+    creator_organization_name: Qwen
+    access: open
+    release_date: 2025-07-21 # https://x.com/Alibaba_Qwen/status/1947344511988076547
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
   - name: qwen/qwq-32b-preview
     display_name: QwQ (32B Preview)
     description: QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities. ([blog post](https://qwenlm.github.io/blog/qwq-32b-preview/)).
@@ -4315,6 +4372,17 @@ models:
     release_date: 2025-05-08
     tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
+  # Z.ai
+
+  - name: zai-org/glm-4.5-air-fp8
+    display_name: GLM-4.5-Air-FP8
+    description: GLM-4.5-Air-FP8 is a hybrid reasoning model designed to unify reasoning, coding, and agentic capabilities into a single model. It has 106 billion total parameters and 12 billion active parameters. The thinking mode is enabled by default. ([blog](https://z.ai/blog/glm-4.5))
+    creator_organization_name: Z.ai
+    access: open
+    num_parameters: 110000000000
+    release_date: 2025-07-28
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
 
 # Granite - IBM
 # https://www.ibm.com/granite
@@ -4530,7 +4598,7 @@ models:
 
   - name: ibm/granite-3.3-8b-instruct
     display_name: IBM Granite 3.3 8B Instruct
-    description: IBM Granite 3.3 8B Instruct is
+    description: IBM Granite 3.3 8B Instruct is an 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
     creator_organization_name: IBM
     access: open
     num_parameters: 8170000000
@@ -4539,7 +4607,7 @@ models:
 
   - name: ibm/granite-3.3-8b-instruct-with-guardian
     display_name: IBM Granite 3.3 8B Instruct (with guardian)
-    description: IBM Granite 3.3 8B Instruct is
+    description: IBM Granite 3.3 8B Instruct is an 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. All prompts were first evaluated for risk by [IBM Granite Guardian 3.2 5B](https://www.ibm.com/granite/docs/models/guardian/) and prompts that were deemed risky (with a risk threshold of 0.8) received the response "I'm very sorry, but I can't assist with that.". ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
     creator_organization_name: IBM
     access: open
     num_parameters: 8170000000
helm/config/tokenizer_configs.yaml
CHANGED
@@ -650,6 +650,12 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: "<|endoftext|>"
 
+  - name: openai/o200k_harmony
+    tokenizer_spec:
+      class_name: "helm.tokenizers.tiktoken_tokenizer.TiktokenTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|startoftext|>"
+
   - name: openai/clip-vit-large-patch14
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -705,6 +711,12 @@ tokenizer_configs:
     end_of_text_token: "<|im_end|>"
     prefix_token: "<|im_start|>"
 
+  - name: qwen/qwen3-235b-a22b-instruct-2507-fp8
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|im_end|>"
+    prefix_token: ""
+
   - name: qwen/qwq-32b-preview
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -1048,7 +1060,6 @@ tokenizer_configs:
     end_of_text_token: ""
 
   # IBM Granite 3.3
-
   - name: ibm/granite-3.3-8b-instruct
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -1057,6 +1068,13 @@ tokenizer_configs:
     end_of_text_token: "<|end_of_text|>"
     prefix_token: "<|end_of_text|>"
 
+  # Z.ai GLM-4.5-AIR-FP8
+  - name: zai-org/glm-4.5-air-fp8
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: ""
+
 
 
   # DeepSeek-R1-Distill-Llama-3.1-8b
helm/proxy/example_queries.py
CHANGED
@@ -21,7 +21,7 @@ example_queries = [
             """
             temperature: 0.5 # Medium amount of randomness
             stop_sequences: [.] # Stop when you hit a period
-            model: openai/gpt-
+            model: openai/gpt-4.1-nano-2025-04-14
             """
         ),
         environments="",
@@ -33,7 +33,7 @@ example_queries = [
             temperature: 0.5 # Medium amount of randomness
             stop_sequences: [\\n] # Stop when you hit a newline
             num_completions: 5 # Generate many samples
-            model: openai/gpt-
+            model: openai/gpt-4.1-nano-2025-04-14
             """
         ),
         environments="",
@@ -58,7 +58,7 @@ example_queries = [
             """
             temperature: 0 # Deterministic
            max_tokens: 50
-            model: openai/gpt-
+            model: openai/gpt-4.1-nano-2025-04-14
             """
         ),
         environments="",
@@ -76,7 +76,7 @@ example_queries = [
         environments=dedent(
             """
             occupation: [mathematician, lawyer, doctor]
-            model: [openai/gpt-
+            model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
             """
         ),
     ),
@@ -101,7 +101,7 @@ example_queries = [
         ),
         environments=dedent(
             """
-            model: [openai/gpt-
+            model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
             """
         ),
     ),
@@ -136,7 +136,7 @@ example_queries = [
         ),
         environments=dedent(
             """
-            model: [openai/gpt-
+            model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
             """
         ),
     ),
@@ -144,7 +144,7 @@ example_queries = [
         prompt="Write a Python function that takes two vectors a and b and returns their Euclidean distance.",
         settings=dedent(
             """
-            model: openai/gpt-
+            model: openai/gpt-4.1-nano-2025-04-14
             """
         ),
         environments="",
@@ -161,7 +161,7 @@ example_queries = [
         ),
         environments=dedent(
             """
-            model: [openai/gpt-
+            model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
             """
         ),
     ),
helm/proxy/server.py
CHANGED
@@ -23,7 +23,7 @@ from helm.benchmark.model_deployment_registry import get_default_model_deploymen
 from helm.common.authentication import Authentication
 from helm.common.cache_backend_config import CacheBackendConfig, MongoCacheBackendConfig, SqliteCacheBackendConfig
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, setup_default_logging
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request
 from helm.common.perspective_api_request import PerspectiveAPIRequest
@@ -273,6 +273,7 @@ def main():
         default="",
     )
     args = parser.parse_args()
+    setup_default_logging()
 
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.base_path)
helm/proxy/static/index.css
CHANGED
helm/proxy/static/index.js
CHANGED
@@ -282,7 +282,13 @@ $(function () {
     requestResult.completions.forEach((completion) => {
       const $contents = $("<span>", {
         title: `logprob: ${completion.logprob}`,
-      })
+      });
+      if (completion.thinking) {
+        const $thinking = $("<span>", { class: "thinking" }).append(completion.thinking.text);
+        $contents.append($thinking);
+      }
+      const $resultText = completion.tokens.length > 0 ? renderTokens(completion.tokens) : $("<div>").append(completion.text);
+      $contents.append($resultText);
       const $metadata = $("<span>", { class: "metadata" });
       $metadata.append(
         $("<span>", { title: "Log probability" }).append(
helm/benchmark/metrics/aci_bench_metrics.py
REMOVED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.aci_bench_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class ACIBenchMetric(LLMJuryMetric):
-    """Score metrics for ACIBench."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="aci_bench_accuracy",
-            scenario_name="aci_bench",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/chw_care_plan_metrics.py
REMOVED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.chw_care_plan_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class CHWCarePlanMetric(LLMJuryMetric):
-    """Score metrics for CHWCarePlan."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="chw_care_plan_accuracy",
-            scenario_name="chw_care_plan",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/dischargeme_metrics.py
REMOVED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.dischargeme_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class DischargeMeMetric(LLMJuryMetric):
-    """Score metrics for DischargeMe."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="dischargeme_accuracy",
-            scenario_name="dischargeme",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/med_dialog_metrics.py
REMOVED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.med_dialog_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MedDialogMetric(LLMJuryMetric):
-    """Score metrics for MedDialog."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="med_dialog_accuracy",
-            scenario_name="med_dialog",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/medalign_metrics.py
REMOVED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.medalign_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MedalignMetric(LLMJuryMetric):
-    """Score metrics for Medalign."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="medalign_accuracy",
-            scenario_name="medalign",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/medi_qa_metrics.py
REMOVED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.medi_qa_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MediQAMetric(LLMJuryMetric):
-    """Score metrics for MediQA."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="medi_qa_accuracy",
-            scenario_name="medi_qa",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/medication_qa_metrics.py
REMOVED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.medication_qa_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MedicationQAMetric(LLMJuryMetric):
-    """Score metrics for MedicationQA."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="medication_qa_accuracy",
-            scenario_name="medication_qa",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/mental_health_metrics.py
REMOVED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.mental_health_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MentalHealthMetric(LLMJuryMetric):
-    """Score metrics for MentalHealth."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="mental_health_accuracy",
-            scenario_name="mental_health",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/mimic_bhc_metrics.py
REMOVED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.mimic_bhc_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MIMICBHCMetric(LLMJuryMetric):
-    """Score metrics for MIMICBHC."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="mimic_bhc_accuracy",
-            scenario_name="mimic_bhc",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/mimic_rrs_metrics.py
REMOVED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.mimic_rrs_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MIMICRRSMetric(LLMJuryMetric):
-    """Score metrics for MIMICRRS."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="mimic_rrs_accuracy",
-            scenario_name="mimic_rrs",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )