crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (243)
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/classification_metrics.py +19 -1
  25. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  26. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  27. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  28. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  29. helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
  30. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  31. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  32. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  33. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  34. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  35. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  36. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  37. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  38. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  39. helm/benchmark/metrics/medec_metrics.py +25 -2
  40. helm/benchmark/metrics/metric.py +25 -0
  41. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  42. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  43. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  44. helm/benchmark/metrics/summac/model_summac.py +2 -2
  45. helm/benchmark/metrics/summarization_metrics.py +129 -1
  46. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  47. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  48. helm/benchmark/presentation/schema.py +5 -22
  49. helm/benchmark/presentation/summarize.py +180 -11
  50. helm/benchmark/presentation/taxonomy_info.py +20 -0
  51. helm/benchmark/run_expander.py +4 -0
  52. helm/benchmark/run_specs/arabic_run_specs.py +134 -16
  53. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  54. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  55. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  56. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  57. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  58. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  59. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  60. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  61. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  62. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  63. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  64. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  65. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  66. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  67. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  68. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  69. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  70. helm/benchmark/scenarios/bold_scenario.py +15 -0
  71. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  72. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  73. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  74. helm/benchmark/scenarios/clear_scenario.py +23 -0
  75. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  76. helm/benchmark/scenarios/code_scenario.py +28 -0
  77. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  78. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  79. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  80. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  81. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  82. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  83. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  84. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  85. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  86. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  87. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  88. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  89. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  90. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  91. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  92. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  93. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  94. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  95. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  96. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  97. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  98. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  99. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  100. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  101. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  102. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  103. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  104. helm/benchmark/scenarios/ice_scenario.py +21 -1
  105. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  106. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  107. helm/benchmark/scenarios/koala_scenario.py +21 -1
  108. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  109. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  110. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  111. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  112. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  113. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  114. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  115. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  116. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  117. helm/benchmark/scenarios/math_scenario.py +26 -0
  118. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  119. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  120. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  121. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  122. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  123. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  124. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  125. helm/benchmark/scenarios/medec_scenario.py +23 -0
  126. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  127. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  128. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  129. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  130. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  131. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  132. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  133. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  134. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  135. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  136. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  137. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  138. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  139. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  140. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  141. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  142. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  143. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  144. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  145. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  146. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  147. helm/benchmark/scenarios/quac_scenario.py +14 -0
  148. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  149. helm/benchmark/scenarios/raft_scenario.py +15 -0
  150. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  151. helm/benchmark/scenarios/scenario.py +31 -0
  152. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  153. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  154. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  155. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  156. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  157. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  158. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  159. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  160. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  161. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  162. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  163. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  164. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  165. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  166. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  167. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  168. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  169. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  170. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  171. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  172. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  173. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  175. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  176. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  177. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  178. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  179. helm/benchmark/static/schema_arabic.yaml +55 -12
  180. helm/benchmark/static/schema_long_context.yaml +17 -17
  181. helm/benchmark/static/schema_medhelm.yaml +36 -0
  182. helm/benchmark/static/schema_slp.yaml +219 -0
  183. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  184. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  185. helm/benchmark/static_build/index.html +2 -2
  186. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  187. helm/clients/audio_language/llama_omni/constants.py +9 -0
  188. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  189. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  190. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  191. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  192. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  193. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  194. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  195. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  196. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  197. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  198. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  199. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  200. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  201. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  202. helm/clients/audio_language/llama_omni/utils.py +202 -0
  203. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  204. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  205. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  206. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  207. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  208. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  209. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  210. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  211. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  212. helm/clients/openai_client.py +31 -19
  213. helm/clients/openai_responses_client.py +27 -3
  214. helm/clients/openrouter_client.py +31 -0
  215. helm/clients/test_openrouter_client.py +69 -0
  216. helm/clients/together_client.py +48 -11
  217. helm/clients/vertexai_client.py +8 -2
  218. helm/config/model_deployments.yaml +75 -1
  219. helm/config/model_metadata.yaml +70 -2
  220. helm/config/tokenizer_configs.yaml +19 -1
  221. helm/proxy/example_queries.py +8 -8
  222. helm/proxy/server.py +2 -1
  223. helm/proxy/static/index.css +4 -0
  224. helm/proxy/static/index.js +7 -1
  225. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  226. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  227. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  228. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  229. helm/benchmark/metrics/medalign_metrics.py +0 -14
  230. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  231. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  232. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  233. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  234. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  235. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  236. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  237. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  238. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  239. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  240. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  241. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  242. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  243. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/clients/together_client.py CHANGED
@@ -99,7 +99,7 @@ class JobNotFinishedError(TogetherClientError):
     pass


-def _parse_thinking(input: str) -> Tuple[str, str]:
+def _parse_thinking_deepseek_r1(input: str) -> Tuple[str, str]:
     """Return a tuple of thinking text and output text."""
     match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
     if match:
@@ -112,6 +112,44 @@ def _parse_thinking(input: str) -> Tuple[str, str]:
     return (input, "")


+def _parse_thinking_qwen3(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking_glm_4_5(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"\n<think>(.*)</think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"\n<think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking(input: str, model_name: str) -> Tuple[str, str]:
+    # TODO: Come up with a more sustainable extensible way of doing this.
+    if "deepseek-r1" in model_name:
+        return _parse_thinking_deepseek_r1(input)
+    elif "qwen3" in model_name:
+        return _parse_thinking_qwen3(input)
+    elif "glm-4.5" in model_name:
+        return _parse_thinking_glm_4_5(input)
+    else:
+        raise Exception(f"No thinking parser available for model {model_name}")
+
+
 class TogetherClient(CachingClient):
     """
     Client for the models where we evaluate offline. Since the queries are handled offline, the `TogetherClient` just
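
The per-model helpers differ only in how much whitespace they expect around the <think> tags, and the new _parse_thinking dispatches on substrings of the model name. A self-contained illustration of the two regex shapes (the sample strings are illustrative; the patterns are copied from the diff above):

import re

# Same pattern as _parse_thinking_deepseek_r1 / _parse_thinking_qwen3.
text = "<think>\nLet me reason about this.\n</think>\nThe answer is 42."
match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", text, re.DOTALL)
assert match and match.group(1) == "Let me reason about this."
assert match.group(2) == "The answer is 42."

# GLM-4.5 emits a leading newline before its <think> block, hence the
# separate _parse_thinking_glm_4_5 variant.
match = re.match(r"\n<think>(.*)</think>(.*)", "\n<think>step 1</think>done", re.DOTALL)
assert match and match.group(1) == "step 1" and match.group(2) == "done"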
@@ -346,9 +384,8 @@ class TogetherChatClient(CachingClient):
         self._client = Together(api_key=api_key)
         self._together_model = together_model
         self._disable_logprobs = bool(disable_logprobs)
-        # self.output_processor is actually a function, not a class
         self._parse_thinking = bool(parse_thinking)
-
+        # self.output_processor is actually a function, not a class
         self.output_processor: Optional[Callable[[str], str]] = (
             get_class_by_name(output_processor) if output_processor else None
         )
@@ -444,15 +481,15 @@ class TogetherChatClient(CachingClient):
             if self.output_processor:
                 output_text = self.output_processor(output_text)

+            thinking: Optional[Thinking] = None
             if self._parse_thinking:
-                thinking_text, output_text = _parse_thinking(output_text)
-                generated_outputs.append(
-                    GeneratedOutput(
-                        text=output_text, logprob=logprob, tokens=tokens, thinking=Thinking(text=thinking_text)
-                    )
-                )
-            else:
-                generated_outputs.append(GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens))
+                thinking_text, output_text = _parse_thinking(output_text, request.model)
+                thinking = Thinking(text=thinking_text)
+            elif hasattr(choice.message, "reasoning_content"):
+                thinking = Thinking(text=choice.message.reasoning_content)
+            generated_outputs.append(
+                GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens, thinking=thinking)
+            )
         return RequestResult(
             success=True,
             cached=cached,
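
The refactored loop now builds at most one Thinking value and always appends a single GeneratedOutput. A minimal sketch of the new fallback branch, using types.SimpleNamespace as a stand-in for the Together SDK's message object (the attribute probed by hasattr above is reasoning_content):

from types import SimpleNamespace

# Stand-in for choice.message; the real object comes from the Together SDK.
message = SimpleNamespace(content="The answer is 42.", reasoning_content="Let me check...")

# Mirrors the new elif branch: models that surface reasoning separately get
# their thinking captured even when parse_thinking is disabled for the deployment.
thinking_text = message.reasoning_content if hasattr(message, "reasoning_content") else None
assert thinking_text == "Let me check..."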
helm/clients/vertexai_client.py CHANGED
@@ -276,8 +276,14 @@ class VertexAIChatClient(VertexAIClient):
                 if not candidate.content:
                     raise VertexAIContentBlockedError(f"No content in candidate: {candidate}")
                 if not candidate.content.parts:
-                    raise VertexAIContentBlockedError(f"No content parts in candidate: {candidate}")
-                predictions.append({"text": candidate.content.text})
+                    if candidate.finish_reason == 2:  # MAX_TOKENS
+                        # This means that there is no text output because the maximum number of tokens were
+                        # reached during thinking.
+                        predictions.append({"text": ""})
+                    else:
+                        raise VertexAIContentBlockedError(f"No content parts in candidate: {candidate}")
+                else:
+                    predictions.append({"text": candidate.content.text})
             # TODO: Extract more information from the response
             return {"predictions": predictions}

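The literal 2 is the MAX_TOKENS value of the Vertex AI FinishReason enum. A more self-documenting equivalent (a sketch, assuming the vertexai.generative_models SDK, where FinishReason.MAX_TOKENS carries proto value 2):

from vertexai.generative_models import FinishReason

def hit_token_limit(finish_reason: int) -> bool:
    # True when generation stopped because the token budget ran out, which
    # can happen before any visible (non-thinking) text part is produced.
    return finish_reason == FinishReason.MAX_TOKENS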
helm/config/model_deployments.yaml CHANGED
@@ -1088,6 +1088,14 @@ model_deployments:
     # - https://cloud.google.com/vertex-ai/generative-ai/docs/learn/locations#global-endpoint
     location: global

+  - name: google/gemini-2.5-flash-lite
+    model_name: google/gemini-2.5-flash-lite
+    tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
+    max_sequence_length: 1048576 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash
+    # TODO: Max output tokens: 65536
+    client_spec:
+      class_name: "helm.clients.vertexai_client.VertexAIChatClient"
+
   - name: google/gemini-2.5-flash-preview-04-17
     model_name: google/gemini-2.5-flash-preview-04-17
     tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
@@ -2616,6 +2624,27 @@ model_deployments:
     client_spec:
       class_name: "helm.clients.openai_client.OpenAIClient"

+  - name: openai/gpt-5-2025-08-07
+    model_name: openai/gpt-5-2025-08-07
+    tokenizer_name: openai/o200k_base
+    max_sequence_length: 400000
+    client_spec:
+      class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
+
+  - name: openai/gpt-5-mini-2025-08-07
+    model_name: openai/gpt-5-mini-2025-08-07
+    tokenizer_name: openai/o200k_base
+    max_sequence_length: 400000
+    client_spec:
+      class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
+
+  - name: openai/gpt-5-nano-2025-08-07
+    model_name: openai/gpt-5-nano-2025-08-07
+    tokenizer_name: openai/o200k_base
+    max_sequence_length: 400000
+    client_spec:
+      class_name: "helm.clients.openai_responses_client.OpenAIResponseClient"
+
   - name: openai/whisper-1_gpt-4o-2024-11-20
     model_name: openai/whisper-1_gpt-4o-2024-11-20
     tokenizer_name: openai/o200k_base
@@ -2860,6 +2889,23 @@ model_deployments:
         openai_model_name: o3-pro-2025-06-10
         reasoning_effort: high

+  ## GPT-OSS
+  - name: together/gpt-oss-20b
+    model_name: openai/gpt-oss-20b
+    tokenizer_name: openai/o200k_harmony
+    # Source: https://platform.openai.com/docs/models/gpt-oss-20b
+    max_sequence_length: 131072
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+
+  - name: together/gpt-oss-120b
+    model_name: openai/gpt-oss-120b
+    tokenizer_name: openai/o200k_harmony
+    # Source: https://platform.openai.com/docs/models/gpt-oss-120b
+    max_sequence_length: 131072
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+
   ## Text Similarity Models
   # OpenAI similarity embedding models: https://beta.openai.com/docs/guides/embeddings
   # The number of parameters is guessed based on the number of parameters of the
@@ -3541,6 +3587,16 @@ model_deployments:
       args:
        together_model: togethercomputer/RedPajama-INCITE-7B-Instruct

+  ## Z.ai
+  - name: together/glm-4.5-air-fp8
+    model_name: zai-org/glm-4.5-air-fp8
+    tokenizer_name: zai-org/glm-4.5-air-fp8
+    max_sequence_length: 131072
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+      args:
+        parse_thinking: true
+
   - name: thudm/cogview2
     model_name: thudm/cogview2
     tokenizer_name: openai/clip-vit-large-patch14
@@ -3816,7 +3872,16 @@ model_deployments:
       class_name: "helm.clients.together_client.TogetherChatClient"
       args:
         parse_thinking: true
-
+
+  - name: together/qwen3-235b-a22b-instruct-2507-fp8
+    model_name: qwen/qwen3-235b-a22b-instruct-2507-fp8
+    tokenizer_name: qwen/qwen3-235b-a22b-instruct-2507-fp8
+    max_sequence_length: 262144
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+      args:
+        together_model: Qwen/Qwen3-235B-A22B-Instruct-2507-tput
+
   - name: huggingface/qwen2.5-7b-instruct-4bit
     model_name: qwen/qwen2.5-7b-instruct
     tokenizer_name: qwen/qwen2.5-7b-instruct
@@ -4590,3 +4655,12 @@ model_deployments:
       class_name: "helm.clients.huggingface_client.HuggingFaceClient"
       args:
         pretrained_model_name_or_path: nicholasKluge/TeenyTinyLlama-460m
+
+  - name: openrouter/mistral-medium-3.1
+    model_name: mistralai/mistral-medium-3.1
+    tokenizer_name: mistralai/Mistral-7B-v0.1
+    max_sequence_length: 128000
+    client_spec:
+      class_name: "helm.clients.openrouter_client.OpenRouterClient"
+      args:
+        model_name: mistralai/mistral-medium-3.1
helm/config/model_metadata.yaml CHANGED
@@ -1253,6 +1253,14 @@ models:
     release_date: 2025-06-17
     tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, AUDIO_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

+  - name: google/gemini-2.5-flash-lite
+    display_name: Gemini 2.5 Flash-Lite
+    description: Gemini 2.5 Flash-Lite ([blog](https://blog.google/products/gemini/gemini-2-5-model-family-expands/))
+    creator_organization_name: Google
+    access: limited
+    release_date: 2025-07-22
+    tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, AUDIO_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
   - name: google/gemini-2.5-flash-preview-04-17
     display_name: Gemini 2.5 Flash (04-17 preview)
     description: Gemini 2.5 Flash (04-17 preview) ([documentation](https://ai.google.dev/gemini-api/docs/models/gemini))
@@ -3052,6 +3060,30 @@ models:
     release_date: 2025-04-14
     tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

+  - name: openai/gpt-5-2025-08-07
+    display_name: GPT-5 (2025-08-07)
+    description: GPT-5 (2025-08-07) is a multimodal model trained for real-world coding tasks and long-running agentic tasks. ([blog](https://openai.com/index/introducing-gpt-5-for-developers/), [system card](https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf))
+    creator_organization_name: OpenAI
+    access: limited
+    release_date: 2025-08-07
+    tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+  - name: openai/gpt-5-mini-2025-08-07
+    display_name: GPT-5 mini (2025-08-07)
+    description: GPT-5 mini (2025-08-07) is a multimodal model trained for real-world coding tasks and long-running agentic tasks. ([blog](https://openai.com/index/introducing-gpt-5-for-developers/), [system card](https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf))
+    creator_organization_name: OpenAI
+    access: limited
+    release_date: 2025-08-07
+    tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+  - name: openai/gpt-5-nano-2025-08-07
+    display_name: GPT-5 nano (2025-08-07)
+    description: GPT-5 nano (2025-08-07) is a multimodal model trained for real-world coding tasks and long-running agentic tasks. ([blog](https://openai.com/index/introducing-gpt-5-for-developers/), [system card](https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf))
+    creator_organization_name: OpenAI
+    access: limited
+    release_date: 2025-08-07
+    tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
   - name: openai/whisper-1_gpt-4o-2024-11-20
     display_name: Whisper-1 + GPT-4o (2024-11-20)
     description: Transcribes the text with Whisper-1 and then uses GPT-4o to generate a response.
@@ -3273,6 +3305,23 @@ models:
     release_date: 2025-06-10
     tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

+  ## GPT-OSS
+  - name: openai/gpt-oss-20b
+    display_name: gpt-oss-20b
+    description: gpt-oss-20b is an open-weight language model that was trained using a mix of reinforcement learning and other techniques informed by OpenAI's internal models. It uses a mixture-of-experts architecture and activates 3.6B parameters per token. ([blog](https://openai.com/index/introducing-gpt-oss/))
+    creator_organization_name: OpenAI
+    access: open
+    release_date: 2025-08-05
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+  - name: openai/gpt-oss-120b
+    display_name: gpt-oss-120b
+    description: gpt-oss-120b is an open-weight language model that was trained using a mix of reinforcement learning and other techniques informed by OpenAI's internal models. It uses a mixture-of-experts architecture and activates 5.1B parameters per token. ([blog](https://openai.com/index/introducing-gpt-oss/))
+    creator_organization_name: OpenAI
+    access: open
+    release_date: 2025-08-05
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
   ## Codex Models
   # DEPRECATED: Codex models have been shut down on March 23 2023.

@@ -3549,6 +3598,14 @@ models:
     release_date: 2025-04-29
     tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

+  - name: qwen/qwen3-235b-a22b-instruct-2507-fp8
+    display_name: Qwen3 235B A22B Instruct 2507 FP8
+    description: Qwen3 235B A22B Instruct 2507 FP8 is an updated version of the non-thinking mode of Qwen3 235B A22B FP8.
+    creator_organization_name: Qwen
+    access: open
+    release_date: 2025-07-21 # https://x.com/Alibaba_Qwen/status/1947344511988076547
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
   - name: qwen/qwq-32b-preview
     display_name: QwQ (32B Preview)
     description: QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities. ([blog post](https://qwenlm.github.io/blog/qwq-32b-preview/)).
@@ -4315,6 +4372,17 @@ models:
     release_date: 2025-05-08
     tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

+  # Z.ai
+
+  - name: zai-org/glm-4.5-air-fp8
+    display_name: GLM-4.5-Air-FP8
+    description: GLM-4.5-Air-FP8 is a hybrid reasoning model designed to unify reasoning, coding, and agentic capabilities into a single model. It has 106 billion total parameters and 12 billion active parameters. The thinking mode is enabled by default. ([blog](https://z.ai/blog/glm-4.5))
+    creator_organization_name: Z.ai
+    access: open
+    num_parameters: 110000000000
+    release_date: 2025-07-28
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+

   # Granite - IBM
   # https://www.ibm.com/granite
@@ -4530,7 +4598,7 @@ models:

   - name: ibm/granite-3.3-8b-instruct
     display_name: IBM Granite 3.3 8B Instruct
-    description: IBM Granite 3.3 8B Instruct is a 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
+    description: IBM Granite 3.3 8B Instruct is an 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
     creator_organization_name: IBM
     access: open
     num_parameters: 8170000000
@@ -4539,7 +4607,7 @@ models:

   - name: ibm/granite-3.3-8b-instruct-with-guardian
     display_name: IBM Granite 3.3 8B Instruct (with guardian)
-    description: IBM Granite 3.3 8B Instruct is a 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct)) This model was run with an additional safety filter using [Granite Guardian 3.2](https://www.ibm.com/granite/docs/models/guardian/).
+    description: IBM Granite 3.3 8B Instruct is an 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. All prompts were first evaluated for risk by [IBM Granite Guardian 3.2 5B](https://www.ibm.com/granite/docs/models/guardian/) and prompts that were deemed risky (with a risk threshold of 0.8) received the response "I'm very sorry, but I can't assist with that.". ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
     creator_organization_name: IBM
     access: open
     num_parameters: 8170000000
helm/config/tokenizer_configs.yaml CHANGED
@@ -650,6 +650,12 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: "<|endoftext|>"

+  - name: openai/o200k_harmony
+    tokenizer_spec:
+      class_name: "helm.tokenizers.tiktoken_tokenizer.TiktokenTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|startoftext|>"
+
   - name: openai/clip-vit-large-patch14
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
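
The new o200k_harmony entry backs the GPT-OSS deployments above. A quick local sanity check (a sketch, assuming a tiktoken release recent enough to ship the o200k_harmony encoding, which extends o200k_base with the Harmony chat-format special tokens):

import tiktoken

# Load the encoding used by the gpt-oss models and round-trip a string.
enc = tiktoken.get_encoding("o200k_harmony")
tokens = enc.encode("Hello, world!")
assert enc.decode(tokens) == "Hello, world!"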
@@ -705,6 +711,12 @@ tokenizer_configs:
     end_of_text_token: "<|im_end|>"
     prefix_token: "<|im_start|>"

+  - name: qwen/qwen3-235b-a22b-instruct-2507-fp8
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|im_end|>"
+    prefix_token: ""
+
   - name: qwen/qwq-32b-preview
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -1048,7 +1060,6 @@ tokenizer_configs:
     end_of_text_token: ""

 # IBM Granite 3.3
-
   - name: ibm/granite-3.3-8b-instruct
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -1057,6 +1068,13 @@ tokenizer_configs:
     end_of_text_token: "<|end_of_text|>"
     prefix_token: "<|end_of_text|>"

+  # Z.ai GLM-4.5-AIR-FP8
+  - name: zai-org/glm-4.5-air-fp8
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: ""
+


 # DeepSeek-R1-Distill-Llama-3.1-8b
helm/proxy/example_queries.py CHANGED
@@ -21,7 +21,7 @@ example_queries = [
             """
             temperature: 0.5 # Medium amount of randomness
             stop_sequences: [.] # Stop when you hit a period
-            model: openai/gpt-3.5-turbo-0613
+            model: openai/gpt-4.1-nano-2025-04-14
             """
         ),
         environments="",
@@ -33,7 +33,7 @@ example_queries = [
             temperature: 0.5 # Medium amount of randomness
             stop_sequences: [\\n] # Stop when you hit a newline
             num_completions: 5 # Generate many samples
-            model: openai/gpt-3.5-turbo-0613
+            model: openai/gpt-4.1-nano-2025-04-14
             """
         ),
         environments="",
@@ -58,7 +58,7 @@ example_queries = [
             """
             temperature: 0 # Deterministic
             max_tokens: 50
-            model: openai/gpt-3.5-turbo-0613
+            model: openai/gpt-4.1-nano-2025-04-14
             """
         ),
         environments="",
@@ -76,7 +76,7 @@ example_queries = [
         environments=dedent(
             """
             occupation: [mathematician, lawyer, doctor]
-            model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
+            model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
             """
         ),
     ),
@@ -101,7 +101,7 @@ example_queries = [
         ),
         environments=dedent(
             """
-            model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
+            model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
             """
         ),
     ),
@@ -136,7 +136,7 @@ example_queries = [
         ),
         environments=dedent(
             """
-            model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
+            model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
             """
         ),
     ),
@@ -144,7 +144,7 @@ example_queries = [
         prompt="Write a Python function that takes two vectors a and b and returns their Euclidean distance.",
         settings=dedent(
             """
-            model: openai/gpt-3.5-turbo-0613
+            model: openai/gpt-4.1-nano-2025-04-14
             """
         ),
         environments="",
@@ -161,7 +161,7 @@ example_queries = [
         ),
         environments=dedent(
             """
-            model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
+            model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
             """
         ),
     ),
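
Every hunk in this file is the same mechanical substitution of retired gpt-3.5-turbo snapshots with GPT-4.1 snapshots. For reference, a standalone sketch of the shape of one entry (a plain dict stands in for the proxy's Query type so the snippet runs on its own):

from textwrap import dedent

# Illustrative reconstruction of one example query with the updated model name.
query = dict(
    prompt="Write a Python function that takes two vectors a and b and returns their Euclidean distance.",
    settings=dedent(
        """
        temperature: 0 # Deterministic
        max_tokens: 50
        model: openai/gpt-4.1-nano-2025-04-14
        """
    ),
    environments="",
)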
helm/proxy/server.py CHANGED
@@ -23,7 +23,7 @@ from helm.benchmark.model_deployment_registry import get_default_model_deploymen
 from helm.common.authentication import Authentication
 from helm.common.cache_backend_config import CacheBackendConfig, MongoCacheBackendConfig, SqliteCacheBackendConfig
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, setup_default_logging
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request
 from helm.common.perspective_api_request import PerspectiveAPIRequest
@@ -273,6 +273,7 @@ def main():
         default="",
     )
     args = parser.parse_args()
+    setup_default_logging()

     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.base_path)
helm/proxy/static/index.css CHANGED
@@ -35,6 +35,10 @@
   font-style: italic;
 }

+.thinking {
+  font-style: italic;
+}
+
 .token:hover {
   background-color: lightgreen;
 }
helm/proxy/static/index.js CHANGED
@@ -282,7 +282,13 @@ $(function () {
   requestResult.completions.forEach((completion) => {
     const $contents = $("<span>", {
       title: `logprob: ${completion.logprob}`,
-    }).append(renderTokens(completion.tokens));
+    });
+    if (completion.thinking) {
+      const $thinking = $("<span>", { class: "thinking" }).append(completion.thinking.text);
+      $contents.append($thinking);
+    }
+    const $resultText = completion.tokens.length > 0 ? renderTokens(completion.tokens) : $("<div>").append(completion.text);
+    $contents.append($resultText);
     const $metadata = $("<span>", { class: "metadata" });
     $metadata.append(
       $("<span>", { title: "Log probability" }).append(
helm/benchmark/metrics/aci_bench_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.aci_bench_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class ACIBenchMetric(LLMJuryMetric):
-    """Score metrics for ACIBench."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="aci_bench_accuracy",
-            scenario_name="aci_bench",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/chw_care_plan_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.chw_care_plan_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class CHWCarePlanMetric(LLMJuryMetric):
-    """Score metrics for CHWCarePlan."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="chw_care_plan_accuracy",
-            scenario_name="chw_care_plan",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/dischargeme_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.dischargeme_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class DischargeMeMetric(LLMJuryMetric):
-    """Score metrics for DischargeMe."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="dischargeme_accuracy",
-            scenario_name="dischargeme",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/med_dialog_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.med_dialog_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MedDialogMetric(LLMJuryMetric):
-    """Score metrics for MedDialog."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="med_dialog_accuracy",
-            scenario_name="med_dialog",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/medalign_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.medalign_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MedalignMetric(LLMJuryMetric):
-    """Score metrics for Medalign."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="medalign_accuracy",
-            scenario_name="medalign",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/medi_qa_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.medi_qa_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MediQAMetric(LLMJuryMetric):
-    """Score metrics for MediQA."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="medi_qa_accuracy",
-            scenario_name="medi_qa",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/medication_qa_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.medication_qa_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MedicationQAMetric(LLMJuryMetric):
-    """Score metrics for MedicationQA."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="medication_qa_accuracy",
-            scenario_name="medication_qa",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/mental_health_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.mental_health_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MentalHealthMetric(LLMJuryMetric):
-    """Score metrics for MentalHealth."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="mental_health_accuracy",
-            scenario_name="mental_health",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/mimic_bhc_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.mimic_bhc_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MIMICBHCMetric(LLMJuryMetric):
-    """Score metrics for MIMICBHC."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="mimic_bhc_accuracy",
-            scenario_name="mimic_bhc",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/mimic_rrs_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.mimic_rrs_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MIMICRRSMetric(LLMJuryMetric):
-    """Score metrics for MIMICRRS."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="mimic_rrs_accuracy",
-            scenario_name="mimic_rrs",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
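
All of the removed per-scenario metric modules follow the identical pattern shown above: a thin subclass of LLMJuryMetric that only fills in four constructor arguments. A hedged sketch of the direct construction this consolidation enables, using the 0.5.7 names from the deleted file (the actual 0.5.8 call sites, and whether these imports survive unchanged there, are not shown in this diff):

from helm.benchmark.annotation.aci_bench_annotator import ANNOTATOR_MODELS
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric

# Instead of a one-off ACIBenchMetric subclass, the same metric can be
# instantiated directly with the scenario-specific values from the deleted file.
aci_bench_metric = LLMJuryMetric(
    metric_name="aci_bench_accuracy",
    scenario_name="aci_bench",
    annotator_models=ANNOTATOR_MODELS,
    default_score=1.0,
)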