crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff shows the content changes between two package versions that were publicly released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in that registry.
Note: this version of crfm-helm was flagged as a potentially problematic release.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +2 -2
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +16 -26
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +43 -13
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +7 -1
- helm/benchmark/presentation/summarize.py +84 -61
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +84 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +114 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +81 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +102 -55
- helm/clients/openai_responses_client.py +176 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +31 -6
- helm/clients/vertexai_client.py +17 -9
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +0 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +104 -12
- helm/common/local_context.py +140 -0
- helm/common/object_spec.py +23 -8
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +995 -45
- helm/config/model_metadata.yaml +780 -59
- helm/config/tokenizer_configs.yaml +224 -3
- helm/proxy/cli.py +4 -2
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -793
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/clients/openai_client.py
CHANGED
@@ -1,13 +1,16 @@
 # mypy: check_untyped_defs = False
 from dataclasses import replace
+import re
 from typing import Any, Dict, List, Optional, cast, Union, Callable
 
+from openai import OpenAIError
+
 from helm.benchmark.model_metadata_registry import is_vlm
 from helm.common import multimodal_request_utils
 from helm.common.cache import CacheConfig
-from helm.common.media_object import TEXT_TYPE, MultimediaObject
-from helm.common.request import ErrorFlags, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
-from helm.common.hierarchical_logger import hlog
+from helm.common.media_object import TEXT_TYPE, MultimediaObject, MediaObject
+from helm.common.request import ErrorFlags, Thinking, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
+from helm.common.hierarchical_logger import hlog, hwarn
 from helm.common.object_spec import get_class_by_name
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.tokenization_request import (
@@ -24,8 +27,13 @@ except ModuleNotFoundError as e:
     handle_module_not_found_error(e, ["openai"])
 
 
-class OpenAIClient(CachingClient):
-    END_OF_TEXT: str = "<|endoftext|>"
+class OpenAIClientUtils:
+    """Methods used by both the chat completions client and the responses API client"""
+
+    @classmethod
+    def is_reasoning_model(cls, model_engine: str) -> bool:
+        # All OpenAI reasoning models start "o[somenumber]", so we regexp for that to future proof things
+        return bool(re.match(r"^o\d+", model_engine))
 
     # Error OpenAI throws when the image in the prompt violates their content policy
     INAPPROPRIATE_IMAGE_ERROR: str = "Your input image may contain content that is not allowed by our safety system"
@@ -49,6 +57,56 @@ class OpenAIClient(CachingClient):
         "See https://labs.openai.com/policies/content-policy for more information."
     )
 
+    @classmethod
+    def handle_openai_error(cls, e: OpenAIError, request: Request):
+        if cls.INAPPROPRIATE_IMAGE_ERROR in str(e) or cls.INAPPROPRIATE_PROMPT_ERROR in str(e):
+            hwarn(f"Failed safety check: {str(request)}")
+            empty_completion = GeneratedOutput(
+                text="",
+                logprob=0,
+                tokens=[],
+                finish_reason={"reason": cls.CONTENT_POLICY_VIOLATED_FINISH_REASON},
+            )
+            return RequestResult(
+                success=True,
+                cached=False,
+                request_time=0,
+                completions=[empty_completion] * request.num_completions,
+                embedding=[],
+            )
+        elif cls.OPENAI_SERVER_ERROR in str(e):
+            # Handle these errors by returning an empty completion to unblock
+            hwarn(f"OpenAI server error for request: {str(request)}")
+            empty_completion = GeneratedOutput(
+                text="",
+                logprob=0,
+                tokens=[],
+                finish_reason={"reason": cls.OPENAI_SERVER_ERROR},
+            )
+            return RequestResult(
+                success=True,
+                cached=False,
+                request_time=0,
+                completions=[empty_completion] * request.num_completions,
+                embedding=[],
+            )
+        elif cls.INAPPROPRIATE_PROMPT_AZURE_ERROR in str(e) or cls.INAPPROPRIATE_PROMPT_MICROSOFT_ERROR in str(e):
+            return RequestResult(
+                success=False,
+                cached=False,
+                error="Content blocked by Azure's content management filter",
+                completions=[],
+                embedding=[],
+                error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
+            )
+
+        error: str = f"OpenAI error: {e}"
+        return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
+
+
+class OpenAIClient(CachingClient):
+    END_OF_TEXT: str = "<|endoftext|>"
+
     def __init__(
         self,
         tokenizer: Tokenizer,
@@ -60,11 +118,12 @@ class OpenAIClient(CachingClient):
         reasoning_effort: Optional[str] = None,
         openai_model_name: Optional[str] = None,
         output_processor: Optional[str] = None,
+        **kwargs,
     ):
         super().__init__(cache_config=cache_config)
         self.tokenizer = tokenizer
         self.tokenizer_name = tokenizer_name
-        self.client = OpenAI(api_key=api_key, organization=org_id, base_url=base_url)
+        self.client = OpenAI(api_key=api_key, organization=org_id, base_url=base_url, **kwargs)
         self.reasoning_effort = reasoning_effort
         self.openai_model_name = openai_model_name
         self.output_processor: Optional[Callable[[str], str]] = (
@@ -118,7 +177,7 @@ class OpenAIClient(CachingClient):
             embedding=embedding,
         )
 
-    def _make_chat_request(self, request: Request) -> RequestResult:
+    def _make_chat_raw_request(self, request: Request) -> Dict[str, Any]:
         messages: Optional[List[Dict[str, Union[str, Any]]]] = request.messages
         if (
             (request.prompt and request.messages)
@@ -137,7 +196,7 @@ class OpenAIClient(CachingClient):
             if request.messages[-1]["role"] != "user":
                 raise ValueError("Last message must have role 'user'")
             if request.prompt != "":
-                hlog("Since message is set, prompt will be ignored")
+                hwarn("Since message is set, prompt will be ignored")
         else:
             # Convert prompt into a single message
             # For now, put the whole prompt in a single user message, and expect the response
@@ -223,7 +282,7 @@ class OpenAIClient(CachingClient):
         # Refer to the "Reasoning models" documentation further discussion of o1 model limitations:
         # https://platform.openai.com/docs/guides/reasoning
         model_engine: str = request.model_engine
-        if
+        if OpenAIClientUtils.is_reasoning_model(model_engine):
             # Avoid error:
             # "Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead."  # noqa: E501
             # Note that openai>=1.45 is needed for this
@@ -241,8 +300,13 @@ class OpenAIClient(CachingClient):
            # 'code': 'unsupported_parameter'}}"
             raw_request.pop("temperature", None)
 
+            # The following parameters also happen to be unsupported by the o-series (code unsupported_parameter)
+            raw_request.pop("top_p", None)
+            raw_request.pop("frequency_penalty", None)
+            raw_request.pop("presence_penalty", None)
+
             if self.reasoning_effort:
-                raw_request["reasoning_effort"] =
+                raw_request["reasoning_effort"] = self.reasoning_effort
         elif is_vlm(request.model):
             # Avoid error:
             # "Invalid type for 'stop': expected an unsupported value, but got null instead."
@@ -258,6 +322,10 @@ class OpenAIClient(CachingClient):
             # OpenAI error: Error code: 400 - {'error': {'message': "[{'type': 'string_type', 'loc': ('body', 'stop', 'str'), 'msg': 'Input should be a valid string', 'input': None}, {'type': 'list_type', 'loc': ('body', 'stop', 'list[str]'), 'msg': 'Input should be a valid list', 'input': None}, {'type': 'list_type', 'loc': ('body', 'stop', 'list[list[int]]'), 'msg': 'Input should be a valid list', 'input': None}]", 'type': 'invalid_request_error', 'param': None, 'code': None}}  # noqa: 3501
             if raw_request["stop"] is None:
                 raw_request.pop("stop")
+        return raw_request
+
+    def _make_chat_request(self, request: Request) -> RequestResult:
+        raw_request = self._make_chat_raw_request(request)
 
         def do_it() -> Dict[str, Any]:
             return self.client.chat.completions.create(**raw_request).model_dump(mode="json")
@@ -266,49 +334,7 @@ class OpenAIClient(CachingClient):
             cache_key = self._get_cache_key(raw_request, request)
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except openai.OpenAIError as e:
-            if self.INAPPROPRIATE_IMAGE_ERROR in str(e) or self.INAPPROPRIATE_PROMPT_ERROR in str(e):
-                hlog(f"Failed safety check: {str(request)}")
-                empty_completion = GeneratedOutput(
-                    text="",
-                    logprob=0,
-                    tokens=[],
-                    finish_reason={"reason": self.CONTENT_POLICY_VIOLATED_FINISH_REASON},
-                )
-                return RequestResult(
-                    success=True,
-                    cached=False,
-                    request_time=0,
-                    completions=[empty_completion] * request.num_completions,
-                    embedding=[],
-                )
-            elif self.OPENAI_SERVER_ERROR in str(e):
-                # Handle these errors by returning an empty completion to unblock
-                hlog(f"OpenAI server error for request: {str(request)}")
-                empty_completion = GeneratedOutput(
-                    text="",
-                    logprob=0,
-                    tokens=[],
-                    finish_reason={"reason": self.OPENAI_SERVER_ERROR},
-                )
-                return RequestResult(
-                    success=True,
-                    cached=False,
-                    request_time=0,
-                    completions=[empty_completion] * request.num_completions,
-                    embedding=[],
-                )
-            elif self.INAPPROPRIATE_PROMPT_AZURE_ERROR in str(e) or self.INAPPROPRIATE_PROMPT_MICROSOFT_ERROR in str(e):
-                return RequestResult(
-                    success=False,
-                    cached=False,
-                    error="Content blocked by Azure's content management filter",
-                    completions=[],
-                    embedding=[],
-                    error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
-                )
-
-            error: str = f"OpenAI error: {e}"
-            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
+            return OpenAIClientUtils.handle_openai_error(e, request)
 
         completions: List[GeneratedOutput] = []
         for raw_completion in response["choices"]:
@@ -338,11 +364,20 @@ class OpenAIClient(CachingClient):
             tokens: List[Token] = [
                 Token(text=cast(str, raw_token), logprob=0) for raw_token in tokenization_result.raw_tokens
             ]
+            # vLLM has a optional `reasoning_content` field in the message
+            # that is not in the standard OpenAI API.
+            # This field is also used by some model providers such as Grok.
+            thinking = (
+                Thinking(text=raw_completion["message"]["reasoning_content"])
+                if "reasoning_content" in raw_completion["message"]
+                else None
+            )
             completion = GeneratedOutput(
                 text=text,
                 logprob=0,  # OpenAI does not provide logprobs
                 tokens=tokens,
                 finish_reason={"reason": raw_completion["finish_reason"]},
+                thinking=thinking,
             )
             completions.append(truncate_sequence(completion, request))  # Truncate the text by stop sequences
 
@@ -459,7 +494,7 @@ class OpenAIClient(CachingClient):
     def make_request(self, request: Request) -> RequestResult:
         if request.embedding:
             return self._make_embedding_request(request)
-        elif "whisper" in request.model_engine:
+        elif "whisper" in request.model_engine or "transcribe" in request.model_engine:
             return self._make_transcription_request(request)
         else:
             return self._make_chat_request(request)
@@ -536,6 +571,18 @@ class OpenAITranscriptionThenCompletionClient(Client):
         # Now make the request to the completion model with just a text-only prompt and no audio
         # Use the same decoding parameters as the original request
         # Ensure to set multimodal_prompt to None so the request is treated as text-only.
-        return self._openai_client.make_request(
+        request_result: RequestResult = self._openai_client.make_request(
             replace(request, prompt=text_prompt, model=f"openai/{completion_model}", multimodal_prompt=None)
         )
+
+        # Also include the generated transcript to the request result
+        completions_with_transcript: List[GeneratedOutput] = [
+            replace(
+                completion,
+                multimodal_content=MultimediaObject(
+                    media_objects=[MediaObject(text=text_prompt, content_type="text/plain")]
+                ),
+            )
+            for completion in request_result.completions
+        ]
+        return replace(request_result, completions=completions_with_transcript)
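For orientation, the new OpenAIClientUtils.is_reasoning_model helper classifies model engines purely by name. A minimal standalone sketch of its behavior, using the same regular expression as the diff above (the model names are illustrative examples, not a list taken from the package):

import re

def is_reasoning_model(model_engine: str) -> bool:
    # Mirrors the helper added above: engine names starting with "o" followed by a
    # digit (o1, o3-mini, ...) are treated as reasoning models.
    return bool(re.match(r"^o\d+", model_engine))

assert is_reasoning_model("o1-preview")
assert is_reasoning_model("o3-mini")
assert not is_reasoning_model("gpt-4o")  # the digit does not directly follow a leading "o"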
helm/clients/openai_responses_client.py
ADDED
@@ -0,0 +1,176 @@
+# mypy: check_untyped_defs = False
+import dataclasses
+from typing import Any, Dict, List, Optional, Union
+
+
+from helm.clients.openai_client import OpenAIClientUtils
+from helm.common.cache import CacheConfig
+from helm.common.media_object import TEXT_TYPE
+from helm.common.request import (
+    Thinking,
+    wrap_request_time,
+    Request,
+    RequestResult,
+    GeneratedOutput,
+)
+from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.clients.client import (
+    CachingClient,
+    truncate_and_tokenize_response_text,
+    generate_uid_for_multimodal_prompt,
+)
+from helm.tokenizers.tokenizer import Tokenizer
+
+try:
+    import openai
+    from openai import OpenAI
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["openai"])
+
+
+class OpenAIResponseClient(CachingClient):
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        api_key: Optional[str] = None,
+        org_id: Optional[str] = None,
+        base_url: Optional[str] = None,
+        reasoning_effort: Optional[str] = None,
+        openai_model_name: Optional[str] = None,
+    ):
+        super().__init__(cache_config=cache_config)
+        self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer_name
+        self.client = OpenAI(
+            api_key=api_key,
+            organization=org_id,
+            base_url=base_url,
+        )
+        self.reasoning_effort = reasoning_effort
+        self.openai_model_name = openai_model_name
+
+    def _get_cache_key(self, raw_request: Dict, request: Request):
+        cache_key = CachingClient.make_cache_key(raw_request, request)
+        if request.multimodal_prompt:
+            prompt_key: str = generate_uid_for_multimodal_prompt(request.multimodal_prompt)
+            cache_key = {**cache_key, "multimodal_prompt": prompt_key}
+        return cache_key
+
+    def _make_raw_request(self, request: Request) -> dict[str, Any]:
+        input: Union[str, List[Dict[str, Any]]]
+        if request.multimodal_prompt is not None:
+            content = []
+            request.validate()
+            for media_object in request.multimodal_prompt.media_objects:
+                if media_object.is_type("image") and media_object.location:
+                    from helm.common.images_utils import encode_base64
+
+                    base64_image: str = encode_base64(media_object.location)
+                    content.append(
+                        {
+                            "type": "input_image",
+                            "image_url": f"data:image/jpeg;base64,{base64_image}",
+                        }
+                    )
+                elif media_object.is_type(TEXT_TYPE):
+                    assert media_object.text is not None
+                    content.append({"type": "input_text", "text": media_object.text})
+                else:
+                    raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
+            input = [{"role": "user", "content": content}]
+        else:
+            input = request.prompt
+
+        raw_request: Dict[str, Any] = {
+            "model": self._get_model_for_request(request),
+            "input": input,
+            "top_p": request.top_p,
+            # API errors if max_output_tokens is less than 16
+            # (Error you get: "Invalid 'max_output_tokens': integer below minimum value.
+            # Expected a value >= 16, but got 5 instead.")
+            "max_output_tokens": max(16, request.max_tokens),
+            "temperature": request.temperature,
+            # Don't store responses for later retrieval
+            "store": False,
+        }
+        if self.reasoning_effort:
+            raw_request["reasoning"] = {"effort": self.reasoning_effort}
+        # If o-series model, get reasoning summaries
+        # Plus other changes
+        model_engine: str = request.model_engine
+        if OpenAIClientUtils.is_reasoning_model(model_engine):
+            raw_request["reasoning"]["summary"] = "detailed"
+            # Avoid error:
+            # "Error code: 400 - {'error': {'message': "Unsupported parameter: 'temperature' is
+            # not supported with this model.", 'type': 'invalid_request_error', 'param': 'temperature',
+            # 'code': 'unsupported_parameter'}}"
+            raw_request.pop("temperature", None)
+
+            # The following parameters also happen to be unsupported by the o-series (code unsupported_parameter)
+            raw_request.pop("top_p", None)
+
+        return raw_request
+
+    def _get_model_for_request(self, request: Request) -> str:
+        return self.openai_model_name or request.model_engine
+
+    def make_request(self, request: Request) -> RequestResult:
+        # Content can either be text or a list of multimodal content made up of text and images:
+        # https://platform.openai.com/docs/api-reference/responses/create
+        raw_request = self._make_raw_request(request)
+
+        # The responses API does not support a "num_completions" parameter,
+        # so we need to handle it ourselves with a simple loop
+        completions: list[GeneratedOutput] = []
+        for _ in range(request.num_completions):
+
+            def do_it() -> Dict[str, Any]:
+                raw_response = self.client.responses.create(**raw_request).model_dump(mode="json")
+                assert not raw_response.get("error", None), f"Error in response: {raw_response}"
+                return raw_response
+
+            try:
+                cache_key = self._get_cache_key(raw_request, request)
+                response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+            except openai.OpenAIError as e:
+                return OpenAIClientUtils.handle_openai_error(e, request)
+
+            # We can only return one completition really,
+            # but we get an array of messages back, so we need to contact them
+            reasoning_output = ""
+            text_output = ""
+
+            if request.echo_prompt:
+                text_output += request.prompt
+            for output in response["output"]:
+                output_type = output[
+                    "type"
+                ]  # one of "message" or "reasoning" from API observation, but can also include tool calls
+
+                if output_type == "reasoning":
+                    reasoning_output += "\n".join([raw_output["text"] for raw_output in output["summary"]])
+                elif output_type == "message":
+                    text_output += "\n".join([raw_output["text"] for raw_output in output["content"]])
+                # (Other output types are ignored)
+
+            completion = truncate_and_tokenize_response_text(
+                text_output,
+                request,
+                self.tokenizer,
+                self.tokenizer_name,
+                original_finish_reason="",
+            )
+            if reasoning_output:
+                completion = dataclasses.replace(completion, thinking=Thinking(text=reasoning_output))
+            completions.append(completion)
+
+        return RequestResult(
+            success=True,
+            cached=cached,
+            request_time=response["request_time"],
+            request_datetime=response.get("request_datetime"),
+            completions=completions,
+            embedding=[],
+        )
helm/clients/palmyra_client.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Any, Dict, List
 
 from helm.clients.openai_client import OpenAIClient
 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hwarn
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token, ErrorFlags
 from helm.common.tokenization_request import (
     TokenizationRequest,
@@ -103,10 +103,7 @@ class PalmyraClient(CachingClient):
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
         if _is_content_moderation_failure(response):
-            hlog(
-                f"WARNING: Returning empty request for {request.model_deployment} "
-                "due to content moderation filter"
-            )
+            hwarn(f"Returning empty request for {request.model_deployment} " "due to content moderation filter")
             return RequestResult(
                 success=False,
                 cached=False,
helm/clients/reka_client.py
CHANGED
@@ -6,7 +6,7 @@ from helm.proxy.retry import NonRetriableException
 from helm.common.cache import CacheConfig
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hwarn
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.tokenizers.tokenizer import Tokenizer
 from helm.clients.client import CachingClient, truncate_and_tokenize_response_text
@@ -121,7 +121,7 @@ class RekaClient(CachingClient):
             if messages[-1]["role"] != "user":
                 raise ValueError("Last message must have role 'user'")
             if request.prompt != "":
-                hlog("Since message is set, prompt will be ignored")
+                hwarn("Since message is set, prompt will be ignored")
             reka_chat_history = self._convert_messages_to_reka_chat_history(messages)
         else:
             current_chat_history: Dict[str, Any] = {
helm/clients/test_huggingface_client.py
CHANGED
@@ -9,7 +9,7 @@ from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
 class TestHuggingFaceClient:
     def test_gpt2(self):
         tokenizer = HuggingFaceTokenizer(
-            BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai/gpt2"
+            BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai-community/gpt2"
         )
         client = HuggingFaceClient(
             cache_config=BlackHoleCacheConfig(),
@@ -36,7 +36,7 @@ class TestHuggingFaceClient:
     @pytest.mark.skip(reason="GPT-J 6B is 22 GB and extremely slow without a GPU.")
     def test_gptj_6b(self):
         tokenizer = HuggingFaceTokenizer(
-            BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai/gpt2"
+            BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai-community/gpt2"
         )
         client = HuggingFaceClient(
             cache_config=BlackHoleCacheConfig(),
@@ -57,7 +57,7 @@ class TestHuggingFaceClient:
 
     def test_logprob(self):
         tokenizer = HuggingFaceTokenizer(
-            BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai/gpt2"
+            BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai-community/gpt2"
         )
         client = HuggingFaceClient(
             cache_config=BlackHoleCacheConfig(),
helm/clients/together_client.py
CHANGED
@@ -1,7 +1,8 @@
 from copy import deepcopy
 from itertools import zip_longest
+import re
 import threading
-from typing import Callable, List, Dict, Any, Mapping, Optional, TypedDict, Union
+from typing import Callable, List, Dict, Any, Mapping, Optional, Tuple, TypedDict, Union
 from typing_extensions import NotRequired
 
 import requests
@@ -11,7 +12,7 @@ from helm.common.cache import CacheConfig
 from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
 from helm.common.object_spec import get_class_by_name
 from helm.common.optional_dependencies import handle_module_not_found_error
-from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
+from helm.common.request import Thinking, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
 from helm.clients.client import CachingClient, truncate_sequence, cleanup_str
 
 try:
@@ -24,8 +25,6 @@ except ModuleNotFoundError as e:
 class _RewriteRequestTags:
     """Tags that indicate that the request for the model must be rewritten before sending to Together."""
 
-    # TODO: Convert to StrEnum after upgrading to Python 3.11
-
     ADD_EOS_TOKEN_AS_STOP_SEQUENCE = "ADD_EOS_TOKEN_AS_STOP_SEQUENCE"
     """Indicates that the EOS token should be added as an extra stop sequence.
 
@@ -100,6 +99,19 @@ class JobNotFinishedError(TogetherClientError):
     pass
 
 
+def _parse_thinking(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
 class TogetherClient(CachingClient):
     """
     Client for the models where we evaluate offline. Since the queries are handled offline, the `TogetherClient` just
@@ -328,12 +340,14 @@ class TogetherChatClient(CachingClient):
         together_model: Optional[str] = None,
         disable_logprobs: Optional[bool] = None,
         output_processor: Optional[str] = None,
+        parse_thinking: Optional[bool] = None,
     ):
         super().__init__(cache_config=cache_config)
         self._client = Together(api_key=api_key)
         self._together_model = together_model
         self._disable_logprobs = bool(disable_logprobs)
         # self.output_processor is actually a function, not a class
+        self._parse_thinking = bool(parse_thinking)
 
         self.output_processor: Optional[Callable[[str], str]] = (
             get_class_by_name(output_processor) if output_processor else None
@@ -424,11 +438,21 @@ class TogetherChatClient(CachingClient):
                 if token_text is None:
                     break
                 tokens.append(Token(text=token_text, logprob=token_logprob or 0.0))
+            logprob = sum([token.logprob for token in tokens]) if tokens else 0.0
             assert choice.message.role == "assistant"
             output_text = choice.message.content
             if self.output_processor:
                 output_text = self.output_processor(output_text)
-
+
+            if self._parse_thinking:
+                thinking_text, output_text = _parse_thinking(output_text)
+                generated_outputs.append(
+                    GeneratedOutput(
+                        text=output_text, logprob=logprob, tokens=tokens, thinking=Thinking(text=thinking_text)
+                    )
+                )
+            else:
+                generated_outputs.append(GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens))
         return RequestResult(
             success=True,
             cached=cached,
@@ -521,8 +545,9 @@ class TogetherCompletionClient(CachingClient):
                 if token_text is None:
                     break
                 tokens.append(Token(text=token_text, logprob=token_logprob or 0.0))
+            logprob = sum([token.logprob for token in tokens]) if tokens else 0.0
             assert choice.text
-            generated_outputs.append(GeneratedOutput(text=choice.text, logprob=
+            generated_outputs.append(GeneratedOutput(text=choice.text, logprob=logprob, tokens=tokens))
         return RequestResult(
             success=True,
             cached=cached,
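The new _parse_thinking helper splits chat output that wraps its chain of thought in <think> tags into a (thinking, answer) pair. A minimal standalone sketch of its behavior, using the same regexes as the diff above on illustrative strings:

import re
from typing import Tuple

def _parse_thinking(input: str) -> Tuple[str, str]:
    """Return a tuple of thinking text and output text (same regexes as the helper above)."""
    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
    if match:
        return (match.group(1), match.group(2))
    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
    if match:
        return (match.group(1), "")
    return (input, "")

# Closed tag: thinking and answer are separated.
assert _parse_thinking("<think>\nstep by step\n</think>\n\nfinal answer") == ("step by step", "final answer")
# Unclosed tag (e.g. truncated generation): everything after <think> counts as thinking.
assert _parse_thinking("<think>\nstill reasoning") == ("still reasoning", "")
# No tag: the whole string is treated as the answer.
assert _parse_thinking("plain answer") == ("plain answer", "")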
helm/clients/vertexai_client.py
CHANGED
@@ -1,7 +1,7 @@
 import requests
 from abc import ABC, abstractmethod
 from threading import Lock
-from typing import Any, Dict, Mapping, Optional, List, Union
+from typing import Any, Dict, Mapping, Optional, List, Union, cast
 
 from helm.common.cache import CacheConfig
 from helm.common.multimodal_request_utils import get_contents_as_bytes
@@ -107,7 +107,7 @@ class VertexAITextClient(VertexAIClient):
 
     def make_request(self, request: Request) -> RequestResult:
         """Make a request"""
-        parameters = {
+        parameters: Dict[str, Any] = {
             "temperature": request.temperature,
             "max_output_tokens": request.max_tokens,
             "top_k": request.top_k_per_token,
@@ -207,21 +207,23 @@ class VertexAIChatClient(VertexAIClient):
 
     def make_request(self, request: Request) -> RequestResult:
         """Make a request"""
-
+        # mypy is unhappy without this cast
+        contents: Union[List[Union[str, Image, Part]], List[Content]] = cast(
+            List[Union[str, Image, Part]], [request.prompt]
+        )
 
         # For the multimodal case, build up the content with the media objects of `request.multimodal_prompt`
         if request.multimodal_prompt is not None:
             return self._make_multimodal_request(request)
 
         if request.messages is not None:
-            contents = []
             role_mapping = {"user": "user", "assistant": "model"}
-
-
-
-
+            contents = [
+                Content(role=role_mapping.get(msg["role"], "user"), parts=[Part.from_text(msg["content"])])
+                for msg in request.messages
+            ]
 
-        parameters = {
+        parameters: Dict[str, Any] = {
             "temperature": request.temperature,
             "max_output_tokens": request.max_tokens,
             "top_k": request.top_k_per_token,
@@ -360,6 +362,12 @@ class VertexAIChatClient(VertexAIClient):
         for media_object in request.multimodal_prompt.media_objects:
             if media_object.is_type("image") and media_object.location:
                 contents.append(Part.from_image(Image.load_from_file(media_object.location)))
+            elif media_object.is_type("video") and media_object.location:
+                # Following this example
+                # https://cloud.google.com/vertex-ai/generative-ai/docs/samples/googlegenaisdk-textgen-with-local-video
+                with open(media_object.location, "rb") as fp:
+                    video_content = fp.read()
+                contents.append(Part.from_data(data=video_content, mime_type=media_object.content_type))
             elif media_object.is_type("audio") and media_object.location:
                 contents.append(
                     Part.from_data(get_contents_as_bytes(media_object.location), mime_type=media_object.content_type)