crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (206)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  13. helm/benchmark/annotation/model_as_judge.py +12 -16
  14. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  15. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  16. helm/benchmark/executor.py +11 -12
  17. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  18. helm/benchmark/metrics/bias_word_lists.py +1 -1
  19. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  20. helm/benchmark/metrics/classification_metrics.py +3 -3
  21. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  22. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  23. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  24. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  25. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  26. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  27. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  28. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  29. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  30. helm/benchmark/metrics/medalign_metrics.py +9 -29
  31. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  32. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  33. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  34. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  35. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  36. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  37. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  38. helm/benchmark/metrics/metric_service.py +11 -11
  39. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  40. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  41. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  42. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  43. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  44. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  45. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  46. helm/benchmark/metrics/summac/model_summac.py +1 -2
  47. helm/benchmark/metrics/summarization_metrics.py +2 -1
  48. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  49. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  50. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  51. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  52. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  53. helm/benchmark/model_deployment_registry.py +6 -8
  54. helm/benchmark/presentation/contamination.py +3 -3
  55. helm/benchmark/presentation/create_plots.py +33 -12
  56. helm/benchmark/presentation/run_display.py +13 -0
  57. helm/benchmark/presentation/schema.py +2 -1
  58. helm/benchmark/presentation/summarize.py +76 -59
  59. helm/benchmark/reeval_run.py +3 -4
  60. helm/benchmark/reeval_runner.py +3 -3
  61. helm/benchmark/run.py +78 -73
  62. helm/benchmark/run_expander.py +12 -1
  63. helm/benchmark/run_spec_factory.py +7 -6
  64. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  65. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  66. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  67. helm/benchmark/run_specs/long_context_run_specs.py +67 -15
  68. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  69. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  70. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  71. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  72. helm/benchmark/runner.py +5 -5
  73. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  74. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  75. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  76. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  77. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  78. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  79. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  80. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  81. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  82. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  83. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  84. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  85. helm/benchmark/scenarios/clear_scenario.py +11 -7
  86. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  87. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  88. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  89. helm/benchmark/scenarios/grammar.py +2 -2
  90. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  91. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  92. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  93. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  94. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  95. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  96. helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
  97. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  98. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  99. helm/benchmark/scenarios/medec_scenario.py +6 -1
  100. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  101. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  102. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  103. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  104. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  105. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  106. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  107. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  108. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  109. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  110. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  111. helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
  112. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  113. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  114. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  115. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  116. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  117. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  118. helm/benchmark/scenarios/numeracy_scenario.py +2 -1
  119. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  120. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  121. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  122. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  123. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  124. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  125. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  126. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  127. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  128. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  129. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  130. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  131. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  132. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  133. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  134. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  135. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  136. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  137. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  138. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  139. helm/benchmark/server.py +2 -1
  140. helm/benchmark/static/schema_audio.yaml +60 -49
  141. helm/benchmark/static/schema_enterprise.yaml +21 -0
  142. helm/benchmark/static/schema_long_context.yaml +63 -20
  143. helm/benchmark/static/schema_medhelm.yaml +272 -213
  144. helm/benchmark/static/schema_melt.yaml +1257 -0
  145. helm/benchmark/static/schema_slphelm.yaml +162 -0
  146. helm/benchmark/static/schema_vhelm.yaml +26 -26
  147. helm/benchmark/static/schema_video.yaml +219 -0
  148. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  149. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  150. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  151. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  152. helm/benchmark/static_build/index.html +4 -4
  153. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  154. helm/benchmark/window_services/test_utils.py +3 -4
  155. helm/benchmark/window_services/tokenizer_service.py +7 -8
  156. helm/clients/anthropic_client.py +69 -29
  157. helm/clients/audio_language/diva_llama_client.py +4 -2
  158. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  159. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  160. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  161. helm/clients/audio_language/test.py +62 -0
  162. helm/clients/bedrock_client.py +3 -1
  163. helm/clients/client.py +7 -7
  164. helm/clients/grok_client.py +36 -0
  165. helm/clients/huggingface_client.py +42 -3
  166. helm/clients/huggingface_pipeline_client.py +138 -0
  167. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  168. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  169. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  170. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  171. helm/clients/openai_client.py +100 -54
  172. helm/clients/openai_responses_client.py +174 -0
  173. helm/clients/palmyra_client.py +2 -5
  174. helm/clients/reka_client.py +2 -2
  175. helm/clients/together_client.py +31 -4
  176. helm/clients/vertexai_client.py +6 -0
  177. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  178. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  179. helm/clients/vision_language/idefics_client.py +6 -2
  180. helm/clients/vision_language/paligemma_client.py +2 -2
  181. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  182. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  183. helm/clients/writer_client.py +102 -0
  184. helm/common/context.py +80 -0
  185. helm/common/credentials_utils.py +5 -5
  186. helm/common/general.py +9 -2
  187. helm/common/hierarchical_logger.py +46 -3
  188. helm/common/local_context.py +140 -0
  189. helm/common/remote_context.py +61 -0
  190. helm/common/request.py +8 -0
  191. helm/config/model_deployments.yaml +864 -193
  192. helm/config/model_metadata.yaml +667 -53
  193. helm/config/tokenizer_configs.yaml +144 -3
  194. helm/proxy/cli.py +3 -1
  195. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  196. helm/proxy/services/server_service.py +21 -85
  197. helm/tokenizers/grok_tokenizer.py +53 -0
  198. helm/tokenizers/huggingface_tokenizer.py +1 -1
  199. helm/tokenizers/test_grok_tokenizer.py +33 -0
  200. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  201. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  202. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  203. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  204. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
  205. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
  206. helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/clients/bedrock_client.py CHANGED
@@ -117,10 +117,12 @@ class BedrockNovaClient(CachingClient):
         tokenizer_name: str,
         assumed_role: Optional[str] = None,
         region: Optional[str] = None,
+        bedrock_model_id: Optional[str] = None,
     ):
         super().__init__(cache_config=cache_config)
         self.tokenizer = tokenizer
         self.tokenizer_name = tokenizer_name
+        self.bedrock_model_id = bedrock_model_id
         self.bedrock_client = get_bedrock_client_v1(
             assumed_role=assumed_role or os.environ.get("BEDROCK_ASSUME_ROLE", None),
             region=region,
@@ -144,7 +146,7 @@ class BedrockNovaClient(CachingClient):
         messages = self._get_messages_from_request(request)

         return {
-            "modelId": model_id,
+            "modelId": self.bedrock_model_id or model_id,
             "inferenceConfig": {
                 "temperature": request.temperature,
                 "maxTokens": request.max_tokens,
helm/clients/client.py CHANGED
@@ -2,7 +2,7 @@ import json
 from abc import ABC, abstractmethod
 from typing import List, Mapping, Optional, cast

-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hwarn
 from helm.common.media_object import MultimediaObject, TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
 from helm.common.cache import Cache, CacheConfig
@@ -65,7 +65,7 @@ def truncate_sequence(
     # where max_tokens = 0, so there's nothing to truncate.
     if request.echo_prompt:
         if request.max_tokens != 0:
-            hlog("WARNING: don't know how to handle echo_prompt and max_tokens > 0, not truncating")
+            hwarn("don't know how to handle echo_prompt and max_tokens > 0, not truncating")
         return sequence

     if end_of_text_token:
@@ -90,8 +90,8 @@ def truncate_sequence(
             new_tokens.append(token)

     if len(new_text) < len(sequence.text) and len(new_tokens) == len(sequence.tokens):
-        hlog(
-            f"WARNING: Stripped characters from text ({len(sequence.text)} -> {len(new_text)}), "
+        hwarn(
+            f"Stripped characters from text ({len(sequence.text)} -> {len(new_text)}), "
             f"but wasn't able to strip the tokens"
         )

@@ -99,14 +99,14 @@ def truncate_sequence(
     new_logprob = sum(token.logprob for token in new_tokens)

     if print_warning:
-        hlog(f"WARNING: truncate_sequence needs to strip {json.dumps(stop)}")
+        hwarn(f"truncate_sequence needs to strip {json.dumps(stop)}")

     sequence = GeneratedOutput(text=new_text, logprob=new_logprob, tokens=new_tokens)

     # Truncate based on the max number of tokens.
     if len(sequence.tokens) > request.max_tokens:
         if print_warning:
-            hlog(f"WARNING: truncate_sequence needs to truncate {len(sequence.tokens)} down to {request.max_tokens}")
+            hwarn(f"truncate_sequence needs to truncate {len(sequence.tokens)} down to {request.max_tokens}")
         new_tokens = sequence.tokens[: request.max_tokens]

         # This is imperfect stitching together of tokens, so just to make sure this is okay
@@ -114,7 +114,7 @@ def truncate_sequence(
     # Usually, in our benchmark, max_tokens is active when it's 1, so hopefully this isn't an issue.
     new_text = "".join(token.text for token in new_tokens)
     if not sequence.text.startswith(new_text):
-        hlog(f"WARNING: {json.dumps(sequence.text)} does not start with truncated text {json.dumps(new_text)}")
+        hwarn(f"{json.dumps(sequence.text)} does not start with truncated text {json.dumps(new_text)}")

     new_logprob = sum(token.logprob for token in new_tokens)

helm/clients/grok_client.py ADDED
@@ -0,0 +1,36 @@
+from typing import Any, Dict, Optional
+
+from helm.clients.openai_client import OpenAIClient
+from helm.common.cache import CacheConfig
+from helm.common.request import Request
+from helm.tokenizers.tokenizer import Tokenizer
+
+
+class GrokChatClient(OpenAIClient):
+
+    BASE_URL = "https://api.x.ai/v1"
+
+    _UNSUPPORTED_ARGUMENTS = ["presence_penalty", "frequency_penalty"]
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        api_key: Optional[str] = None,
+    ):
+        super().__init__(
+            tokenizer=tokenizer,
+            tokenizer_name=tokenizer_name,
+            cache_config=cache_config,
+            api_key=api_key,
+            org_id=None,
+            base_url="https://api.x.ai/v1",
+        )
+
+    def _make_chat_raw_request(self, request: Request) -> Dict[str, Any]:
+        raw_request = super()._make_chat_raw_request(request)
+        for unsupported_argument in self._UNSUPPORTED_ARGUMENTS:
+            if unsupported_argument in raw_request:
+                del raw_request[unsupported_argument]
+        return raw_request
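
GrokChatClient is the first consumer of the `_make_chat_raw_request` hook that this release factors out of `OpenAIClient` (see the openai_client.py hunks below): it reuses the whole OpenAI chat flow against xAI's endpoint and merely strips two parameters the API rejects. Any OpenAI-compatible endpoint with similar quirks can be wrapped the same way; a hedged sketch of the pattern (the class name and the dropped parameter are made up):

from typing import Any, Dict

from helm.clients.openai_client import OpenAIClient
from helm.common.request import Request


class MyCompatChatClient(OpenAIClient):
    """Hypothetical client for an OpenAI-compatible API that rejects `logit_bias`."""

    def _make_chat_raw_request(self, request: Request) -> Dict[str, Any]:
        raw_request = super()._make_chat_raw_request(request)
        raw_request.pop("logit_bias", None)  # drop the unsupported parameter
        return raw_request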
helm/clients/huggingface_client.py CHANGED
@@ -8,7 +8,7 @@ from transformers.generation.stopping_criteria import (
 from typing import Any, Dict, List, Optional, TypedDict

 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import htrack_block, hlog
+from helm.common.hierarchical_logger import htrack_block, hlog, hwarn
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import (
     wrap_request_time,
@@ -18,6 +18,7 @@ from helm.common.request import (
     GeneratedOutput,
     Token,
 )
+from helm.proxy.retry import NonRetriableException
 from helm.tokenizers.tokenizer import Tokenizer
 from helm.clients.client import CachingClient, truncate_sequence
 from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer, WrappedPreTrainedTokenizer
@@ -256,6 +257,7 @@ class HuggingFaceClient(CachingClient):
         tokenizer: Tokenizer,
         pretrained_model_name_or_path: Optional[str] = None,
         end_of_text_token: Optional[str] = None,
+        apply_chat_template: Optional[bool] = None,
         **kwargs,
     ):
         super().__init__(cache_config=cache_config)
@@ -266,9 +268,46 @@ class HuggingFaceClient(CachingClient):
             "but instead it is {tokenizer}"
         )
         self._wrapped_tokenizer: WrappedPreTrainedTokenizer = tokenizer.get_wrapped_tokenizer()
-        self._tokenizer = tokenizer
         self._kwargs = _process_huggingface_client_kwargs(kwargs)
         self._end_of_text_token = end_of_text_token
+        # If the user did not explicitly configure whether the model is a chat model with `apply_chat_template` arg,
+        # auto-infer if the model is a chat model based on whether the tokenizer has a chat template.
+        # Note: Auto-inference is incorrect for some non-chat models that still have chat templates
+        # e.g. Qwen2, Qwen 2.5.
+        # For these models, the `apply_chat_template` arg should be explicitly set to false.
+        if apply_chat_template is not None:
+            self._apply_chat_template = apply_chat_template
+        else:
+            with self._wrapped_tokenizer as hf_tokenizer:
+                self._apply_chat_template = bool(hf_tokenizer.chat_template)
+            hwarn(
+                f"Automatically set `apply_chat_template` to {self._apply_chat_template} based on "
+                "whether the tokenizer has a chat template. "
+                "If this is incorrect, please explicitly set `apply_chat_template`."
+            )
+
+    def get_prompt(self, request: Request) -> str:
+        if request.prompt and request.messages:
+            raise NonRetriableException(f"More than one of `prompt` and `messages` was set in request: {request}")
+        # Chat model expects a list of messages as input
+        if self._apply_chat_template:
+            with self._wrapped_tokenizer as tokenizer:
+                if request.messages:
+                    prompt = tokenizer.apply_chat_template(request.messages, tokenize=False)
+                    assert isinstance(prompt, str)
+                    return prompt
+                else:
+                    prompt = tokenizer.apply_chat_template(
+                        [{"role": "user", "content": request.prompt}], tokenize=False
+                    )
+                    assert isinstance(prompt, str)
+                    return prompt
+        # Base non-chat model expects a string as input
+        else:
+            if request.messages:
+                raise NonRetriableException("Chat messages not supported by non-chat model")
+            else:
+                return request.prompt

     def make_request(self, request: Request) -> RequestResult:
         # Embedding not supported for this model
@@ -277,7 +316,7 @@ class HuggingFaceClient(CachingClient):

         raw_request: HuggingFaceRequest = {
             "engine": request.model_engine,
-            "prompt": request.prompt,
+            "prompt": self.get_prompt(request),
             "temperature": 1e-7 if request.temperature == 0 else request.temperature,
             "num_return_sequences": request.num_completions,
             "max_new_tokens": request.max_tokens,
helm/clients/huggingface_pipeline_client.py ADDED
@@ -0,0 +1,138 @@
+from threading import Lock
+from typing import Any, Dict, List, Optional, Union
+
+import transformers
+
+from helm.clients.client import CachingClient
+from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import htrack_block, hwarn
+from helm.common.request import GeneratedOutput, Request, RequestResult, wrap_request_time
+from helm.proxy.retry import NonRetriableException
+
+
+_pipelines: Dict[str, transformers.Pipeline] = {}
+_pipelines_lock: Lock = Lock()
+
+
+def _get_pipeline(
+    helm_model_name: str,
+    pipeline_kwargs: Dict[str, Any],
+) -> Any:
+    """
+    Checks if the desired HuggingFaceModel is cached. Creates the HuggingFaceModel if it's not cached.
+    Returns the HuggingFaceModel.
+    """
+    global _pipelines
+    global _pipelines_lock
+    with _pipelines_lock:
+        if helm_model_name not in _pipelines:
+            huggingface_model_name = pipeline_kwargs["model"]
+            with htrack_block(
+                f"Loading HuggingFace model {huggingface_model_name} (kwargs={pipeline_kwargs}) "
+                f"for HELM model {helm_model_name} with transformers.pipeline"
+            ):
+                _pipelines[helm_model_name] = transformers.pipeline(**pipeline_kwargs)
+
+    return _pipelines[helm_model_name]
+
+
+class HuggingFacePipelineClient(CachingClient):
+    def __init__(
+        self,
+        cache_config: CacheConfig,
+        model_name: str,
+        pretrained_model_name_or_path: Optional[str] = None,
+        apply_chat_template: Optional[bool] = None,
+        **kwargs,
+    ):
+        # Include `pretrained_model_name_or_path` parameter so that model deployments can use
+        # the `pretrained_model_name_or_path` arg to override `model_name`
+        super().__init__(cache_config=cache_config)
+        self._helm_model_name = model_name
+        self._pipeline_kwargs = {
+            "model": pretrained_model_name_or_path or self._helm_model_name,
+            "task": "text-generation",
+            **kwargs,
+        }
+        self._pipeline = _get_pipeline(self._helm_model_name, self._pipeline_kwargs)
+        if apply_chat_template is not None:
+            self._apply_chat_template = apply_chat_template
+        else:
+            # If the user did not explicitly configure whether the model is a chat model with `apply_chat_template`
+            # arg, auto-infer if the model is a chat model based on whether the tokenizer has a chat template.
+            # Note: Auto-inference is incorrect for some non-chat models that still have chat templates
+            # e.g. Qwen2, Qwen 2.5.
+            # For these models, the `apply_chat_template` arg should be explicitly set to false.
+            self._apply_chat_template = bool(self._pipeline.tokenizer.chat_template)
+            hwarn(
+                f"Automatically set `apply_chat_template` to {self._apply_chat_template} based on "
+                "whether the tokenizer has a chat template. "
+                "If this is incorrect, please explicitly set `apply_chat_template`."
+            )
+
+    def make_text_inputs(self, request: Request) -> Union[str, List[Dict[str, str]]]:
+        if request.prompt and request.messages:
+            raise NonRetriableException(f"More than one of `prompt` and `messages` was set in request: {request}")
+        # Chat model expects a list of messages as input
+        if self._apply_chat_template:
+            if request.messages:
+                return request.messages
+            else:
+                return [{"role": "user", "content": request.prompt}]
+        # Base non-chat model expects a string as input
+        else:
+            if request.messages:
+                raise NonRetriableException("Chat messages not supported by non-chat model")
+            else:
+                return request.prompt
+
+    def make_request(self, request: Request) -> RequestResult:
+        """Make a request"""
+        if request.model != self._helm_model_name:
+            raise NonRetriableException(
+                f"This instance of HuggingFacePipelineClient has loaded model {self._helm_model_name} but the request was for model {request.model}"  # noqa: E501
+            )
+        completions: List[GeneratedOutput] = []
+        do_sample = request.temperature > 0.0
+        raw_request = {
+            "text_inputs": self.make_text_inputs(request),
+            "return_full_text": request.echo_prompt,
+            "temperature": request.temperature if do_sample else None,
+            "num_return_sequences": request.num_completions,
+            "max_new_tokens": request.max_tokens,
+            "top_p": request.top_p,
+            "top_k": request.top_k_per_token if do_sample else None,
+            "do_sample": do_sample,
+            "return_dict_in_generate": True,
+        }
+        if request.stop_sequences:
+            stop_sequence_ids = self._pipeline.tokenizer(
+                request.stop_sequences, return_token_type_ids=False, add_special_tokens=False
+            )
+            if len(stop_sequence_ids.input_ids) == 1 and len(stop_sequence_ids.input_ids[0]) == 1:
+                raw_request["eos_token_id"] = stop_sequence_ids.input_ids[0][0]
+            else:
+                raise NonRetriableException(
+                    "Multiple stop sequences and stop sequences of multiple tokens are not yet supported by HuggingFacePipelineClient"  # noqa: E501
+                )
+
+        def do_it() -> Dict[str, Any]:
+            pipeline_outputs = self._pipeline(**raw_request)
+            return {"outputs": pipeline_outputs}
+
+        cache_key = CachingClient.make_cache_key(
+            {"pipeline_kwargs": self._pipeline_kwargs, **raw_request},
+            request,
+        )
+
+        response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+        for raw_output in response["outputs"]:
+            completions.append(GeneratedOutput(text=raw_output["generated_text"], logprob=0, tokens=[]))
+        return RequestResult(
+            success=True,
+            cached=cached,
+            request_time=response["request_time"],
+            request_datetime=response["request_datetime"],
+            completions=completions,
+            embedding=[],
+        )
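
This client trades `HuggingFaceClient`'s token-level detail (note the empty `tokens` list and zero logprob above) for the simpler loading path of `transformers.pipeline`. Roughly what it wraps, minus caching and HELM plumbing (model name illustrative):

import transformers

pipe = transformers.pipeline(task="text-generation", model="gpt2")
outputs = pipe(
    "The quick brown fox",
    max_new_tokens=8,
    num_return_sequences=1,
    do_sample=False,
    return_full_text=False,
)
print(outputs[0]["generated_text"])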
helm/clients/image_generation/dalle_mini/model/configuration.py CHANGED
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" DalleBart model configuration """
+"""DalleBart model configuration"""
 import warnings

 from transformers.configuration_utils import PretrainedConfig
helm/clients/image_generation/dalle_mini/model/modeling.py CHANGED
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" DalleBart model. """
+"""DalleBart model."""

 import math
 from functools import partial
helm/clients/image_generation/dalle_mini/model/processor.py CHANGED
@@ -1,4 +1,4 @@
-""" DalleBart processor """
+"""DalleBart processor"""

 from typing import List

helm/clients/image_generation/dalle_mini/model/tokenizer.py CHANGED
@@ -1,4 +1,4 @@
-""" DalleBart tokenizer """
+"""DalleBart tokenizer"""

 from transformers import BartTokenizerFast

helm/clients/openai_client.py CHANGED
@@ -1,13 +1,16 @@
 # mypy: check_untyped_defs = False
 from dataclasses import replace
+import re
 from typing import Any, Dict, List, Optional, cast, Union, Callable

+from openai import OpenAIError
+
 from helm.benchmark.model_metadata_registry import is_vlm
 from helm.common import multimodal_request_utils
 from helm.common.cache import CacheConfig
-from helm.common.media_object import TEXT_TYPE, MultimediaObject
-from helm.common.request import ErrorFlags, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
-from helm.common.hierarchical_logger import hlog
+from helm.common.media_object import TEXT_TYPE, MultimediaObject, MediaObject
+from helm.common.request import ErrorFlags, Thinking, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
+from helm.common.hierarchical_logger import hlog, hwarn
 from helm.common.object_spec import get_class_by_name
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.tokenization_request import (
@@ -24,8 +27,13 @@ except ModuleNotFoundError as e:
     handle_module_not_found_error(e, ["openai"])


-class OpenAIClient(CachingClient):
-    END_OF_TEXT: str = "<|endoftext|>"
+class OpenAIClientUtils:
+    """Methods used by both the chat completions client and the responses API client"""
+
+    @classmethod
+    def is_reasoning_model(cls, model_engine: str) -> bool:
+        # All OpenAI reasoning models start "o[somenumber]", so we regexp for that to future proof things
+        return bool(re.match(r"^o\d+", model_engine))

     # Error OpenAI throws when the image in the prompt violates their content policy
     INAPPROPRIATE_IMAGE_ERROR: str = "Your input image may contain content that is not allowed by our safety system"
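
The hard-coded `startswith("o1")`/`startswith("o3")` checks below are replaced by this regex, which also covers future o-series engines. A quick illustration (engine names are examples):

import re


def is_reasoning_model(model_engine: str) -> bool:
    # Matches o1, o1-mini, o3, o4-mini, ... but not gpt-4o or whisper-1.
    return bool(re.match(r"^o\d+", model_engine))


assert is_reasoning_model("o1-preview")
assert is_reasoning_model("o3-mini")
assert not is_reasoning_model("gpt-4o")
assert not is_reasoning_model("whisper-1")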
@@ -49,6 +57,56 @@ class OpenAIClient(CachingClient):
         "See https://labs.openai.com/policies/content-policy for more information."
     )

+    @classmethod
+    def handle_openai_error(cls, e: OpenAIError, request: Request):
+        if cls.INAPPROPRIATE_IMAGE_ERROR in str(e) or cls.INAPPROPRIATE_PROMPT_ERROR in str(e):
+            hwarn(f"Failed safety check: {str(request)}")
+            empty_completion = GeneratedOutput(
+                text="",
+                logprob=0,
+                tokens=[],
+                finish_reason={"reason": cls.CONTENT_POLICY_VIOLATED_FINISH_REASON},
+            )
+            return RequestResult(
+                success=True,
+                cached=False,
+                request_time=0,
+                completions=[empty_completion] * request.num_completions,
+                embedding=[],
+            )
+        elif cls.OPENAI_SERVER_ERROR in str(e):
+            # Handle these errors by returning an empty completion to unblock
+            hwarn(f"OpenAI server error for request: {str(request)}")
+            empty_completion = GeneratedOutput(
+                text="",
+                logprob=0,
+                tokens=[],
+                finish_reason={"reason": cls.OPENAI_SERVER_ERROR},
+            )
+            return RequestResult(
+                success=True,
+                cached=False,
+                request_time=0,
+                completions=[empty_completion] * request.num_completions,
+                embedding=[],
+            )
+        elif cls.INAPPROPRIATE_PROMPT_AZURE_ERROR in str(e) or cls.INAPPROPRIATE_PROMPT_MICROSOFT_ERROR in str(e):
+            return RequestResult(
+                success=False,
+                cached=False,
+                error="Content blocked by Azure's content management filter",
+                completions=[],
+                embedding=[],
+                error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
+            )
+
+        error: str = f"OpenAI error: {e}"
+        return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
+
+
+class OpenAIClient(CachingClient):
+    END_OF_TEXT: str = "<|endoftext|>"
+
     def __init__(
         self,
         tokenizer: Tokenizer,
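
Hoisting the error mapping into `OpenAIClientUtils` lets this client and the new `helm/clients/openai_responses_client.py` share it. The consuming shape, as a hedged sketch (`cache`, `cache_key`, and `do_it` stand in for the client's own plumbing):

import openai

from helm.clients.openai_client import OpenAIClientUtils
from helm.common.request import Request, wrap_request_time


def call_openai_safely(cache, cache_key, do_it, request: Request):
    # Returns (response, cached) on success, or a RequestResult on OpenAI errors,
    # mirroring the try/except adopted by _make_chat_request later in this diff.
    try:
        return cache.get(cache_key, wrap_request_time(do_it))
    except openai.OpenAIError as e:
        return OpenAIClientUtils.handle_openai_error(e, request)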
@@ -118,7 +176,7 @@ class OpenAIClient(CachingClient):
             embedding=embedding,
         )

-    def _make_chat_request(self, request: Request) -> RequestResult:
+    def _make_chat_raw_request(self, request: Request) -> Dict[str, Any]:
         messages: Optional[List[Dict[str, Union[str, Any]]]] = request.messages
         if (
             (request.prompt and request.messages)
@@ -137,7 +195,7 @@ class OpenAIClient(CachingClient):
             if request.messages[-1]["role"] != "user":
                 raise ValueError("Last message must have role 'user'")
             if request.prompt != "":
-                hlog("WARNING: Since message is set, prompt will be ignored")
+                hwarn("Since message is set, prompt will be ignored")
         else:
             # Convert prompt into a single message
             # For now, put the whole prompt in a single user message, and expect the response
@@ -223,7 +281,7 @@ class OpenAIClient(CachingClient):
         # Refer to the "Reasoning models" documentation for further discussion of o1 model limitations:
         # https://platform.openai.com/docs/guides/reasoning
         model_engine: str = request.model_engine
-        if model_engine.startswith("o1") or model_engine.startswith("o3"):
+        if OpenAIClientUtils.is_reasoning_model(model_engine):
            # Avoid error:
            # "Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead."  # noqa: E501
            # Note that openai>=1.45 is needed for this
@@ -241,8 +299,13 @@ class OpenAIClient(CachingClient):
             # 'code': 'unsupported_parameter'}}"
             raw_request.pop("temperature", None)

+            # The following parameters also happen to be unsupported by the o-series (code unsupported_parameter)
+            raw_request.pop("top_p", None)
+            raw_request.pop("frequency_penalty", None)
+            raw_request.pop("presence_penalty", None)
+
             if self.reasoning_effort:
-                raw_request["reasoning_effort"] = "self.reasoning_effort"
+                raw_request["reasoning_effort"] = self.reasoning_effort
         elif is_vlm(request.model):
             # Avoid error:
             # "Invalid type for 'stop': expected an unsupported value, but got null instead."
@@ -258,6 +321,10 @@ class OpenAIClient(CachingClient):
         # OpenAI error: Error code: 400 - {'error': {'message': "[{'type': 'string_type', 'loc': ('body', 'stop', 'str'), 'msg': 'Input should be a valid string', 'input': None}, {'type': 'list_type', 'loc': ('body', 'stop', 'list[str]'), 'msg': 'Input should be a valid list', 'input': None}, {'type': 'list_type', 'loc': ('body', 'stop', 'list[list[int]]'), 'msg': 'Input should be a valid list', 'input': None}]", 'type': 'invalid_request_error', 'param': None, 'code': None}}  # noqa: 3501
         if raw_request["stop"] is None:
             raw_request.pop("stop")
+        return raw_request
+
+    def _make_chat_request(self, request: Request) -> RequestResult:
+        raw_request = self._make_chat_raw_request(request)

         def do_it() -> Dict[str, Any]:
             return self.client.chat.completions.create(**raw_request).model_dump(mode="json")
@@ -266,49 +333,7 @@ class OpenAIClient(CachingClient):
             cache_key = self._get_cache_key(raw_request, request)
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except openai.OpenAIError as e:
-            if self.INAPPROPRIATE_IMAGE_ERROR in str(e) or self.INAPPROPRIATE_PROMPT_ERROR in str(e):
-                hlog(f"Failed safety check: {str(request)}")
-                empty_completion = GeneratedOutput(
-                    text="",
-                    logprob=0,
-                    tokens=[],
-                    finish_reason={"reason": self.CONTENT_POLICY_VIOLATED_FINISH_REASON},
-                )
-                return RequestResult(
-                    success=True,
-                    cached=False,
-                    request_time=0,
-                    completions=[empty_completion] * request.num_completions,
-                    embedding=[],
-                )
-            elif self.OPENAI_SERVER_ERROR in str(e):
-                # Handle these errors by returning an empty completion to unblock
-                hlog(f"OpenAI server error for request: {str(request)}")
-                empty_completion = GeneratedOutput(
-                    text="",
-                    logprob=0,
-                    tokens=[],
-                    finish_reason={"reason": self.OPENAI_SERVER_ERROR},
-                )
-                return RequestResult(
-                    success=True,
-                    cached=False,
-                    request_time=0,
-                    completions=[empty_completion] * request.num_completions,
-                    embedding=[],
-                )
-            elif self.INAPPROPRIATE_PROMPT_AZURE_ERROR in str(e) or self.INAPPROPRIATE_PROMPT_MICROSOFT_ERROR in str(e):
-                return RequestResult(
-                    success=False,
-                    cached=False,
-                    error="Content blocked by Azure's content management filter",
-                    completions=[],
-                    embedding=[],
-                    error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
-                )
-
-            error: str = f"OpenAI error: {e}"
-            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
+            return OpenAIClientUtils.handle_openai_error(e, request)

         completions: List[GeneratedOutput] = []
         for raw_completion in response["choices"]:
@@ -338,11 +363,20 @@ class OpenAIClient(CachingClient):
                 tokens: List[Token] = [
                     Token(text=cast(str, raw_token), logprob=0) for raw_token in tokenization_result.raw_tokens
                 ]
+            # vLLM has an optional `reasoning_content` field in the message
+            # that is not in the standard OpenAI API.
+            # This field is also used by some model providers such as Grok.
+            thinking = (
+                Thinking(text=raw_completion["message"]["reasoning_content"])
+                if "reasoning_content" in raw_completion["message"]
+                else None
+            )
             completion = GeneratedOutput(
                 text=text,
                 logprob=0,  # OpenAI does not provide logprobs
                 tokens=tokens,
                 finish_reason={"reason": raw_completion["finish_reason"]},
+                thinking=thinking,
             )
             completions.append(truncate_sequence(completion, request))  # Truncate the text by stop sequences

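Reasoning traces from OpenAI-compatible backends now survive into `GeneratedOutput.thinking` (the `Thinking` type comes from `helm/common/request.py`, which gains 8 lines in this release). A toy run of the parsing, with a message shaped like a vLLM chat completion:

from helm.common.request import GeneratedOutput, Thinking

raw_completion = {
    "message": {"content": "4", "reasoning_content": "2 + 2 is 4."},
    "finish_reason": "stop",
}
message = raw_completion["message"]
thinking = Thinking(text=message["reasoning_content"]) if "reasoning_content" in message else None
completion = GeneratedOutput(
    text=message["content"],
    logprob=0,
    tokens=[],
    finish_reason={"reason": raw_completion["finish_reason"]},
    thinking=thinking,
)
print(completion.thinking)
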
@@ -459,7 +493,7 @@ class OpenAIClient(CachingClient):
     def make_request(self, request: Request) -> RequestResult:
         if request.embedding:
             return self._make_embedding_request(request)
-        elif "whisper" in request.model_engine:
+        elif "whisper" in request.model_engine or "transcribe" in request.model_engine:
             return self._make_transcription_request(request)
         else:
             return self._make_chat_request(request)
@@ -536,6 +570,18 @@ class OpenAITranscriptionThenCompletionClient(Client):
         # Now make the request to the completion model with just a text-only prompt and no audio
         # Use the same decoding parameters as the original request
         # Ensure to set multimodal_prompt to None so the request is treated as text-only.
-        return self._openai_client.make_request(
+        request_result: RequestResult = self._openai_client.make_request(
             replace(request, prompt=text_prompt, model=f"openai/{completion_model}", multimodal_prompt=None)
         )
+
+        # Also include the generated transcript in the request result
+        completions_with_transcript: List[GeneratedOutput] = [
+            replace(
+                completion,
+                multimodal_content=MultimediaObject(
+                    media_objects=[MediaObject(text=text_prompt, content_type="text/plain")]
+                ),
+            )
+            for completion in request_result.completions
+        ]
+        return replace(request_result, completions=completions_with_transcript)
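
The transcription-then-completion flow now attaches the intermediate transcript to each completion, using `dataclasses.replace` on the frozen result objects so downstream display code can see what the completion model was given. The same immutable-update idiom in isolation (transcript text illustrative):

from dataclasses import replace

from helm.common.media_object import MediaObject, MultimediaObject
from helm.common.request import GeneratedOutput

completion = GeneratedOutput(text="A dog barking.", logprob=0, tokens=[])
transcript = "the sound of a dog barking"
completion_with_transcript = replace(
    completion,
    multimodal_content=MultimediaObject(
        media_objects=[MediaObject(text=transcript, content_type="text/plain")]
    ),
)
print(completion_with_transcript.multimodal_content)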