crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (206)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  13. helm/benchmark/annotation/model_as_judge.py +12 -16
  14. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  15. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  16. helm/benchmark/executor.py +11 -12
  17. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  18. helm/benchmark/metrics/bias_word_lists.py +1 -1
  19. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  20. helm/benchmark/metrics/classification_metrics.py +3 -3
  21. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  22. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  23. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  24. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  25. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  26. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  27. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  28. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  29. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  30. helm/benchmark/metrics/medalign_metrics.py +9 -29
  31. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  32. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  33. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  34. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  35. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  36. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  37. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  38. helm/benchmark/metrics/metric_service.py +11 -11
  39. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  40. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  41. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  42. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  43. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  44. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  45. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  46. helm/benchmark/metrics/summac/model_summac.py +1 -2
  47. helm/benchmark/metrics/summarization_metrics.py +2 -1
  48. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  49. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  50. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  51. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  52. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  53. helm/benchmark/model_deployment_registry.py +6 -8
  54. helm/benchmark/presentation/contamination.py +3 -3
  55. helm/benchmark/presentation/create_plots.py +33 -12
  56. helm/benchmark/presentation/run_display.py +13 -0
  57. helm/benchmark/presentation/schema.py +2 -1
  58. helm/benchmark/presentation/summarize.py +76 -59
  59. helm/benchmark/reeval_run.py +3 -4
  60. helm/benchmark/reeval_runner.py +3 -3
  61. helm/benchmark/run.py +78 -73
  62. helm/benchmark/run_expander.py +12 -1
  63. helm/benchmark/run_spec_factory.py +7 -6
  64. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  65. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  66. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  67. helm/benchmark/run_specs/long_context_run_specs.py +67 -15
  68. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  69. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  70. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  71. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  72. helm/benchmark/runner.py +5 -5
  73. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  74. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  75. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  76. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  77. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  78. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  79. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  80. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  81. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  82. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  83. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  84. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  85. helm/benchmark/scenarios/clear_scenario.py +11 -7
  86. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  87. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  88. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  89. helm/benchmark/scenarios/grammar.py +2 -2
  90. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  91. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  92. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  93. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  94. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  95. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  96. helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
  97. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  98. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  99. helm/benchmark/scenarios/medec_scenario.py +6 -1
  100. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  101. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  102. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  103. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  104. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  105. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  106. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  107. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  108. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  109. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  110. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  111. helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
  112. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  113. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  114. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  115. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  116. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  117. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  118. helm/benchmark/scenarios/numeracy_scenario.py +2 -1
  119. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  120. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  121. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  122. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  123. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  124. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  125. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  126. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  127. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  128. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  129. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  130. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  131. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  132. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  133. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  134. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  135. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  136. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  137. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  138. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  139. helm/benchmark/server.py +2 -1
  140. helm/benchmark/static/schema_audio.yaml +60 -49
  141. helm/benchmark/static/schema_enterprise.yaml +21 -0
  142. helm/benchmark/static/schema_long_context.yaml +63 -20
  143. helm/benchmark/static/schema_medhelm.yaml +272 -213
  144. helm/benchmark/static/schema_melt.yaml +1257 -0
  145. helm/benchmark/static/schema_slphelm.yaml +162 -0
  146. helm/benchmark/static/schema_vhelm.yaml +26 -26
  147. helm/benchmark/static/schema_video.yaml +219 -0
  148. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  149. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  150. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  151. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  152. helm/benchmark/static_build/index.html +4 -4
  153. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  154. helm/benchmark/window_services/test_utils.py +3 -4
  155. helm/benchmark/window_services/tokenizer_service.py +7 -8
  156. helm/clients/anthropic_client.py +69 -29
  157. helm/clients/audio_language/diva_llama_client.py +4 -2
  158. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  159. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  160. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  161. helm/clients/audio_language/test.py +62 -0
  162. helm/clients/bedrock_client.py +3 -1
  163. helm/clients/client.py +7 -7
  164. helm/clients/grok_client.py +36 -0
  165. helm/clients/huggingface_client.py +42 -3
  166. helm/clients/huggingface_pipeline_client.py +138 -0
  167. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  168. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  169. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  170. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  171. helm/clients/openai_client.py +100 -54
  172. helm/clients/openai_responses_client.py +174 -0
  173. helm/clients/palmyra_client.py +2 -5
  174. helm/clients/reka_client.py +2 -2
  175. helm/clients/together_client.py +31 -4
  176. helm/clients/vertexai_client.py +6 -0
  177. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  178. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  179. helm/clients/vision_language/idefics_client.py +6 -2
  180. helm/clients/vision_language/paligemma_client.py +2 -2
  181. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  182. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  183. helm/clients/writer_client.py +102 -0
  184. helm/common/context.py +80 -0
  185. helm/common/credentials_utils.py +5 -5
  186. helm/common/general.py +9 -2
  187. helm/common/hierarchical_logger.py +46 -3
  188. helm/common/local_context.py +140 -0
  189. helm/common/remote_context.py +61 -0
  190. helm/common/request.py +8 -0
  191. helm/config/model_deployments.yaml +864 -193
  192. helm/config/model_metadata.yaml +667 -53
  193. helm/config/tokenizer_configs.yaml +144 -3
  194. helm/proxy/cli.py +3 -1
  195. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  196. helm/proxy/services/server_service.py +21 -85
  197. helm/tokenizers/grok_tokenizer.py +53 -0
  198. helm/tokenizers/huggingface_tokenizer.py +1 -1
  199. helm/tokenizers/test_grok_tokenizer.py +33 -0
  200. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  201. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  202. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  203. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  204. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
  205. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
  206. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/clients/openai_responses_client.py (new file)
@@ -0,0 +1,174 @@
+ # mypy: check_untyped_defs = False
+ import dataclasses
+ from typing import Any, Dict, List, Optional, Union
+
+
+ from helm.clients.openai_client import OpenAIClientUtils
+ from helm.common.cache import CacheConfig
+ from helm.common.media_object import TEXT_TYPE
+ from helm.common.request import (
+     Thinking,
+     wrap_request_time,
+     Request,
+     RequestResult,
+     GeneratedOutput,
+ )
+ from helm.common.optional_dependencies import handle_module_not_found_error
+ from helm.clients.client import (
+     CachingClient,
+     truncate_and_tokenize_response_text,
+     generate_uid_for_multimodal_prompt,
+ )
+ from helm.tokenizers.tokenizer import Tokenizer
+
+ try:
+     import openai
+     from openai import OpenAI
+ except ModuleNotFoundError as e:
+     handle_module_not_found_error(e, ["openai"])
+
+
+ class OpenAIResponseClient(CachingClient):
+     def __init__(
+         self,
+         tokenizer: Tokenizer,
+         tokenizer_name: str,
+         cache_config: CacheConfig,
+         api_key: Optional[str] = None,
+         org_id: Optional[str] = None,
+         base_url: Optional[str] = None,
+         reasoning_effort: Optional[str] = None,
+         openai_model_name: Optional[str] = None,
+     ):
+         super().__init__(cache_config=cache_config)
+         self.tokenizer = tokenizer
+         self.tokenizer_name = tokenizer_name
+         self.client = OpenAI(
+             api_key=api_key,
+             organization=org_id,
+             base_url=base_url,
+         )
+         self.reasoning_effort = reasoning_effort
+         self.openai_model_name = openai_model_name
+
+     def _get_cache_key(self, raw_request: Dict, request: Request):
+         cache_key = CachingClient.make_cache_key(raw_request, request)
+         if request.multimodal_prompt:
+             prompt_key: str = generate_uid_for_multimodal_prompt(request.multimodal_prompt)
+             cache_key = {**cache_key, "multimodal_prompt": prompt_key}
+         return cache_key
+
+     def _make_raw_request(self, request: Request) -> dict[str, Any]:
+         input: Union[str, List[Dict[str, Any]]]
+         if request.multimodal_prompt is not None:
+             content = []
+             request.validate()
+             for media_object in request.multimodal_prompt.media_objects:
+                 if media_object.is_type("image") and media_object.location:
+                     from helm.common.images_utils import encode_base64
+
+                     base64_image: str = encode_base64(media_object.location)
+                     content.append(
+                         {
+                             "type": "input_image",
+                             "image_url": f"data:image/jpeg;base64,{base64_image}",
+                         }
+                     )
+                 elif media_object.is_type(TEXT_TYPE):
+                     assert media_object.text is not None
+                     content.append({"type": "input_text", "text": media_object.text})
+                 else:
+                     raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
+             input = [{"role": "user", "content": content}]
+         else:
+             input = request.prompt
+
+         raw_request: Dict[str, Any] = {
+             "model": self._get_model_for_request(request),
+             "input": input,
+             "top_p": request.top_p,
+             # API errors if max_output_tokens is less than 16
+             # (Error you get: "Invalid 'max_output_tokens': integer below minimum value.
+             # Expected a value >= 16, but got 5 instead.")
+             "max_output_tokens": max(16, request.max_tokens),
+             "temperature": request.temperature,
+             # Don't store responses for later retrieval
+             "store": False,
+         }
+         if self.reasoning_effort:
+             raw_request["reasoning"] = {"effort": self.reasoning_effort}
+         # If o-series model, get reasoning summaries
+         # Plus other changes
+         model_engine: str = request.model_engine
+         if OpenAIClientUtils.is_reasoning_model(model_engine):
+             raw_request["reasoning"]["summary"] = "detailed"
+             # Avoid error:
+             # "Error code: 400 - {'error': {'message': "Unsupported parameter: 'temperature' is
+             # not supported with this model.", 'type': 'invalid_request_error', 'param': 'temperature',
+             # 'code': 'unsupported_parameter'}}"
+             raw_request.pop("temperature", None)
+
+             # The following parameters also happen to be unsupported by the o-series (code unsupported_parameter)
+             raw_request.pop("top_p", None)
+
+         return raw_request
+
+     def _get_model_for_request(self, request: Request) -> str:
+         return self.openai_model_name or request.model_engine
+
+     def make_request(self, request: Request) -> RequestResult:
+         # Content can either be text or a list of multimodal content made up of text and images:
+         # https://platform.openai.com/docs/api-reference/responses/create
+         raw_request = self._make_raw_request(request)
+
+         # The responses API does not support a "num_completions" parameter,
+         # so we need to handle it ourselves with a simple loop
+         completions: list[GeneratedOutput] = []
+         for _ in range(request.num_completions):
+
+             def do_it() -> Dict[str, Any]:
+                 raw_response = self.client.responses.create(**raw_request).model_dump(mode="json")
+                 assert not raw_response.get("error", None), f"Error in response: {raw_response}"
+                 return raw_response
+
+             try:
+                 cache_key = self._get_cache_key(raw_request, request)
+                 response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+             except openai.OpenAIError as e:
+                 return OpenAIClientUtils.handle_openai_error(e, request)
+
+             # We can only return one completion really,
+             # but we get an array of messages back, so we need to concatenate them
+             reasoning_output = ""
+             text_output = ""
+
+             if request.echo_prompt:
+                 text_output += request.prompt
+             for output in response["output"]:
+                 output_type = output["type"]  # one of "message" or "reasoning" from API observation
+                 is_reasoning_output = output_type == "reasoning"
+
+                 if is_reasoning_output:
+                     reasoning_output += "\n".join([raw_output["text"] for raw_output in output["summary"]])
+                 else:
+                     text_output += "\n".join([raw_output["text"] for raw_output in output["content"]])
+
+             completion = truncate_and_tokenize_response_text(
+                 text_output,
+                 request,
+                 self.tokenizer,
+                 self.tokenizer_name,
+                 original_finish_reason="",
+             )
+             if reasoning_output:
+                 completion = dataclasses.replace(completion, thinking=Thinking(text=reasoning_output))
+             completions.append(completion)
+
+         return RequestResult(
+             success=True,
+             cached=cached,
+             request_time=response["request_time"],
+             request_datetime=response.get("request_datetime"),
+             completions=completions,
+             embedding=[],
+         )
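The new OpenAIResponseClient above maps a HELM Request onto the OpenAI Responses API. As a rough sketch of the payload that _make_raw_request builds (assuming the openai SDK is installed and OPENAI_API_KEY is set; the model name below is only an illustrative placeholder, not taken from this diff):

    from openai import OpenAI

    client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

    # Mirrors _make_raw_request for a plain text prompt.
    raw_request = {
        "model": "o4-mini",  # placeholder model name for this sketch
        "input": "Say hello in one word.",
        "max_output_tokens": 16,  # the client clamps max_tokens up to the API minimum of 16
        "store": False,  # don't store responses for later retrieval
        # For reasoning ("o-series") models the client additionally sets {"summary": "detailed"}
        # and drops temperature/top_p, which those models reject.
        "reasoning": {"effort": "low"},
    }
    response = client.responses.create(**raw_request)

    # make_request then concatenates "message" items into the text output
    # and "reasoning" summaries into the Thinking field.
    for output in response.model_dump(mode="json")["output"]:
        print(output["type"])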
helm/clients/palmyra_client.py
@@ -5,7 +5,7 @@ from typing import Any, Dict, List

  from helm.clients.openai_client import OpenAIClient
  from helm.common.cache import CacheConfig
- from helm.common.hierarchical_logger import hlog
+ from helm.common.hierarchical_logger import hwarn
  from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token, ErrorFlags
  from helm.common.tokenization_request import (
      TokenizationRequest,
@@ -103,10 +103,7 @@ class PalmyraClient(CachingClient):
              return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])

          if _is_content_moderation_failure(response):
-             hlog(
-                 f"WARNING: Returning empty request for {request.model_deployment} "
-                 "due to content moderation filter"
-             )
+             hwarn(f"Returning empty request for {request.model_deployment} " "due to content moderation filter")
              return RequestResult(
                  success=False,
                  cached=False,
helm/clients/reka_client.py
@@ -6,7 +6,7 @@ from helm.proxy.retry import NonRetriableException
  from helm.common.cache import CacheConfig
  from helm.common.media_object import TEXT_TYPE
  from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput
- from helm.common.hierarchical_logger import hlog
+ from helm.common.hierarchical_logger import hwarn
  from helm.common.optional_dependencies import handle_module_not_found_error
  from helm.tokenizers.tokenizer import Tokenizer
  from helm.clients.client import CachingClient, truncate_and_tokenize_response_text
@@ -121,7 +121,7 @@ class RekaClient(CachingClient):
              if messages[-1]["role"] != "user":
                  raise ValueError("Last message must have role 'user'")
              if request.prompt != "":
-                 hlog("WARNING: Since message is set, prompt will be ignored")
+                 hwarn("Since message is set, prompt will be ignored")
              reka_chat_history = self._convert_messages_to_reka_chat_history(messages)
          else:
              current_chat_history: Dict[str, Any] = {
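Both the palmyra and reka changes above follow the same pattern: an hlog("WARNING: ...") call becomes a call to the new hwarn helper from helm.common.hierarchical_logger (which is extended in this release, +46 -3). A minimal sketch of the migration, assuming hwarn adds the warning prefix itself:

    from helm.common.hierarchical_logger import hlog, hwarn

    # Before: the WARNING prefix is baked into the message string.
    hlog("WARNING: Since message is set, prompt will be ignored")

    # After: hwarn is assumed to prepend the warning marker on its own.
    hwarn("Since message is set, prompt will be ignored")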
helm/clients/together_client.py
@@ -1,7 +1,8 @@
  from copy import deepcopy
  from itertools import zip_longest
+ import re
  import threading
- from typing import Callable, List, Dict, Any, Mapping, Optional, TypedDict, Union
+ from typing import Callable, List, Dict, Any, Mapping, Optional, Tuple, TypedDict, Union
  from typing_extensions import NotRequired

  import requests
@@ -11,7 +12,7 @@ from helm.common.cache import CacheConfig
  from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
  from helm.common.object_spec import get_class_by_name
  from helm.common.optional_dependencies import handle_module_not_found_error
- from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
+ from helm.common.request import Thinking, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
  from helm.clients.client import CachingClient, truncate_sequence, cleanup_str

  try:
@@ -100,6 +101,19 @@ class JobNotFinishedError(TogetherClientError):
      pass


+ def _parse_thinking(input: str) -> Tuple[str, str]:
+     """Return a tuple of thinking text and output text."""
+     match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
+     if match:
+         return (match.group(1), match.group(2))
+
+     match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
+     if match:
+         return (match.group(1), "")
+
+     return (input, "")
+
+
  class TogetherClient(CachingClient):
      """
      Client for the models where we evaluate offline. Since the queries are handled offline, the `TogetherClient` just
@@ -328,12 +342,14 @@ class TogetherChatClient(CachingClient):
          together_model: Optional[str] = None,
          disable_logprobs: Optional[bool] = None,
          output_processor: Optional[str] = None,
+         parse_thinking: Optional[bool] = None,
      ):
          super().__init__(cache_config=cache_config)
          self._client = Together(api_key=api_key)
          self._together_model = together_model
          self._disable_logprobs = bool(disable_logprobs)
          # self.output_processor is actually a function, not a class
+         self._parse_thinking = bool(parse_thinking)

          self.output_processor: Optional[Callable[[str], str]] = (
              get_class_by_name(output_processor) if output_processor else None
@@ -424,11 +440,21 @@
                  if token_text is None:
                      break
                  tokens.append(Token(text=token_text, logprob=token_logprob or 0.0))
+             logprob = sum([token.logprob for token in tokens]) if tokens else 0.0
              assert choice.message.role == "assistant"
              output_text = choice.message.content
              if self.output_processor:
                  output_text = self.output_processor(output_text)
-             generated_outputs.append(GeneratedOutput(text=output_text, logprob=0.0, tokens=tokens))
+
+             if self._parse_thinking:
+                 thinking_text, output_text = _parse_thinking(output_text)
+                 generated_outputs.append(
+                     GeneratedOutput(
+                         text=output_text, logprob=logprob, tokens=tokens, thinking=Thinking(text=thinking_text)
+                     )
+                 )
+             else:
+                 generated_outputs.append(GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens))
          return RequestResult(
              success=True,
              cached=cached,
@@ -521,8 +547,9 @@ class TogetherCompletionClient(CachingClient):
                  if token_text is None:
                      break
                  tokens.append(Token(text=token_text, logprob=token_logprob or 0.0))
+             logprob = sum([token.logprob for token in tokens]) if tokens else 0.0
              assert choice.text
-             generated_outputs.append(GeneratedOutput(text=choice.text, logprob=0.0, tokens=tokens))
+             generated_outputs.append(GeneratedOutput(text=choice.text, logprob=logprob, tokens=tokens))
          return RequestResult(
              success=True,
              cached=cached,
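The _parse_thinking helper introduced above splits a <think>...</think> block out of a Together chat completion so it can be returned as Thinking alongside the visible text. A small sketch of the three cases its regexes handle (closed tag, unterminated tag, no tag); the helper body is copied from the diff above:

    import re
    from typing import Tuple


    def _parse_thinking(input: str) -> Tuple[str, str]:
        """Return a tuple of thinking text and output text."""
        match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
        if match:
            return (match.group(1), match.group(2))
        match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
        if match:
            return (match.group(1), "")
        return (input, "")


    print(_parse_thinking("<think>\nplan the answer\n</think>\n\nFinal answer"))  # ('plan the answer', 'Final answer')
    print(_parse_thinking("<think>\nstill thinking..."))  # ('still thinking...', '')
    print(_parse_thinking("No think tag at all"))  # ('No think tag at all', '')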
helm/clients/vertexai_client.py
@@ -360,6 +360,12 @@ class VertexAIChatClient(VertexAIClient):
              for media_object in request.multimodal_prompt.media_objects:
                  if media_object.is_type("image") and media_object.location:
                      contents.append(Part.from_image(Image.load_from_file(media_object.location)))
+                 elif media_object.is_type("video") and media_object.location:
+                     # Following this example
+                     # https://cloud.google.com/vertex-ai/generative-ai/docs/samples/googlegenaisdk-textgen-with-local-video
+                     with open(media_object.location, "rb") as fp:
+                         video_content = fp.read()
+                     contents.append(Part.from_data(data=video_content, mime_type=media_object.content_type))
                  elif media_object.is_type("audio") and media_object.location:
                      contents.append(
                          Part.from_data(get_contents_as_bytes(media_object.location), mime_type=media_object.content_type)
helm/clients/vision_language/huggingface_vision2seq_client.py
@@ -95,8 +95,8 @@ class HuggingFaceVision2SeqClient(CachingClient):

          def do_it() -> Dict[str, Any]:
              messages = [{"role": "user", "content": multimodal_prompt}]
-             prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-             inputs = processor(
+             prompt = processor.apply_chat_template(messages, add_generation_prompt=True)  # type: ignore
+             inputs = processor(  # type: ignore
                  text=[prompt] * request.num_completions,
                  images=[
                      [load_image(image_path) for image_path in image_paths]
@@ -107,8 +107,10 @@
              inputs = {k: v.to(self._device) for k, v in inputs.items()}

              # Generate
-             generated_ids = model.generate(**inputs, **generation_args)
-             generated_texts: List[str] = processor.batch_decode(generated_ids, skip_special_tokens=True)
+             generated_ids = model.generate(**inputs, **generation_args)  # type: ignore
+             generated_texts: List[str] = processor.batch_decode(  # type: ignore
+                 generated_ids, skip_special_tokens=True
+             )
              return {"output": generated_texts}

          # Include the prompt and model name in the cache key
helm/clients/vision_language/huggingface_vlm_client.py
@@ -50,7 +50,7 @@ class HuggingFaceVLMClient(CachingClient):
          with self._models_lock:
              model_id: str = self._models_aliases.get(model_name, model_name)
              if model_id not in self._models:
-                 self._models[model_id] = pipeline("image-to-text", model=model_id, device_map="auto")
+                 self._models[model_id] = pipeline("image-to-text", model=model_id, device_map="auto")  # type: ignore
              return self._models[model_id]

      def make_request(self, request: Request) -> RequestResult:
@@ -80,7 +80,7 @@

          def do_it() -> Dict[str, Any]:
              model: ImageToTextPipeline = self._get_model(request.model_deployment)
-             outputs = model(image, prompt=prompt, generate_kwargs=generation_args)
+             outputs = model(image, prompt=prompt, generate_kwargs=generation_args)  # type: ignore
              return outputs[0]

          cache_key = CachingClient.make_cache_key(
helm/clients/vision_language/idefics_client.py
@@ -89,14 +89,18 @@ class IDEFICSClient(CachingClient):
          input_args: Dict[str, Union[str, bool]] = {"return_tensors": "pt"}
          generation_args = {
              "max_new_tokens": request.max_tokens,
-             "bad_words_ids": processor.tokenizer(self.BAD_WORD_TOKENS, add_special_tokens=False).input_ids,
+             "bad_words_ids": processor.tokenizer(  # type: ignore
+                 self.BAD_WORD_TOKENS, add_special_tokens=False
+             ).input_ids,
          }

          if self.END_OF_UTTERANCE_TOKEN in request.stop_sequences:
              # Following https://huggingface.co/HuggingFaceM4/idefics-80b-instruct,
              # specify <end_of_utterance> as an exit condition.
              input_args["add_end_of_utterance_token"] = False
-             exit_condition = processor.tokenizer(self.END_OF_UTTERANCE_TOKEN, add_special_tokens=False).input_ids
+             exit_condition = processor.tokenizer(  # type: ignore
+                 self.END_OF_UTTERANCE_TOKEN, add_special_tokens=False
+             ).input_ids
              generation_args["eos_token_id"] = exit_condition

          multimodal_prompt: List[Union[str, Image.Image]] = []
helm/clients/vision_language/paligemma_client.py
@@ -93,7 +93,7 @@ class PaliGemmaClient(CachingClient):
              else:
                  raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
          prompt_text: str = "\n".join(prompt_pieces)
-         model_inputs = processor(text=prompt_text, images=images, return_tensors="pt").to(self._device)
+         model_inputs = processor(text=prompt_text, images=images, return_tensors="pt").to(self._device)  # type: ignore
          input_len = model_inputs["input_ids"].shape[-1]

          completions: List[GeneratedOutput] = []
@@ -109,7 +109,7 @@
                  )[0]
                  if not request.echo_prompt:
                      generation = generation[input_len:]
-                 decoded = processor.decode(generation, skip_special_tokens=True)
+                 decoded = processor.decode(generation, skip_special_tokens=True)  # type: ignore
                  return {"output": decoded}

              # Include the prompt and model name in the cache key
helm/clients/vision_language/qwen2_vlm_client.py
@@ -2,7 +2,7 @@ from threading import Lock
  from typing import Any, Dict, List, Optional
  from dataclasses import dataclass

- from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+ from transformers import AutoProcessor
  from qwen_vl_utils import process_vision_info
  import torch

@@ -16,15 +16,20 @@ from helm.clients.client import CachingClient, generate_uid_for_multimodal_prompt


  @dataclass(frozen=True)
- class LoadedQwen2ModelProcessor:
-     model: Qwen2VLForConditionalGeneration
+ class LoadedModelProcessor:
+     model: Any
      processor: AutoProcessor


+ # Global cache for all models
  _models_lock: Lock = Lock()
- _models: Dict[str, Optional[LoadedQwen2ModelProcessor]] = {
+ _models: Dict[str, Optional[LoadedModelProcessor]] = {
      "Qwen/Qwen2-VL-7B-Instruct": None,
      "Qwen/Qwen2-VL-72B-Instruct": None,
+     "Qwen/Qwen2.5-VL-3B-Instruct": None,
+     "Qwen/Qwen2.5-VL-7B-Instruct": None,
+     "Qwen/Qwen2.5-VL-32B-Instruct": None,
+     "Qwen/Qwen2.5-VL-72B-Instruct": None,
  }


@@ -38,50 +43,52 @@ class Qwen2VLMClient(CachingClient):
              return "Qwen/Qwen2-VL-7B-Instruct"
          elif helm_model_name == "qwen2-vl-72b-instruct":
              return "Qwen/Qwen2-VL-72B-Instruct"
+         elif helm_model_name == "qwen2.5-vl-3b-instruct":
+             return "Qwen/Qwen2.5-VL-3B-Instruct"
+         elif helm_model_name == "qwen2.5-vl-7b-instruct":
+             return "Qwen/Qwen2.5-VL-7B-Instruct"
+         elif helm_model_name == "qwen2.5-vl-32b-instruct":
+             return "Qwen/Qwen2.5-VL-32B-Instruct"
+         elif helm_model_name == "qwen2.5-vl-72b-instruct":
+             return "Qwen/Qwen2.5-VL-72B-Instruct"
          else:
              raise ValueError(f"Unhandled model name: {helm_model_name}")

-     def _get_model(self, helm_model_name: str) -> LoadedQwen2ModelProcessor:
-         global _models_lock
-         global _models
+     def _get_model(self, helm_model_name: str) -> LoadedModelProcessor:
+         from transformers import Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration

-         model_name = self._get_model_name(helm_model_name)
+         global _models_lock, _models

+         model_name = self._get_model_name(helm_model_name)
          with _models_lock:
              loaded = _models[model_name]
              if loaded is None:
                  hlog(f"Loading model {model_name} and caching in memory...")
-                 # https://huggingface.co/docs/transformers/model_doc/qwen2_vl#flash-attention-2-to-speed-up-generation
-                 model = Qwen2VLForConditionalGeneration.from_pretrained(
-                     model_name,
-                     torch_dtype=torch.bfloat16,
-                     device_map="auto",
-                     attn_implementation="flash_attention_2",
-                 ).eval()
+                 # Use different loading routines depending on whether it's Qwen2.5 or Qwen2.
+                 if "2.5" in model_name:
+                     # Qwen2.5: by default use torch_dtype="auto". You can enable flash_attention_2 if desired.
+                     model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                         model_name,
+                         torch_dtype=torch.bfloat16,
+                         device_map="auto",
+                         attn_implementation="flash_attention_2",
+                     ).eval()
+                 else:
+                     model = Qwen2VLForConditionalGeneration.from_pretrained(
+                         model_name,
+                         torch_dtype=torch.bfloat16,
+                         device_map="auto",
+                         attn_implementation="flash_attention_2",
+                     ).eval()
                  processor = AutoProcessor.from_pretrained(model_name)
-                 loaded = LoadedQwen2ModelProcessor(model=model, processor=processor)
+                 loaded = LoadedModelProcessor(model=model, processor=processor)
                  _models[model_name] = loaded
-
          return loaded

      def make_request(self, request: Request) -> RequestResult:
          assert request.multimodal_prompt is not None, "Multimodal prompt is required"
-         loaded = self._get_model(request.model_engine)
-         model = loaded.model
-         processor = loaded.processor
-
-         # Build Qwen2 messages
-         # We assume all media objects go into a single "user" message:
-         # messages = [
-         #     {
-         #         "role": "user",
-         #         "content": [
-         #             {"type": "image", "image": "file:///path/to/image1.jpg"},
-         #             {"type": "image", "image": "file:///path/to/image2.jpg"},
-         #             {"type": "text", "text": "Describe these images."}
-         #         ]
-         #     }
-         # ]
+
+         # Build messages by collating all media objects into a single "user" message.
          message_content = []
          for media_object in request.multimodal_prompt.media_objects:
              if media_object.is_type("image") and media_object.location:
@@ -95,18 +102,6 @@

          messages = [{"role": "user", "content": message_content}]

-         # Prepare text and vision inputs
-         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-         image_inputs, video_inputs = process_vision_info(messages)
-
-         inputs = processor(
-             text=[text],
-             images=image_inputs,
-             videos=video_inputs,
-             padding=True,
-             return_tensors="pt",
-         ).to(self._device)
-
          generation_args = {
              "max_new_tokens": request.max_tokens,
          }
@@ -116,23 +111,38 @@
          request_datetime: Optional[int] = None
          all_cached: bool = True

-         with htrack_block(f"Generating for prompt: {text}"):
+         with htrack_block(f"Generating for prompt: {request.multimodal_prompt.text}"):
              for completion_index in range(request.num_completions):
                  try:

                      def do_it() -> Dict[str, Any]:
+                         loaded = self._get_model(request.model_engine)
+                         model = loaded.model
+                         processor = loaded.processor
+
+                         # Prepare text and vision inputs.
+                         text = processor.apply_chat_template(  # type: ignore
+                             messages, tokenize=False, add_generation_prompt=True
+                         )
+                         image_inputs, video_inputs = process_vision_info(messages)
+                         inputs = processor(  # type: ignore
+                             text=[text],
+                             images=image_inputs,
+                             videos=video_inputs,
+                             padding=True,
+                             return_tensors="pt",
+                         ).to(self._device)
+
                          generated_ids = model.generate(**inputs, **generation_args)
-                         # Remove the input prefix from outputs
+                         # Remove the input prefix from outputs.
                          generated_ids_trimmed = [
                              out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
                          ]
-                         output_text = processor.batch_decode(
+                         output_text = processor.batch_decode(  # type: ignore
                              generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
                          )
-                         # There's only one batch element
-                         completion = output_text[0]
                          # For simplicity, we split tokens by whitespace.
-                         # A more accurate tokenization would require a tokenizer for Qwen2, if desired.
+                         completion = output_text[0]
                          tokens = completion.split()
                          return {"output": (completion, tokens)}

@@ -148,7 +158,11 @@
                      result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
                  except RuntimeError as model_error:
                      return RequestResult(
-                         success=False, cached=False, error=str(model_error), completions=[], embedding=[]
+                         success=False,
+                         cached=False,
+                         error=str(model_error),
+                         completions=[],
+                         embedding=[],
                      )

                  text_out, tokens = result["output"]
@@ -160,7 +174,6 @@
                      )
                  )
                  hlog(f"Generated: {text_out}")
-
                  request_time += result["request_time"]
                  request_datetime = request_datetime or result.get("request_datetime")
                  all_cached = all_cached and cached
helm/clients/vision_language/qwen_vlm_client.py
@@ -115,14 +115,16 @@ class QwenVLMClient(CachingClient):

          def do_it() -> Dict[str, Any]:
              if request.model_engine == "qwen-vl-chat":
-                 completion, _ = model.chat(tokenizer, query=tokenizer.from_list_format(query), history=None)
+                 completion, _ = model.chat(  # type: ignore
+                     tokenizer, query=tokenizer.from_list_format(query), history=None  # type: ignore
+                 )
              else:
-                 inputs = tokenizer(tokenizer.from_list_format(query), return_tensors="pt")
+                 inputs = tokenizer(tokenizer.from_list_format(query), return_tensors="pt")  # type: ignore
                  inputs = inputs.to(self._device)
-                 pred = model.generate(**inputs, **generation_args)
-                 completion = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)
+                 pred = model.generate(**inputs, **generation_args)  # type: ignore
+                 completion = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)  # type: ignore

-             tokens: List[str] = tokenizer.tokenize(completion)
+             tokens: List[str] = tokenizer.tokenize(completion)  # type: ignore
              return {"output": (completion, tokens)}

          # Include the prompt and model name in the cache key