crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (206)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  13. helm/benchmark/annotation/model_as_judge.py +12 -16
  14. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  15. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  16. helm/benchmark/executor.py +11 -12
  17. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  18. helm/benchmark/metrics/bias_word_lists.py +1 -1
  19. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  20. helm/benchmark/metrics/classification_metrics.py +3 -3
  21. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  22. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  23. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  24. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  25. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  26. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  27. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  28. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  29. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  30. helm/benchmark/metrics/medalign_metrics.py +9 -29
  31. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  32. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  33. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  34. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  35. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  36. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  37. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  38. helm/benchmark/metrics/metric_service.py +11 -11
  39. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  40. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  41. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  42. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  43. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  44. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  45. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  46. helm/benchmark/metrics/summac/model_summac.py +1 -2
  47. helm/benchmark/metrics/summarization_metrics.py +2 -1
  48. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  49. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  50. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  51. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  52. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  53. helm/benchmark/model_deployment_registry.py +6 -8
  54. helm/benchmark/presentation/contamination.py +3 -3
  55. helm/benchmark/presentation/create_plots.py +33 -12
  56. helm/benchmark/presentation/run_display.py +13 -0
  57. helm/benchmark/presentation/schema.py +2 -1
  58. helm/benchmark/presentation/summarize.py +76 -59
  59. helm/benchmark/reeval_run.py +3 -4
  60. helm/benchmark/reeval_runner.py +3 -3
  61. helm/benchmark/run.py +78 -73
  62. helm/benchmark/run_expander.py +12 -1
  63. helm/benchmark/run_spec_factory.py +7 -6
  64. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  65. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  66. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  67. helm/benchmark/run_specs/long_context_run_specs.py +67 -15
  68. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  69. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  70. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  71. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  72. helm/benchmark/runner.py +5 -5
  73. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  74. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  75. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  76. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  77. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  78. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  79. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  80. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  81. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  82. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  83. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  84. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  85. helm/benchmark/scenarios/clear_scenario.py +11 -7
  86. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  87. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  88. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  89. helm/benchmark/scenarios/grammar.py +2 -2
  90. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  91. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  92. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  93. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  94. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  95. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  96. helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
  97. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  98. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  99. helm/benchmark/scenarios/medec_scenario.py +6 -1
  100. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  101. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  102. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  103. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  104. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  105. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  106. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  107. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  108. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  109. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  110. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  111. helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
  112. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  113. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  114. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  115. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  116. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  117. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  118. helm/benchmark/scenarios/numeracy_scenario.py +2 -1
  119. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  120. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  121. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  122. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  123. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  124. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  125. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  126. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  127. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  128. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  129. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  130. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  131. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  132. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  133. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  134. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  135. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  136. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  137. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  138. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  139. helm/benchmark/server.py +2 -1
  140. helm/benchmark/static/schema_audio.yaml +60 -49
  141. helm/benchmark/static/schema_enterprise.yaml +21 -0
  142. helm/benchmark/static/schema_long_context.yaml +63 -20
  143. helm/benchmark/static/schema_medhelm.yaml +272 -213
  144. helm/benchmark/static/schema_melt.yaml +1257 -0
  145. helm/benchmark/static/schema_slphelm.yaml +162 -0
  146. helm/benchmark/static/schema_vhelm.yaml +26 -26
  147. helm/benchmark/static/schema_video.yaml +219 -0
  148. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  149. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  150. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  151. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  152. helm/benchmark/static_build/index.html +4 -4
  153. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  154. helm/benchmark/window_services/test_utils.py +3 -4
  155. helm/benchmark/window_services/tokenizer_service.py +7 -8
  156. helm/clients/anthropic_client.py +69 -29
  157. helm/clients/audio_language/diva_llama_client.py +4 -2
  158. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  159. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  160. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  161. helm/clients/audio_language/test.py +62 -0
  162. helm/clients/bedrock_client.py +3 -1
  163. helm/clients/client.py +7 -7
  164. helm/clients/grok_client.py +36 -0
  165. helm/clients/huggingface_client.py +42 -3
  166. helm/clients/huggingface_pipeline_client.py +138 -0
  167. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  168. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  169. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  170. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  171. helm/clients/openai_client.py +100 -54
  172. helm/clients/openai_responses_client.py +174 -0
  173. helm/clients/palmyra_client.py +2 -5
  174. helm/clients/reka_client.py +2 -2
  175. helm/clients/together_client.py +31 -4
  176. helm/clients/vertexai_client.py +6 -0
  177. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  178. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  179. helm/clients/vision_language/idefics_client.py +6 -2
  180. helm/clients/vision_language/paligemma_client.py +2 -2
  181. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  182. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  183. helm/clients/writer_client.py +102 -0
  184. helm/common/context.py +80 -0
  185. helm/common/credentials_utils.py +5 -5
  186. helm/common/general.py +9 -2
  187. helm/common/hierarchical_logger.py +46 -3
  188. helm/common/local_context.py +140 -0
  189. helm/common/remote_context.py +61 -0
  190. helm/common/request.py +8 -0
  191. helm/config/model_deployments.yaml +864 -193
  192. helm/config/model_metadata.yaml +667 -53
  193. helm/config/tokenizer_configs.yaml +144 -3
  194. helm/proxy/cli.py +3 -1
  195. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  196. helm/proxy/services/server_service.py +21 -85
  197. helm/tokenizers/grok_tokenizer.py +53 -0
  198. helm/tokenizers/huggingface_tokenizer.py +1 -1
  199. helm/tokenizers/test_grok_tokenizer.py +33 -0
  200. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  201. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  202. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  203. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  204. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
  205. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
  206. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/static_build/index.html
@@ -2,16 +2,16 @@
  <html lang="en">
  <head>
  <meta charset="UTF-8" />
- <link rel="icon" type="image/svg+xml" href="./helm.svg" />
+ <link rel="icon" type="image/svg+xml" href="https://crfm.stanford.edu/helm/helm.svg" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>Holistic Evaluation of Language Models (HELM)</title>
  <meta name="description" content="The Holistic Evaluation of Language Models (HELM) serves as a living benchmark for transparency in language models. Providing broad coverage and recognizing incompleteness, multi-metric measurements, and standardization. All data and analysis are freely accessible on the website for exploration and study." />
  <script type="text/javascript" src="./config.js"></script>
- <script type="module" crossorigin src="./assets/index-262903c1.js"></script>
+ <script type="module" crossorigin src="./assets/index-94295e78.js"></script>
  <link rel="modulepreload" crossorigin href="./assets/react-f82877fd.js">
  <link rel="modulepreload" crossorigin href="./assets/recharts-4037aff0.js">
- <link rel="modulepreload" crossorigin href="./assets/tremor-9cefc3c5.js">
- <link rel="stylesheet" href="./assets/index-42060d71.css">
+ <link rel="modulepreload" crossorigin href="./assets/tremor-38a10867.js">
+ <link rel="stylesheet" href="./assets/index-b9779128.css">
  </head>
  <body class="block">
  <div id="root"></div>
helm/benchmark/window_services/encoder_decoder_window_service.py
@@ -1,6 +1,6 @@
  from abc import ABC

- from helm.common.hierarchical_logger import hlog
+ from helm.common.hierarchical_logger import hwarn
  from helm.benchmark.window_services.local_window_service import LocalWindowService


@@ -21,8 +21,8 @@ class EncoderDecoderWindowService(LocalWindowService, ABC):
  vs. the completions, we check the two values separately.
  """
  if expected_completion_token_length > self.max_output_length:
- hlog(
- f"WARNING: The expected completion token length ({expected_completion_token_length}) "
+ hwarn(
+ f"The expected completion token length ({expected_completion_token_length}) "
  f"exceeds the max output length ({self.max_output_length})."
  )
  return self.get_num_tokens(text) <= self.max_request_length
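The hlog("WARNING: ...") to hwarn(...) change above recurs throughout this release (hierarchical_logger.py itself grows by +46 -3 in the file list). A minimal sketch of the migration pattern, assuming hwarn emits the message at warning level and adds the prefix itself:

```python
from helm.common.hierarchical_logger import hlog, hwarn

# 0.5.5 style: warnings were ordinary hlog calls with a hand-written prefix.
hlog("WARNING: The expected completion token length exceeds the max output length.")

# 0.5.6 style: hwarn is called instead and the manual "WARNING: " prefix is dropped.
hwarn("The expected completion token length exceeds the max output length.")
```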
helm/benchmark/window_services/test_utils.py
@@ -1,8 +1,7 @@
  from typing import List

- from helm.common.authentication import Authentication
+ from helm.common.local_context import LocalContext
  from helm.common.cache_backend_config import CacheBackendConfig
- from helm.proxy.services.server_service import ServerService
  from helm.benchmark.metrics.metric_service import MetricService
  from helm.benchmark.window_services.tokenizer_service import TokenizerService

@@ -229,5 +228,5 @@ GPT4_TEST_TOKENS: List[str] = [


  def get_tokenizer_service(local_path: str, cache_backend_config: CacheBackendConfig) -> TokenizerService:
- service = ServerService(base_path=local_path, root_mode=True, cache_backend_config=cache_backend_config)
- return MetricService(service, Authentication("test"))
+ context = LocalContext(base_path=local_path, cache_backend_config=cache_backend_config)
+ return MetricService(context)
helm/benchmark/window_services/tokenizer_service.py
@@ -1,26 +1,25 @@
- from helm.common.authentication import Authentication
+ from helm.common.context import Context
  from helm.common.tokenization_request import (
  TokenizationRequest,
  TokenizationRequestResult,
  DecodeRequest,
  DecodeRequestResult,
  )
- from helm.proxy.services.service import Service


+ # TODO: Rename this to TokenizerContext
  class TokenizerService:
  """
- A wrapper around `Service` that makes only necessary server requests to tokenize.
+ A wrapper around `Context` that makes only necessary server requests to tokenize.
  """

- def __init__(self, service: Service, auth: Authentication):
- self._service: Service = service
- self._auth: Authentication = auth
+ def __init__(self, context: Context):
+ self._context: Context = context

  def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
  """Tokenize via an API."""
- return self._service.tokenize(self._auth, request)
+ return self._context.tokenize(request)

  def decode(self, request: DecodeRequest) -> DecodeRequestResult:
  """Decode via an API."""
- return self._service.decode(self._auth, request)
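The test_utils.py and tokenizer_service.py hunks above capture the main plumbing change in 0.5.6: the Service plus Authentication pair is replaced by the new Context abstraction (helm/common/context.py, local_context.py, and remote_context.py in the file list). A minimal usage sketch, with the caveat that BlackHoleCacheBackendConfig and the TokenizationRequest field names are assumptions about the surrounding HELM codebase rather than facts from this diff:

```python
# Illustrative sketch only; mirrors the wiring shown in the hunks above.
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig  # assumed no-op cache config
from helm.common.local_context import LocalContext
from helm.common.tokenization_request import TokenizationRequest
from helm.benchmark.window_services.tokenizer_service import TokenizerService

# A local Context replaces the old ServerService(base_path=..., root_mode=True) + Authentication("test") pair.
context = LocalContext(base_path="prod_env", cache_backend_config=BlackHoleCacheBackendConfig())
tokenizer_service = TokenizerService(context)

# Requests no longer carry an Authentication object; the context handles tokenization locally.
result = tokenizer_service.tokenize(TokenizationRequest(text="Hello HELM", tokenizer="huggingface/gpt2"))
print(result.tokens)
```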
helm/clients/anthropic_client.py
@@ -1,3 +1,4 @@
+ import dataclasses
  from typing import Any, Dict, List, Optional, TypedDict, Union, cast
  import json
  import os
@@ -7,10 +8,11 @@ import time
  import urllib.parse

  from helm.common.cache import CacheConfig
- from helm.common.hierarchical_logger import htrack_block, hlog
+ from helm.common.hierarchical_logger import htrack_block, hlog, hwarn
  from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
  from helm.common.optional_dependencies import handle_module_not_found_error
  from helm.common.request import (
+ Thinking,
  wrap_request_time,
  EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
  Request,
@@ -30,8 +32,12 @@ from helm.clients.client import CachingClient, truncate_sequence, truncate_and_t
  try:
  from anthropic import Anthropic, BadRequestError
  from anthropic.types import MessageParam
+ from anthropic.types.message import Message
+ from anthropic.types.text_block import TextBlock
+ from anthropic.types.thinking_block import ThinkingBlock
  from anthropic.types.image_block_param import ImageBlockParam
  from anthropic.types.text_block_param import TextBlockParam
+ from anthropic.types.thinking_config_enabled_param import ThinkingConfigEnabledParam
  import websocket
  except ModuleNotFoundError as e:
  handle_module_not_found_error(e, ["anthropic"])
@@ -231,30 +237,41 @@ class AnthropicMessagesRequest(TypedDict, total=False):
  temperature: float
  top_k: int
  top_p: float
+ thinking: ThinkingConfigEnabledParam


  class AnthropicMessagesRequestError(NonRetriableException):
  pass


- class AnthropicMessagesResponseError(Exception):
+ class AnthropicMessagesEmptyContentError(Exception):
  pass


  class AnthropicMessagesClient(CachingClient):
  # Source: https://docs.anthropic.com/claude/docs/models-overview
- MAX_OUTPUT_TOKENS: int = 4096
+ MAX_OUTPUT_TOKENS: int = 64000

  MAX_IMAGE_SIZE_BYTES: int = 5242880 # 5MB

  def __init__(
- self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig, api_key: Optional[str] = None
+ self,
+ tokenizer: Tokenizer,
+ tokenizer_name: str,
+ cache_config: CacheConfig,
+ thinking_budget_tokens: Optional[int] = None,
+ anthropic_model_name: Optional[str] = None,
+ api_key: Optional[str] = None,
+ stream: Optional[bool] = None,
  ):
  super().__init__(cache_config=cache_config)
  self.tokenizer = tokenizer
  self.tokenizer_name = tokenizer_name
  self.client = Anthropic(api_key=api_key)
  self.api_key: Optional[str] = api_key
+ self.anthropic_model_name: Optional[str] = anthropic_model_name
+ self.thinking_budget_tokens: Optional[int] = thinking_budget_tokens
+ self.stream: Optional[bool] = stream

  def make_request(self, request: Request) -> RequestResult:
  if request.max_tokens > AnthropicMessagesClient.MAX_OUTPUT_TOKENS:
@@ -293,8 +310,8 @@ class AnthropicMessagesClient(CachingClient):
  image_width > AnthropicClient.MAX_IMAGE_DIMENSION
  or image_height > AnthropicClient.MAX_IMAGE_DIMENSION
  ):
- hlog(
- f"WARNING: Image {image_location} exceeds max allowed size: "
+ hwarn(
+ f"Image {image_location} exceeds max allowed size: "
  f"{AnthropicClient.MAX_IMAGE_DIMENSION} pixels"
  )
  # Save the resized image to a temporary file
@@ -309,8 +326,8 @@
  base64_image = encode_base64(temp_file.name, format="JPEG")

  elif os.path.getsize(image_location) > AnthropicMessagesClient.MAX_IMAGE_SIZE_BYTES:
- hlog(
- f"WARNING: Image {image_location} exceeds max allowed size: "
+ hwarn(
+ f"Image {image_location} exceeds max allowed size: "
  f"{AnthropicMessagesClient.MAX_IMAGE_SIZE_BYTES} bytes"
  )
  # Resize the image so it is smaller than the max allowed size
@@ -351,7 +368,7 @@

  raw_request: AnthropicMessagesRequest = {
  "messages": messages,
- "model": request.model_engine,
+ "model": self.anthropic_model_name or request.model_engine,
  "stop_sequences": request.stop_sequences,
  "max_tokens": request.max_tokens,
  "temperature": request.temperature,
@@ -360,6 +377,15 @@
  }
  if system_message is not None:
  raw_request["system"] = cast(str, system_message["content"])
+ if self.thinking_budget_tokens:
+ raw_request["thinking"] = {
+ "type": "enabled",
+ "budget_tokens": self.thinking_budget_tokens,
+ }
+ # Avoid error:
+ # `top_k` must be unset when thinking is enabled. Please consult our documentation at https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#important-considerations-when-using-extended-thinking # noqa: E501
+ del raw_request["top_k"]
+
  completions: List[GeneratedOutput] = []

  # `num_completions` is not supported, so instead make `num_completions` separate requests.
@@ -367,11 +393,15 @@

  def do_it() -> Dict[str, Any]:
  try:
- result = self.client.messages.create(**raw_request).model_dump()
+ if self.stream:
+ with self.client.messages.stream(**raw_request) as message_stream:
+ result = message_stream.get_final_message().model_dump()
+ else:
+ result = self.client.messages.create(**raw_request).model_dump()
  if "content" not in result or not result["content"]:
- raise AnthropicMessagesResponseError(f"Anthropic response has empty content: {result}")
- elif "text" not in result["content"][0]:
- raise AnthropicMessagesResponseError(f"Anthropic response has non-text content: {result}")
+ raise AnthropicMessagesEmptyContentError(f"Anthropic response has empty content: {result}")
+ elif "text" not in result["content"][-1]:
+ raise AnthropicMessagesEmptyContentError(f"Anthropic response has non-text content: {result}")
  return result
  except BadRequestError as e:
  response = e.response.json()
@@ -387,9 +417,10 @@
  },
  request,
  )
- response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
- except AnthropicMessagesResponseError:
- hlog("WARNING: Response has empty content")
+ raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+
+ except AnthropicMessagesEmptyContentError:
+ hwarn("Anthropic response has empty content")
  return RequestResult(
  success=False,
  cached=False,
@@ -399,32 +430,41 @@
  error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
  )

- if _is_content_moderation_failure(response):
- hlog(
- f"WARNING: Returning empty request for {request.model_deployment} "
- "due to content moderation filter"
- )
+ if _is_content_moderation_failure(raw_response):
+ hwarn(f"Returning empty request for {request.model_deployment} " "due to content moderation filter")
  return RequestResult(
  success=False,
  cached=cached,
- error=response["error"]["message"],
+ error=raw_response["error"]["message"],
  completions=[],
  embedding=[],
  error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
- request_time=response["request_time"],
- request_datetime=response["request_datetime"],
+ request_time=raw_response["request_time"],
+ request_datetime=raw_response["request_datetime"],
  )

+ response_message: Message = Message.model_validate(raw_response)
+ response_text: Optional[str] = None
+ response_thinking: Optional[str] = None
+ for content in response_message.content:
+ if isinstance(content, TextBlock):
+ response_text = content.text
+ elif isinstance(content, ThinkingBlock):
+ response_thinking = content.thinking
+ if response_text is None:
+ raise Exception("Anthropic response did not contain text block")
  completion = truncate_and_tokenize_response_text(
- response["content"][0]["text"], request, self.tokenizer, self.tokenizer_name, original_finish_reason=""
+ response_text, request, self.tokenizer, self.tokenizer_name, original_finish_reason=""
  )
+ if response_thinking is not None:
+ completion = dataclasses.replace(completion, thinking=Thinking(text=response_thinking))
  completions.append(completion)

  return RequestResult(
  success=True,
  cached=cached,
- request_time=response["request_time"],
- request_datetime=response["request_datetime"],
+ request_time=raw_response["request_time"],
+ request_datetime=raw_response["request_datetime"],
  completions=completions,
  embedding=[],
  )
@@ -617,8 +657,8 @@ class AnthropicLegacyClient(CachingClient):
  if logprobs["tokens"] != tokens:
  # This is a known limitation with the Anthropic API. For now keep track of the
  # entries with the mismatch.
- hlog(
- f"WARNING: naive truncation for logprobs did not work."
+ hwarn(
+ f"naive truncation for logprobs did not work."
  f"\nRequest:{raw_request}\nExpected: {tokens}\nActual: {logprobs['tokens']}"
  )
  check_logprobs = True
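The anthropic_client.py hunks above add constructor options for extended thinking, streaming, and a deployment-specific model name. For orientation, a rough sketch of the Messages API payload the updated client builds when a thinking budget is configured; the model name, budget, and message content below are placeholder values, not taken from this diff:

```python
# Illustrative only: mirrors the raw_request construction shown in the hunks above.
thinking_budget_tokens = 4096  # placeholder; in HELM this would come from the model deployment config

raw_request = {
    "messages": [{"role": "user", "content": "Prove that sqrt(2) is irrational."}],  # placeholder prompt
    "model": "claude-3-7-sonnet-20250219",  # placeholder; really self.anthropic_model_name or request.model_engine
    "stop_sequences": [],
    "max_tokens": 16000,
    "temperature": 1.0,
    "top_k": 40,
    "top_p": 1.0,
}
if thinking_budget_tokens:
    # Matches the new branch in make_request: enable thinking and drop top_k,
    # since the API rejects top_k when extended thinking is enabled.
    raw_request["thinking"] = {"type": "enabled", "budget_tokens": thinking_budget_tokens}
    del raw_request["top_k"]
```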
helm/clients/audio_language/diva_llama_client.py
@@ -96,9 +96,11 @@ class DivaLlamaClient(CachingClient):
  with _LOCK:
  audio_input, text_input = DivaLlamaClient._get_generate_input(request)
  if text_input is None:
- return {"completions": self.pre_trained_model.generate([audio_input])}
+ return {"completions": self.pre_trained_model.generate([audio_input])} # type: ignore
  else:
- return {"completions": self.pre_trained_model.generate([audio_input], [text_input])}
+ return {
+ "completions": self.pre_trained_model.generate([audio_input], [text_input]) # type: ignore
+ }

  cache_key = CachingClient.make_cache_key(raw_request, request)
  response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
helm/clients/audio_language/qwen2_5_omni_client.py
@@ -0,0 +1,197 @@
+ from threading import Lock
+ import torch
+ from typing import Any, Dict, List, Optional
+
+ from dataclasses import dataclass
+ from helm.clients.audio_language.qwen_omni.modeling_qwen2_5_omni import Qwen2_5OmniModel
+ from helm.clients.audio_language.qwen_omni.processing_qwen2_5_omni import Qwen2_5OmniProcessor
+ from helm.clients.audio_language.qwen_omni.qwen2_5_omni_utils.v2_5 import process_mm_info
+
+ from helm.common.cache import CacheConfig
+ from helm.common.gpu_utils import get_torch_device_name
+ from helm.common.hierarchical_logger import hlog, htrack_block
+ from helm.common.media_object import TEXT_TYPE
+ from helm.common.request import Request, RequestResult, GeneratedOutput, Token
+ from helm.common.request import wrap_request_time
+ from helm.clients.client import CachingClient, generate_uid_for_multimodal_prompt
+
+
+ @dataclass(frozen=True)
+ class LoadedQwen2_5OmniModelProcessor:
+ """Loaded model and processor for Qwen."""
+
+ model: Qwen2_5OmniModel
+ tokenizer: Qwen2_5OmniProcessor
+
+
+ _models_lock: Lock = Lock()
+ _models: Dict[str, Optional[LoadedQwen2_5OmniModelProcessor]] = {
+ "Qwen/Qwen2.5-Omni-7B": None,
+ }
+
+
+ class Qwen2_5OmniAudioLMClient(CachingClient):
+ """
+ From https://huggingface.co/Qwen/Qwen2.5-Omni-7B,
+ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse modalities, including text,
+ images, audio, and video, while simultaneously generating text and natural speech responses in a streaming manner.
+
+ Paper: https://arxiv.org/abs/2503.20215
+ """
+
+ END_OF_TEXT_TOKEN: str = "<|endoftext|>>"
+
+ def __init__(self, cache_config: CacheConfig):
+ super().__init__(cache_config=cache_config)
+ self._device: str = get_torch_device_name()
+
+ def _get_model(self, helm_model_name: str) -> LoadedQwen2_5OmniModelProcessor:
+ global _models_lock
+ global _models
+
+ model_name: str
+ if helm_model_name == "qwen2.5-omni-7b":
+ model_name = "Qwen/Qwen2.5-Omni-7B"
+ else:
+ raise ValueError(f"Unhandled model name: {helm_model_name}")
+
+ # Ensure that only one thread is loading the model at a time
+ with _models_lock:
+ loaded_model_processor = _models[model_name]
+ if loaded_model_processor is None:
+ hlog(f"Loading model {model_name} and caching in memory...")
+ model = Qwen2_5OmniModel.from_pretrained(
+ model_name,
+ attn_implementation="flash_attention_2",
+ torch_dtype=torch.bfloat16,
+ device_map=self._device,
+ ).eval()
+ tokenizer = Qwen2_5OmniProcessor.from_pretrained(
+ model_name,
+ )
+ _models[model_name] = LoadedQwen2_5OmniModelProcessor(model, tokenizer)
+ loaded_model_processor = _models[model_name]
+
+ assert loaded_model_processor is not None
+ return loaded_model_processor
+
+ def make_request(self, request: Request) -> RequestResult:
+ assert request.multimodal_prompt is not None, "Multimodal prompt is required"
+
+ loaded_model_processor: LoadedQwen2_5OmniModelProcessor = self._get_model(request.model_engine)
+ model = loaded_model_processor.model
+ tokenizer = loaded_model_processor.tokenizer
+
+ input_query: List[Dict[str, Any]] = []
+ query: List[Dict[str, str]] = []
+ prompt_text: str = ""
+
+ input_query.append(
+ {
+ "role": "system",
+ "content": (
+ "You are Qwen, a virtual human developed by the Qwen Team,"
+ " Alibaba Group, capable of perceiving auditory and visual inputs,"
+ " as well as generating text and speech."
+ ),
+ }
+ )
+ # prompt_text += "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+ for media_num, media_object in enumerate(request.multimodal_prompt.media_objects):
+ if media_object.is_type("audio") and media_object.location:
+ assert media_object.is_local_file, "Only local audio files are supported"
+ query.append({"type": "audio", "audio": media_object.location})
+
+ # prompt_text += f"<|im_start|>user\nAudio {media_num+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
+ elif media_object.is_type(TEXT_TYPE):
+ if media_object.text is None:
+ raise ValueError("MediaObject of text type has missing text field value")
+ query.append({"type": "text", "text": media_object.text})
+ # prompt_text += media_object.text
+ else:
+ raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
+ # prompt_text += "<|im_end|>\n<|im_start|>assistant\n"
+
+ input_query.append({"role": "user", "content": query})
+
+ completions: List[GeneratedOutput] = []
+ request_time: float = 0
+ request_datetime: Optional[int] = None
+ all_cached: bool = True
+
+ with htrack_block(f"Generating for prompt: {prompt_text}"):
+ for completion_index in range(request.num_completions):
+ try:
+
+ def do_it() -> Dict[str, Any]:
+ # Refer to the official Qwen2.5-Omni documentation for the format of the input query
+ # https://huggingface.co/Qwen/Qwen2.5-Omni-7B
+ USE_AUDIO_IN_VIDEO = True
+ text = tokenizer.apply_chat_template(input_query, add_generation_prompt=True, tokenize=False)
+ audios, images, videos = process_mm_info(input_query, use_audio_in_video=USE_AUDIO_IN_VIDEO)
+ inputs = tokenizer(
+ text=text,
+ audios=audios,
+ images=images,
+ videos=videos,
+ return_tensors="pt",
+ padding=True,
+ use_audio_in_video=USE_AUDIO_IN_VIDEO,
+ )
+ inputs = inputs.to(self._device, torch.bfloat16)
+ input_seq_length = len(inputs.input_ids[0])
+ # The model runs into errors when setting thinker_max_new_tokens to 1
+ if request.max_tokens != 1:
+ pred, _ = model.generate(**inputs, thinker_max_new_tokens=request.max_tokens)
+ pred_decode = pred.cpu()[0][input_seq_length:]
+ else:
+ pred, _ = model.generate(**inputs)
+ pred_decode = pred.cpu()[0][input_seq_length : input_seq_length + 1]
+ completion = tokenizer.decode(
+ pred_decode,
+ skip_special_tokens=True,
+ clean_up_tokenization_spaces=False,
+ )
+ # The processor of Qwen2-Audio-Instruct consists an AutoTokenizer and a WhisperFeatureExtractor
+ tokens: List[str] = tokenizer.tokenizer.tokenize(completion) # type: ignore
+ return {"output": (completion, tokens)}
+
+ # Include the prompt and model name in the cache key
+ cache_key = CachingClient.make_cache_key(
+ raw_request={
+ "completion_index": completion_index,
+ "model": request.model,
+ "prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
+ "max_tokens": request.max_tokens,
+ },
+ request=request,
+ )
+ result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+ except RuntimeError as model_error:
+ return RequestResult(
+ success=False, cached=False, error=str(model_error), completions=[], embedding=[]
+ )
+
+ text, tokens = result["output"]
+ hlog(f"Generated: {text}")
+
+ # Tokenize truncated text to get the list of tokens
+ completions.append(
+ GeneratedOutput(
+ text=text, logprob=0, tokens=[Token(text=str(token), logprob=0) for token in tokens]
+ )
+ )
+
+ request_time += result["request_time"]
+ # Use the datetime from the first completion because that's when the request was fired
+ request_datetime = request_datetime or result.get("request_datetime")
+ all_cached = all_cached and cached
+
+ return RequestResult(
+ success=True,
+ cached=all_cached,
+ request_time=request_time,
+ request_datetime=request_datetime,
+ completions=completions,
+ embedding=[],
+ )
helm/clients/audio_language/qwen2_audiolm_client.py
@@ -113,7 +113,9 @@ class Qwen2AudioLMClient(CachingClient):
  try:

  def do_it() -> Dict[str, Any]:
- inputs = tokenizer.apply_chat_template(input_query, add_generation_prompt=True, tokenize=False)
+ inputs = tokenizer.apply_chat_template( # type: ignore
+ input_query, add_generation_prompt=True, tokenize=False
+ )
  audios: List[Any] = []
  # Refer to the official Qwen2-Audio documentation for the format of the input query
  # https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct
@@ -124,13 +126,13 @@ class Qwen2AudioLMClient(CachingClient):
  audios.append(
  librosa.load(
  element["audio_url"],
- sr=tokenizer.feature_extractor.sampling_rate,
+ sr=tokenizer.feature_extractor.sampling_rate, # type: ignore
  )[0]
  )
- inputs = tokenizer(
+ inputs = tokenizer( # type: ignore
  text=inputs,
  audios=audios,
- sampling_rate=tokenizer.feature_extractor.sampling_rate,
+ sampling_rate=tokenizer.feature_extractor.sampling_rate, # type: ignore
  return_tensors="pt",
  padding=True,
  )
@@ -140,11 +142,11 @@ class Qwen2AudioLMClient(CachingClient):
  inputs = inputs.to(self._device)
  pred = model.generate(**inputs, max_length=request.max_tokens + input_length)[:, input_length:]

- completion = tokenizer.decode(
+ completion = tokenizer.decode( # type: ignore
  pred.cpu()[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
  )
  # The processor of Qwen2-Audio-Instruct consists an AutoTokenizer and a WhisperFeatureExtractor
- tokens: List[str] = tokenizer.tokenizer.tokenize(completion)
+ tokens: List[str] = tokenizer.tokenizer.tokenize(completion) # type: ignore
  return {"output": (completion, tokens)}

  # Include the prompt and model name in the cache key
helm/clients/audio_language/qwen_audiolm_client.py
@@ -106,8 +106,10 @@ class QwenAudioLMClient(CachingClient):
  try:

  def do_it() -> Dict[str, Any]:
- completion, _ = model.chat(tokenizer, query=tokenizer.from_list_format(query), history=None)
- tokens: List[str] = tokenizer.tokenize(completion)
+ completion, _ = model.chat( # type: ignore
+ tokenizer, query=tokenizer.from_list_format(query), history=None # type: ignore
+ )
+ tokens: List[str] = tokenizer.tokenize(completion) # type: ignore
  return {"output": (completion, tokens)}

  # Include the prompt and model name in the cache key
helm/clients/audio_language/test.py
@@ -0,0 +1,62 @@
+ import soundfile as sf
+
+ from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor # type: ignore
+ from qwen_omni_utils import process_mm_info
+
+ # default: Load the model on the available device(s)
+ model = Qwen2_5OmniModel.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto")
+
+ # We recommend enabling flash_attention_2 for better acceleration and memory saving.
+ # model = Qwen2_5OmniModel.from_pretrained(
+ # "Qwen/Qwen2.5-Omni-7B",
+ # torch_dtype="auto",
+ # device_map="auto",
+ # attn_implementation="flash_attention_2",
+ # )
+
+ processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
+
+ conversation = [
+ {
+ "role": "system",
+ "content": (
+ "You are Qwen, a virtual human developed by the Qwen Team,"
+ " Alibaba Group, capable of perceiving auditory and visual"
+ " inputs, as well as generating text and speech."
+ ),
+ },
+ {
+ "role": "user",
+ "content": [
+ {"type": "video", "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4"},
+ ],
+ },
+ ]
+
+ # set use audio in video
+ USE_AUDIO_IN_VIDEO = True
+
+ # Preparation for inference
+ text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+ audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
+ inputs = processor(
+ text=text,
+ audios=audios,
+ images=images,
+ videos=videos,
+ return_tensors="pt",
+ padding=True,
+ use_audio_in_video=USE_AUDIO_IN_VIDEO,
+ )
+ inputs = inputs.to(model.device).to(model.dtype)
+
+ # Inference: Generation of the output text and audio
+ text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)
+
+ text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+ print(text)
+ sf.write(
+ "output.wav",
+ audio.reshape(-1).detach().cpu().numpy(),
+ samplerate=24000,
+ )