crfm-helm 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56)
  1. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +7 -3
  2. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/RECORD +53 -41
  3. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  5. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  6. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
  7. helm/benchmark/augmentations/perturbation.py +17 -1
  8. helm/benchmark/augmentations/test_perturbation.py +30 -0
  9. helm/benchmark/metrics/efficiency_metrics.py +9 -2
  10. helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
  11. helm/benchmark/metrics/vision_language/image_metrics.py +142 -17
  12. helm/benchmark/model_metadata_registry.py +5 -1
  13. helm/benchmark/run_expander.py +35 -63
  14. helm/benchmark/run_spec_factory.py +11 -10
  15. helm/benchmark/run_specs/vlm_run_specs.py +294 -38
  16. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  17. helm/benchmark/scenarios/math_scenario.py +1 -1
  18. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  19. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  20. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  21. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  22. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
  23. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -1
  24. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +1 -1
  25. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  26. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  27. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  28. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  29. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  30. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  31. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
  32. helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
  33. helm/benchmark/static/schema_image2structure.yaml +304 -0
  34. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  35. helm/benchmark/static/schema_vlm.yaml +257 -10
  36. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  37. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  38. helm/benchmark/static_build/index.html +2 -2
  39. helm/clients/anthropic_client.py +36 -6
  40. helm/clients/openai_client.py +2 -3
  41. helm/clients/together_client.py +93 -2
  42. helm/clients/vertexai_client.py +59 -50
  43. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  44. helm/clients/vision_language/huggingface_vlm_client.py +11 -4
  45. helm/clients/vision_language/idefics_client.py +2 -2
  46. helm/common/images_utils.py +10 -3
  47. helm/config/model_deployments.yaml +100 -2
  48. helm/config/model_metadata.yaml +136 -31
  49. helm/config/tokenizer_configs.yaml +7 -0
  50. helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
  51. helm/benchmark/static_build/assets/index-d839df55.js +0 -9
  52. helm/benchmark/test_model_deployment_definition.py +0 -90
  53. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  54. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +0 -0
  55. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  56. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0

helm/benchmark/static_build/index.html
@@ -7,11 +7,11 @@
     <title>Holistic Evaluation of Language Models (HELM)</title>
     <meta name="description" content="The Holistic Evaluation of Language Models (HELM) serves as a living benchmark for transparency in language models. Providing broad coverage and recognizing incompleteness, multi-metric measurements, and standardization. All data and analysis are freely accessible on the website for exploration and study." />
     <script type="text/javascript" src="./config.js"></script>
-    <script type="module" crossorigin src="./assets/index-d839df55.js"></script>
+    <script type="module" crossorigin src="./assets/index-737eef9e.js"></script>
     <link rel="modulepreload" crossorigin href="./assets/react-d4a0b69b.js">
     <link rel="modulepreload" crossorigin href="./assets/recharts-6d337683.js">
     <link rel="modulepreload" crossorigin href="./assets/tremor-54a99cc4.js">
-    <link rel="stylesheet" href="./assets/index-5088afcb.css">
+    <link rel="stylesheet" href="./assets/index-878a1094.css">
   </head>
   <body class="block">
     <div id="root"></div>

helm/clients/anthropic_client.py
@@ -1,6 +1,7 @@
 from typing import Any, Dict, List, Optional, TypedDict, Union, cast
 import json
 import requests
+import tempfile
 import time
 import urllib.parse
 
@@ -68,6 +69,9 @@ class AnthropicClient(CachingClient):
     MAX_COMPLETION_LENGTH: int = (
         8192  # See https://docs.google.com/document/d/1vX6xgoA-KEKxqtMlBVAqYvE8KUfZ7ABCjTxAjf1T5kI/edit#
     )
+    # An Anthropic error message: "At least one of the image dimensions exceed max allowed size: 8000 pixels"
+    MAX_IMAGE_DIMENSION: int = 8000
+
     ADDITIONAL_TOKENS: int = 5
     PROMPT_ANSWER_START: str = "The answer is "
 
@@ -206,7 +210,7 @@ class AnthropicClient(CachingClient):
 
 
 def _is_content_moderation_failure(response: Dict) -> bool:
-    """Return whether a a response failed because of the content moderation filter."""
+    """Return whether a response failed because of the content moderation filter."""
     if (
         "error" in response
         and "message" in response["error"]
@@ -238,7 +242,7 @@ class AnthropicMessagesResponseError(Exception):
 
 class AnthropicMessagesClient(CachingClient):
     # Source: https://docs.anthropic.com/claude/docs/models-overview
-    MAX_OUTPUT_TOKENS = 4096
+    MAX_OUTPUT_TOKENS: int = 4096
 
     def __init__(
         self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig, api_key: Optional[str] = None
@@ -273,7 +277,7 @@ class AnthropicMessagesClient(CachingClient):
             # TODO(#2439): Refactor out Request validation
             if request.messages is not None or request.prompt:
                 raise AnthropicMessagesRequestError(
-                    "Exactly one of Request.messages, Request.prompt or Request.multimodel_prompt should be set"
+                    "Exactly one of Request.messages, Request.prompt or Request.multimodal_prompt should be set"
                 )
             blocks: List[Union[TextBlockParam, ImageBlockParam]] = []
             for media_object in request.multimodal_prompt.media_objects:
@@ -282,9 +286,33 @@ class AnthropicMessagesClient(CachingClient):
                     if not media_object.location:
                         raise Exception("MediaObject of image type has missing location field value")
 
-                    from helm.common.images_utils import encode_base64
+                    from helm.common.images_utils import encode_base64, get_dimensions, copy_image
+
+                    image_location: str = media_object.location
+                    base64_image: str
+
+                    image_width, image_height = get_dimensions(media_object.location)
+                    if (
+                        image_width > AnthropicClient.MAX_IMAGE_DIMENSION
+                        or image_height > AnthropicClient.MAX_IMAGE_DIMENSION
+                    ):
+                        hlog(
+                            f"WARNING: Image {image_location} exceeds max allowed size: "
+                            f"{AnthropicClient.MAX_IMAGE_DIMENSION} pixels"
+                        )
+                        # Save the resized image to a temporary file
+                        with tempfile.NamedTemporaryFile(suffix=".jpg") as temp_file:
+                            hlog(f"Resizing image to temporary path: {temp_file.name}")
+                            copy_image(
+                                src=image_location,
+                                dest=temp_file.name,
+                                width=min(image_width, AnthropicClient.MAX_IMAGE_DIMENSION),
+                                height=min(image_height, AnthropicClient.MAX_IMAGE_DIMENSION),
+                            )
+                            base64_image = encode_base64(temp_file.name, format="JPEG")
+                    else:
+                        base64_image = encode_base64(image_location, format="JPEG")
 
-                    base64_image: str = encode_base64(media_object.location, format="JPEG")
                     image_block: ImageBlockParam = {
                         "type": "image",
                         "source": {
@@ -302,7 +330,9 @@ class AnthropicMessagesClient(CachingClient):
                         "type": "text",
                         "text": media_object.text,
                     }
-                    blocks.append(text_block)
+                    # Anthropic does not support empty text blocks
+                    if media_object.text.strip():
+                        blocks.append(text_block)
             messages = [{"role": "user", "content": blocks}]
 
         else:
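
The resize path above leans on get_dimensions and copy_image from helm/common/images_utils.py (also touched in this release, +10 -3), whose bodies are not shown in this diff. As a rough sketch only, helpers with the signatures the Anthropic client calls could be written with Pillow along these lines; the actual HELM implementations may differ.

    # Hypothetical sketch of the image helpers called above; not the code shipped in images_utils.py.
    from typing import Tuple

    from PIL import Image


    def get_dimensions(image_location: str) -> Tuple[int, int]:
        """Return (width, height) of the image stored at the given path."""
        with Image.open(image_location) as image:
            return image.size


    def copy_image(src: str, dest: str, width: int, height: int) -> None:
        """Copy the image at src to dest, resizing it to width x height pixels."""
        with Image.open(src) as image:
            image.resize((width, height)).convert("RGB").save(dest)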

helm/clients/openai_client.py
@@ -130,9 +130,8 @@ class OpenAIClient(CachingClient):
                     from helm.common.images_utils import encode_base64
 
                     base64_image: str = encode_base64(media_object.location)
-                    content.append(
-                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
-                    )
+                    image_object: Dict[str, str] = {"url": f"data:image/jpeg;base64,{base64_image}"}
+                    content.append({"type": "image_url", "image_url": image_object})
                 elif media_object.is_type(TEXT_TYPE):
                     if media_object.text is None:
                         raise ValueError("MediaObject of text type has missing text field value")
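
The refactor above only names the intermediate dict; the payload the client sends is unchanged and follows the OpenAI chat vision content format. For orientation, a single user turn with one text part and one base64-encoded image ends up shaped roughly like this (values are placeholders):

    # Illustrative shape of the "content" list built above; the base64 payload is a placeholder.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image."},
                {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,<BASE64_DATA>"}},
            ],
        }
    ]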

helm/clients/together_client.py
@@ -1,12 +1,20 @@
 from copy import deepcopy
-from typing import List, Dict, Any, Optional, Union
+from itertools import zip_longest
+from typing import List, Dict, Any, Optional, TypedDict, Union
 
 import requests
 from retrying import retry
 
 from helm.common.cache import CacheConfig
+from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
-from .client import CachingClient, truncate_sequence, cleanup_str
+from helm.clients.client import CachingClient, truncate_sequence, cleanup_str
+
+try:
+    from together import Together
+    from together.types import ChatCompletionResponse
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["together"])
 
 
 class _RewriteRequestTags:
@@ -272,3 +280,86 @@ class TogetherClient(CachingClient):
             completions=completions,
             embedding=[],
         )
+
+
+class TogetherRawChatRequest(TypedDict):
+    messages: List[Dict[str, str]]
+    model: str
+    max_tokens: int
+    stop: List[str]
+    temperature: float
+    top_p: float
+    top_k: int
+    logprobs: int
+    echo: bool
+    n: int
+
+
+def convert_to_raw_chat_request(request: Request) -> TogetherRawChatRequest:
+    if request.messages:
+        messages = request.messages
+    else:
+        messages = [{"role": "user", "content": request.prompt}]
+    return {
+        "messages": messages,
+        "model": request.model,
+        "max_tokens": request.max_tokens,
+        "stop": request.stop_sequences,
+        "temperature": request.temperature,
+        "top_p": request.top_p,
+        "top_k": request.top_k_per_token,
+        "logprobs": min(request.top_k_per_token, 1),
+        "echo": request.echo_prompt,
+        "n": request.num_completions,
+    }
+
+
+class TogetherChatClient(CachingClient):
+    """Client that uses the Python Together library for chat models."""
+
+    def __init__(self, cache_config: CacheConfig, api_key: str, together_model: Optional[str] = None):
+        super().__init__(cache_config=cache_config)
+        self._client = Together(api_key=api_key)
+
+    def make_request(self, request: Request) -> RequestResult:
+        raw_request = convert_to_raw_chat_request(request)
+        cache_key = CachingClient.make_cache_key(raw_request, request)
+
+        def do_it() -> Dict[Any, Any]:
+            response = self._client.chat.completions.create(**raw_request)
+            return response.model_dump(mode="json")
+
+        try:
+            raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+            response = ChatCompletionResponse.model_validate(raw_response)
+        except Exception as error:
+            return RequestResult(
+                success=False,
+                cached=False,
+                error=str(error),
+                completions=[],
+                embedding=[],
+            )
+
+        generated_outputs: List[GeneratedOutput] = []
+        for choice in response.choices:
+            # NOTE: Together always returns None for choice.finish_reason
+            # NOTE: Together does not return logprobs for the whole generated output, only for individual tokens
+            tokens: List[Token] = []
+            if choice.logprobs:
+                for token_text, token_logprob in zip_longest(
+                    choice.logprobs.tokens or [], choice.logprobs.token_logprobs or []
+                ):
+                    if token_text is None:
+                        break
+                    tokens.append(Token(text=token_text, logprob=token_logprob or 0.0))
+            assert choice.message.role == "assistant"
+            generated_outputs.append(GeneratedOutput(text=choice.message.content, logprob=0.0, tokens=tokens))
+        return RequestResult(
+            success=True,
+            cached=cached,
+            request_time=raw_response["request_time"],
+            request_datetime=raw_response["request_datetime"],
+            completions=generated_outputs,
+            embedding=[],
+        )

helm/clients/vertexai_client.py
@@ -4,7 +4,6 @@ from threading import Lock
 from typing import Any, Dict, Optional, List, Union
 
 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import hlog
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, ErrorFlags
@@ -131,12 +130,6 @@ class VertexAITextClient(VertexAIClient):
 class VertexAIChatClient(VertexAIClient):
     """Client for Vertex AI chat models (e.g., Gemini). Supports multimodal prompts."""
 
-    # Set the finish reason to this if the prompt violates the content policy
-    CONTENT_POLICY_VIOLATED_FINISH_REASON: str = "The prompt violates Google's content policy."
-
-    # Gemini returns this error for certain valid requests
-    CONTENT_HAS_NO_PARTS_ERROR: str = "Content has no parts."
-
     # Enum taken from:
     # https://cloud.google.com/vertex-ai/docs/reference/rpc/google.cloud.aiplatform.v1beta1#google.cloud.aiplatform.v1beta1.Candidate.FinishReason
     # We don't directly import this enum because it can differ between different Vertex AI library versions.
@@ -149,7 +142,7 @@ class VertexAIChatClient(VertexAIClient):
     ]
 
     @staticmethod
-    def get_model(model_name: str) -> Any:
+    def get_model(model_name: str) -> GenerativeModel:
         global _models_lock
         global _models
 
@@ -202,21 +195,22 @@ class VertexAIChatClient(VertexAIClient):
             )
             candidates: List[Candidate] = response.candidates
 
-            # Depending on the version of the Vertex AI library and the type of content blocking,
-            # content blocking can show up in many ways, so this defensively handles most of these ways
+            # Depending on the version of the Vertex AI library and the type of prompt blocking,
+            # prompt blocking can show up in many ways, so this defensively handles most of these ways
+            if response.prompt_feedback.block_reason:
+                raise VertexAIContentBlockedError(
+                    f"Prompt blocked with reason: {response.prompt_feedback.block_reason}"
+                )
             if not candidates:
-                raise VertexAIContentBlockedError("No candidates in response due to content blocking")
+                raise VertexAIContentBlockedError(f"No candidates in response: {response}")
             predictions: List[Dict[str, Any]] = []
             for candidate in candidates:
-                if (
-                    candidate.finish_reason in VertexAIChatClient.CONTENT_BLOCKED_FINISH_REASONS
-                    or not candidate.content.parts
-                ):
-                    # The prediction was either blocked due to safety settings or the model stopped and returned
-                    # nothing (which also happens when the model is blocked).
-                    # For now, we don't cache blocked requests, because we are trying to get the
-                    # content blocking removed.
-                    raise VertexAIContentBlockedError("Content has no parts due to content blocking")
+                # Depending on the version of the Vertex AI library and the type of prompt blocking,
+                # content blocking can show up in many ways, so this defensively handles most of these ways
+                if candidate.finish_reason in VertexAIChatClient.CONTENT_BLOCKED_FINISH_REASONS:
+                    raise VertexAIContentBlockedError(f"Content blocked with reason: {candidate.finish_reason}")
+                if not candidate.content.parts:
+                    raise VertexAIContentBlockedError(f"No parts in candidate: {candidate}")
                 predictions.append({"text": candidate.content.text})
             # TODO: Extract more information from the response
             return {"predictions": predictions}
@@ -234,11 +228,11 @@ class VertexAIChatClient(VertexAIClient):
             )
 
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
-        except VertexAIContentBlockedError:
+        except VertexAIContentBlockedError as e:
             return RequestResult(
                 success=False,
                 cached=False,
-                error="Response was empty due to content moderation filter",
+                error=f"Content blocked: {str(e)}",
                 completions=[],
                 embedding=[],
                 error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
@@ -252,7 +246,7 @@ class VertexAIChatClient(VertexAIClient):
             return RequestResult(
                 success=False,
                 cached=False,
-                error="Response was empty due to content moderation filter",
+                error=f"Content blocked error in cached response: {str(response)}",
                 completions=[],
                 embedding=[],
                 error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
@@ -266,7 +260,7 @@ class VertexAIChatClient(VertexAIClient):
                 return RequestResult(
                     success=False,
                     cached=False,
-                    error="Response was empty due to content moderation filter",
+                    error=f"Content blocked error in cached prediction: {str(prediction)}",
                     completions=[],
                     embedding=[],
                     error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
@@ -291,21 +285,6 @@ class VertexAIChatClient(VertexAIClient):
         )
 
     def _make_multimodal_request(self, request: Request) -> RequestResult:
-        def complete_for_valid_error(error_message: str) -> RequestResult:
-            empty_completion = GeneratedOutput(
-                text="",
-                logprob=0,
-                tokens=[],
-                finish_reason={"reason": error_message},
-            )
-            return RequestResult(
-                success=True,
-                cached=False,
-                request_time=0,
-                completions=[empty_completion] * request.num_completions,
-                embedding=[],
-            )
-
         # Contents can either be text or a list of multimodal content made up of text, images or other content
         contents: Union[str, List[Union[str, Any]]] = request.prompt
         # Used to generate a unique cache key for this specific request
@@ -346,14 +325,29 @@ class VertexAIChatClient(VertexAIClient):
             try:
 
                 def do_it() -> Dict[str, Any]:
-                    raw_response = model.generate_content(
+                    response: GenerationResponse = model.generate_content(
                        contents, generation_config=parameters, safety_settings=self.safety_settings
                     )
-                    if raw_response._raw_response.prompt_feedback.block_reason != 0:
-                        hlog(f"Content blocked for prompt: {request.multimodal_prompt}")
-                        return {"error": self.CONTENT_POLICY_VIOLATED_FINISH_REASON}
-
-                    return {"predictions": [{"text": raw_response.candidates[0].text}]}
+                    # Depending on the version of the Vertex AI library and the type of prompt blocking,
+                    # prompt blocking can show up in many ways, so this defensively handles most of these ways
+                    if response.prompt_feedback.block_reason:
+                        raise VertexAIContentBlockedError(
+                            f"Prompt blocked with reason: {response.prompt_feedback.block_reason}"
+                        )
+                    if not response.candidates:
+                        raise VertexAIContentBlockedError(f"No candidates in response: {response}")
+                    # We should only have one candidate
+                    assert (
+                        len(response.candidates) == 1
+                    ), f"Expected 1 candidate since candidate_count is 1, got {len(response.candidates)}."
+                    candidate = response.candidates[0]
+                    # Depending on the version of the Vertex AI library and the type of prompt blocking,
+                    # content blocking can show up in many ways, so this defensively handles most of these ways
+                    if candidate.finish_reason in VertexAIChatClient.CONTENT_BLOCKED_FINISH_REASONS:
+                        raise VertexAIContentBlockedError(f"Content blocked with reason: {candidate.finish_reason}")
+                    if not candidate.content.parts:
+                        raise VertexAIContentBlockedError(f"No parts in candidate: {candidate}")
+                    return {"predictions": [{"text": candidate.text}]}
 
                 raw_cache_key = {"model_name": model_name, "prompt": prompt_key, **parameters}
                 if completion_index > 0:
@@ -361,15 +355,30 @@ class VertexAIChatClient(VertexAIClient):
 
                 cache_key = CachingClient.make_cache_key(raw_cache_key, request)
                 response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
-            except (requests.exceptions.RequestException, ValueError) as e:
-                if str(e) == self.CONTENT_HAS_NO_PARTS_ERROR:
-                    return complete_for_valid_error(self.CONTENT_HAS_NO_PARTS_ERROR)
-
+            except requests.exceptions.RequestException as e:
                 error: str = f"Gemini Vision error: {e}"
                 return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
+            except VertexAIContentBlockedError as e:
+                return RequestResult(
+                    success=False,
+                    cached=False,
+                    error=f"Content blocked: {str(e)}",
+                    completions=[],
+                    embedding=[],
+                    error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
+                )
 
             if "error" in response:
-                return complete_for_valid_error(response["error"])
+                return RequestResult(
+                    success=False,
+                    cached=True,
+                    error=f"Content blocked error in cached response: {str(response)}",
+                    completions=[],
+                    embedding=[],
+                    error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
+                    request_time=response["request_time"],
+                    request_datetime=response["request_datetime"],
+                )
 
             response_text = response["predictions"][0]["text"]
             completion = GeneratedOutput(text=response_text, logprob=0, tokens=[])
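
All of the Vertex AI changes funnel blocked prompts and blocked candidates into a single exception type, VertexAIContentBlockedError, which the callers then turn into a non-retriable, non-fatal RequestResult. The diff applies the same checks in two places; the standalone helper below is only an illustration of that shared logic, not code from the release:

    # Condensed illustration of the defensive checks added above.
    # `response` stands in for a Vertex AI GenerationResponse object.
    def raise_if_blocked(response) -> None:
        if response.prompt_feedback.block_reason:
            raise VertexAIContentBlockedError(
                f"Prompt blocked with reason: {response.prompt_feedback.block_reason}"
            )
        if not response.candidates:
            raise VertexAIContentBlockedError(f"No candidates in response: {response}")
        for candidate in response.candidates:
            if candidate.finish_reason in VertexAIChatClient.CONTENT_BLOCKED_FINISH_REASONS:
                raise VertexAIContentBlockedError(f"Content blocked with reason: {candidate.finish_reason}")
            if not candidate.content.parts:
                raise VertexAIContentBlockedError(f"No parts in candidate: {candidate}")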

helm/clients/vision_language/huggingface_vision2seq_client.py
@@ -0,0 +1,145 @@
+from threading import Lock
+from typing import Any, Dict, List, Optional
+
+from dataclasses import dataclass
+from transformers import AutoProcessor, AutoModelForVision2Seq
+from transformers.image_utils import load_image
+import torch
+
+from helm.common.cache import CacheConfig
+from helm.common.gpu_utils import get_torch_device_name, is_cuda_available
+from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.media_object import TEXT_TYPE
+from helm.common.request import Request, RequestResult, GeneratedOutput, Token
+from helm.common.request import wrap_request_time
+from helm.common.tokenization_request import TokenizationRequest
+from helm.clients.client import CachingClient, generate_uid_for_multimodal_prompt
+from helm.tokenizers.tokenizer import Tokenizer
+
+
+@dataclass(frozen=True)
+class Vision2SeqModelProcessor:
+    """Loaded model and processor."""
+
+    model: AutoModelForVision2Seq
+    processor: AutoProcessor
+
+
+_models_lock: Lock = Lock()
+_models: Dict[str, Optional[Vision2SeqModelProcessor]] = {
+    "HuggingFaceM4/idefics2-8b": None,
+}
+
+
+class HuggingFaceVision2SeqClient(CachingClient):
+    """
+    Models for Vision2Seq models from HuggingFace.
+    """
+
+    ASSISTANT_PREFIX: str = "Assistant:"
+
+    def __init__(self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig):
+        super().__init__(cache_config=cache_config)
+        self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer_name
+        self._device: str = get_torch_device_name()
+
+    def _get_model(self, checkpoint: str) -> Vision2SeqModelProcessor:
+        global _models_lock
+        global _models
+
+        # Ensure that only one thread is loading the model at a time
+        with _models_lock:
+            loaded_model_processor = _models[checkpoint]
+            if loaded_model_processor is None:
+                hlog(f"Loading model {checkpoint} and caching in memory...")
+                torch_dtype: torch.dtype = torch.float16 if is_cuda_available() else torch.float32
+                model = AutoModelForVision2Seq.from_pretrained(checkpoint, torch_dtype=torch_dtype).to(self._device)
+                processor = AutoProcessor.from_pretrained(checkpoint)
+
+                _models[checkpoint] = Vision2SeqModelProcessor(model, processor)
+                loaded_model_processor = _models[checkpoint]
+
+        assert loaded_model_processor is not None
+        return loaded_model_processor
+
+    def make_request(self, request: Request) -> RequestResult:
+        assert request.model_deployment in _models, f"Not a valid model for this client: {request.model_deployment}"
+        assert request.multimodal_prompt is not None, "Multimodal prompt is required"
+
+        loaded_model_processor: Vision2SeqModelProcessor = self._get_model(request.model_deployment)
+        model = loaded_model_processor.model
+        processor = loaded_model_processor.processor
+
+        generation_args: Dict[str, Any] = {
+            "max_new_tokens": request.max_tokens,
+        }
+
+        image_paths: List[str] = []
+        multimodal_prompt: List[Dict[str, str]] = []
+        for media_object in request.multimodal_prompt.media_objects:
+            if media_object.is_type("image") and media_object.location:
+                image_paths.append(media_object.location)
+                multimodal_prompt.append({"type": "image"})
+            elif media_object.is_type(TEXT_TYPE):
+                if media_object.text is None:
+                    raise ValueError("MediaObject of text type has missing text field value")
+
+                multimodal_prompt.append({"type": "text", "text": media_object.text})
+            else:
+                raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
+
+        completions: List[GeneratedOutput] = []
+        with htrack_block(f"Generating for prompt: {request.multimodal_prompt.text}"):
+            try:
+
+                def do_it() -> Dict[str, Any]:
+                    messages = [{"role": "user", "content": multimodal_prompt}]
+                    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+                    inputs = processor(
+                        text=[prompt] * request.num_completions,
+                        images=[
+                            [load_image(image_path) for image_path in image_paths]
+                            for _ in range(request.num_completions)
+                        ],
+                        return_tensors="pt",
+                    )
+                    inputs = {k: v.to(self._device) for k, v in inputs.items()}
+
+                    # Generate
+                    generated_ids = model.generate(**inputs, **generation_args)
+                    generated_texts: List[str] = processor.batch_decode(generated_ids, skip_special_tokens=True)
+                    return {"output": generated_texts}
+
+                # Include the prompt and model name in the cache key
+                cache_key = CachingClient.make_cache_key(
+                    raw_request={
+                        "n": request.num_completions,
+                        "model": request.model,
+                        "prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
+                        **generation_args,
+                    },
+                    request=request,
+                )
+                result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+            except RuntimeError as model_error:
+                return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])
+
+        for text in result["output"]:
+            hlog(f"Generated text: {text}")
+            assert self.ASSISTANT_PREFIX in text, f"Expected {self.ASSISTANT_PREFIX} in the output"
+            text = text.rpartition(self.ASSISTANT_PREFIX)[-1]
+            hlog(f"Truncated: {text}")
+            tokenization_result = self.tokenizer.tokenize(
+                TokenizationRequest(text, tokenizer=self.tokenizer_name, encode=False)
+            )
+            tokens: List[Token] = [Token(text=str(text), logprob=0) for text in tokenization_result.raw_tokens]
+            completions.append(GeneratedOutput(text=text, logprob=0, tokens=tokens))
+
+        return RequestResult(
+            success=True,
+            cached=cached,
+            request_time=result["request_time"],
+            completions=completions,
+            embedding=[],
+        )

helm/clients/vision_language/huggingface_vlm_client.py
@@ -25,7 +25,7 @@ except ModuleNotFoundError as e:
 
 class HuggingFaceVLMClient(CachingClient):
     """
-    General CLient for VLM models from HuggingFace.
+    General client for VLM models from HuggingFace.
     """
 
     _models_lock: Lock = Lock()
@@ -34,6 +34,10 @@ class HuggingFaceVLMClient(CachingClient):
         "huggingface/llava-1.5-7b-hf": "llava-hf/llava-1.5-7b-hf",
         "huggingface/llava-1.5-13b-hf": "llava-hf/llava-1.5-13b-hf",
         "huggingface/bakLlava-v1-hf": "llava-hf/bakLlava-v1-hf",
+        "huggingface/llava-v1.6-vicuna-7b-hf": "llava-hf/llava-v1.6-vicuna-7b-hf",
+        "huggingface/llava-v1.6-vicuna-13b-hf": "llava-hf/llava-v1.6-vicuna-13b-hf",
+        "huggingface/llava-v1.6-mistral-7b-hf": "llava-hf/llava-v1.6-mistral-7b-hf",
+        "huggingface/llava-v1.6-34b-hf": "llava-hf/llava-v1.6-34b-hf",
     }
 
     def __init__(self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig):
@@ -45,7 +49,7 @@ class HuggingFaceVLMClient(CachingClient):
         with self._models_lock:
             model_id: str = self._models_aliases.get(model_name, model_name)
             if model_id not in self._models:
-                self._models[model_id] = pipeline("image-to-text", model=model_id)
+                self._models[model_id] = pipeline("image-to-text", model=model_id, device_map="auto")
             return self._models[model_id]
 
     def make_request(self, request: Request) -> RequestResult:
@@ -90,11 +94,14 @@ class HuggingFaceVLMClient(CachingClient):
         except RuntimeError as e:
             return RequestResult(success=False, cached=False, error=str(e), completions=[], embedding=[])
 
+        output: str = result["generated_text"]
+        if "ASSISTANT: " in output:
+            output = output.split("ASSISTANT: ")[1]
         tokenization_result: TokenizationRequestResult = self.tokenizer.tokenize(
-            TokenizationRequest(result["generated_text"], tokenizer=self.tokenizer_name)
+            TokenizationRequest(output, tokenizer=self.tokenizer_name)
        )
         tokens: List[Token] = [Token(text=str(text), logprob=0) for text in tokenization_result.raw_tokens]
-        completions: List[GeneratedOutput] = [GeneratedOutput(text=result["generated_text"], logprob=0, tokens=tokens)]
+        completions: List[GeneratedOutput] = [GeneratedOutput(text=output, logprob=0, tokens=tokens)]
         return RequestResult(
             success=True,
             cached=cached,
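
The new post-processing strips the echoed chat prompt from the pipeline output before tokenization. A small self-contained illustration of the string handling (the output text is made up):

    # Made-up example of an image-to-text pipeline output for a LLaVA-style chat prompt.
    output = "USER: <image>\nWhat animal is this? ASSISTANT: A dog playing in the snow."
    if "ASSISTANT: " in output:
        output = output.split("ASSISTANT: ")[1]
    assert output == "A dog playing in the snow."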

helm/clients/vision_language/idefics_client.py
@@ -88,7 +88,7 @@ class IDEFICSClient(CachingClient):
 
         input_args: Dict[str, Union[str, bool]] = {"return_tensors": "pt"}
         generation_args = {
-            "max_length": request.max_tokens,
+            "max_new_tokens": request.max_tokens,
             "bad_words_ids": processor.tokenizer(self.BAD_WORD_TOKENS, add_special_tokens=False).input_ids,
         }
 
@@ -140,7 +140,7 @@ class IDEFICSClient(CachingClient):
 
         # Truncate the output text as IDEFICS outputs the entire sequence including the prompt
         if "instruct" in request.model:
-            assert self.ASSISTANT_PREFIX in text, f"Expected {self.ASSISTANT_PREFIX} in the output"
+            assert self.ASSISTANT_PREFIX in text, f"Expected {self.ASSISTANT_PREFIX} in the output: {text}"
            text = text.rpartition(self.ASSISTANT_PREFIX)[-1]
        else:
            # Best we can do is to remove the text portion of the prompt from the output
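
The switch from max_length to max_new_tokens matters because in transformers, max_length caps the total sequence (prompt plus completion) while max_new_tokens caps only the newly generated tokens; with a long multimodal prompt, max_length=request.max_tokens could leave no room to generate anything. A small standalone illustration using a text-only model (chosen so it runs without IDEFICS):

    # Sketch of the generate() semantics; gpt2 is used purely as a lightweight stand-in.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    inputs = tokenizer("The quick brown fox", return_tensors="pt")
    prompt_len = inputs["input_ids"].shape[1]

    # max_length counts the prompt too: with a 4-token prompt, at most 1 new token is produced here.
    capped_total = model.generate(**inputs, max_length=5)

    # max_new_tokens counts only the completion: up to 5 new tokens regardless of prompt length.
    capped_new = model.generate(**inputs, max_new_tokens=5)
    assert capped_new.shape[1] <= prompt_len + 5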