crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff shows the content changes between publicly released versions of this package, as published to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic; see the package's registry page for more details.

Files changed (580):
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  31. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  32. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  33. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  34. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  35. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  36. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  37. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  38. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  39. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  40. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  41. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  42. helm/benchmark/annotation/medalign_annotator.py +100 -0
  43. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  44. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  45. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  46. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  47. helm/benchmark/annotation/model_as_judge.py +218 -6
  48. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  49. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  50. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  51. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  52. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  53. helm/benchmark/annotation/spider_annotator.py +18 -0
  54. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  55. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  56. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  57. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  58. helm/benchmark/annotation_executor.py +35 -15
  59. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  60. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  61. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  62. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  63. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  64. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  65. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  66. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  67. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  68. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  69. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  70. helm/benchmark/augmentations/perturbation.py +1 -1
  71. helm/benchmark/augmentations/space_perturbation.py +2 -2
  72. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  73. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  74. helm/benchmark/augmentations/test_perturbation.py +16 -13
  75. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  76. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  77. helm/benchmark/data_preprocessor.py +2 -2
  78. helm/benchmark/huggingface_registration.py +2 -7
  79. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  80. helm/benchmark/metrics/basic_metrics.py +6 -6
  81. helm/benchmark/metrics/bbq_metrics.py +2 -2
  82. helm/benchmark/metrics/bias_metrics.py +12 -3
  83. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  84. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  85. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  86. helm/benchmark/metrics/classification_metrics.py +76 -12
  87. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  88. helm/benchmark/metrics/code_metrics.py +5 -5
  89. helm/benchmark/metrics/comet_metric.py +125 -0
  90. helm/benchmark/metrics/common_metric_specs.py +9 -2
  91. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  92. helm/benchmark/metrics/copyright_metrics.py +4 -4
  93. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  94. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  95. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  96. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  97. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  98. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  99. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  100. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  101. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  102. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  103. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  104. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  105. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  106. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  107. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  108. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  109. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  110. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  111. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  112. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  113. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  114. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  115. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  116. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  117. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  118. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  120. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  121. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  122. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  123. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  124. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  125. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  126. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  127. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  128. helm/benchmark/metrics/medalign_metrics.py +34 -0
  129. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  130. helm/benchmark/metrics/medec_metrics.py +101 -0
  131. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  132. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  133. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  134. helm/benchmark/metrics/metric.py +3 -3
  135. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  136. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  137. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  138. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  139. helm/benchmark/metrics/nltk_helper.py +32 -0
  140. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  141. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  142. helm/benchmark/metrics/output_processing_metric.py +60 -0
  143. helm/benchmark/metrics/output_processors.py +15 -0
  144. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  145. helm/benchmark/metrics/ranking_metrics.py +3 -3
  146. helm/benchmark/metrics/reference_metric.py +3 -3
  147. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  148. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  149. helm/benchmark/metrics/spider_metrics.py +7 -0
  150. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  151. helm/benchmark/metrics/statistic.py +1 -1
  152. helm/benchmark/metrics/summac/model_summac.py +1 -1
  153. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  154. helm/benchmark/metrics/summarization_metrics.py +19 -9
  155. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  156. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  157. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  158. helm/benchmark/metrics/test_metric.py +1 -1
  159. helm/benchmark/metrics/test_statistic.py +2 -2
  160. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  161. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  162. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  163. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  164. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  165. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  167. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  169. helm/benchmark/metrics/unitxt_metrics.py +4 -1
  170. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  171. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  172. helm/benchmark/model_metadata_registry.py +16 -0
  173. helm/benchmark/presentation/summarize.py +23 -10
  174. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  175. helm/benchmark/reeval_run.py +203 -0
  176. helm/benchmark/reeval_runner.py +355 -0
  177. helm/benchmark/run.py +8 -17
  178. helm/benchmark/run_expander.py +78 -8
  179. helm/benchmark/run_spec_factory.py +12 -0
  180. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  181. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  182. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  183. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  184. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  185. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  186. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  187. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  188. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  189. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  190. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  191. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  192. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  193. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  194. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  195. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  196. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  197. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  198. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  199. helm/benchmark/run_specs/vlm_run_specs.py +75 -2
  200. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  201. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  202. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  203. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  204. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  205. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  206. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  207. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  208. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  209. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  210. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  211. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  212. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  213. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  214. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  215. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  216. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  217. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  218. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  219. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  220. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  221. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  222. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  223. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  224. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  225. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  226. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  227. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  228. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  229. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  230. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  231. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  232. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  233. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  234. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  235. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  236. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  237. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  238. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  239. helm/benchmark/scenarios/bold_scenario.py +1 -1
  240. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  241. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  242. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  243. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  244. helm/benchmark/scenarios/clear_scenario.py +153 -0
  245. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  246. helm/benchmark/scenarios/code_scenario.py +17 -4
  247. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  248. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  249. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  250. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  251. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  252. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  253. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  254. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  255. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  256. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  257. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  258. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  259. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  260. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  261. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  262. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  263. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  264. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  265. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  266. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  267. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  268. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  269. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  270. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  271. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  272. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  273. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  274. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  275. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  276. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  277. helm/benchmark/scenarios/ice_scenario.py +8 -4
  278. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  279. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  280. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  281. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  282. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  283. helm/benchmark/scenarios/koala_scenario.py +1 -1
  284. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  285. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  286. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  287. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  288. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  289. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  290. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  291. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  292. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  293. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  294. helm/benchmark/scenarios/math_scenario.py +9 -1
  295. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  296. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  297. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  298. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  299. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  300. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  301. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  302. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  303. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  304. helm/benchmark/scenarios/medec_scenario.py +120 -0
  305. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  306. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  307. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  308. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  309. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  310. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  311. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  312. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  313. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  314. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  315. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  316. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  317. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  318. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  319. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  320. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  321. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  322. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  323. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  324. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  325. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  326. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  328. helm/benchmark/scenarios/quac_scenario.py +10 -1
  329. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  330. helm/benchmark/scenarios/raft_scenario.py +17 -2
  331. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  332. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  333. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  334. helm/benchmark/scenarios/scenario.py +9 -1
  335. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  336. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  337. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  338. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  339. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  340. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  341. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  342. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  343. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  344. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  345. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  346. helm/benchmark/scenarios/spider_scenario.py +91 -0
  347. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  348. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  349. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  350. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  351. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  352. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  353. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  354. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  355. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  356. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  357. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  358. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  359. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  360. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  361. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  362. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  363. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  364. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  366. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  367. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  368. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  369. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  370. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  371. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  372. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  373. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  374. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  375. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  376. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  377. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  378. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  379. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  380. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  381. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  382. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  383. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  384. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  385. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  386. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  387. helm/benchmark/server.py +11 -0
  388. helm/benchmark/slurm_runner.py +1 -1
  389. helm/benchmark/static/schema_audio.yaml +752 -0
  390. helm/benchmark/static/schema_autobencher.yaml +150 -0
  391. helm/benchmark/static/schema_call_center.yaml +97 -60
  392. helm/benchmark/static/schema_capabilities.yaml +254 -0
  393. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  394. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  395. helm/benchmark/static/schema_enterprise.yaml +298 -0
  396. helm/benchmark/static/schema_finance.yaml +14 -12
  397. helm/benchmark/static/schema_heim.yaml +1389 -0
  398. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  399. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  400. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  401. helm/benchmark/static/schema_safety.yaml +18 -1
  402. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  403. helm/benchmark/static/schema_social_audio.yaml +224 -0
  404. helm/benchmark/static/schema_sql.yaml +171 -0
  405. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  406. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  407. helm/benchmark/static/schema_vhelm.yaml +109 -36
  408. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  409. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  410. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  411. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  412. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  413. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  414. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  415. helm/benchmark/static_build/config.js +1 -1
  416. helm/benchmark/static_build/index.html +5 -5
  417. helm/benchmark/window_services/default_window_service.py +1 -1
  418. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  419. helm/benchmark/window_services/ice_window_service.py +1 -1
  420. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  421. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  422. helm/benchmark/window_services/local_window_service.py +2 -2
  423. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  424. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  425. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  426. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  427. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  428. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  429. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  430. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  431. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  432. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  433. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  434. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  435. helm/benchmark/window_services/test_utils.py +1 -1
  436. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  437. helm/benchmark/window_services/yalm_window_service.py +1 -1
  438. helm/clients/ai21_client.py +3 -3
  439. helm/clients/aleph_alpha_client.py +1 -1
  440. helm/clients/audio_language/__init__.py +0 -0
  441. helm/clients/audio_language/diva_llama_client.py +118 -0
  442. helm/clients/audio_language/llama_omni_client.py +198 -0
  443. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  444. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  445. helm/clients/auto_client.py +4 -2
  446. helm/clients/azure_openai_client.py +55 -0
  447. helm/clients/bedrock_client.py +201 -7
  448. helm/clients/bedrock_utils.py +33 -0
  449. helm/clients/clip_scorers/clip_scorer.py +1 -1
  450. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  451. helm/clients/cohere_client.py +3 -3
  452. helm/clients/google_client.py +1 -1
  453. helm/clients/http_model_client.py +1 -1
  454. helm/clients/huggingface_client.py +10 -18
  455. helm/clients/ibm_client.py +267 -0
  456. helm/clients/image_generation/adobe_vision_client.py +1 -1
  457. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  458. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  459. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  460. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  461. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  462. helm/clients/image_generation/cogview2_client.py +1 -1
  463. helm/clients/image_generation/dalle2_client.py +1 -1
  464. helm/clients/image_generation/dalle3_client.py +2 -2
  465. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  466. helm/clients/image_generation/dalle_mini/data.py +1 -1
  467. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  468. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  469. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  470. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  471. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  472. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  473. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  474. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  475. helm/clients/image_generation/dalle_mini_client.py +1 -1
  476. helm/clients/image_generation/deep_floyd_client.py +1 -1
  477. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  478. helm/clients/image_generation/lexica_client.py +1 -1
  479. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  480. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  481. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  482. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  483. helm/clients/image_generation/mindalle_client.py +1 -1
  484. helm/clients/image_generation/together_image_generation_client.py +1 -1
  485. helm/clients/lit_gpt_client.py +2 -2
  486. helm/clients/mistral_client.py +62 -18
  487. helm/clients/nvidia_nim_client.py +0 -3
  488. helm/clients/openai_client.py +241 -22
  489. helm/clients/palmyra_client.py +1 -4
  490. helm/clients/reka_client.py +1 -1
  491. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  492. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  493. helm/clients/stanfordhealthcare_google_client.py +43 -0
  494. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  495. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  496. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  497. helm/clients/test_client.py +1 -1
  498. helm/clients/test_together_client.py +6 -1
  499. helm/clients/together_client.py +47 -7
  500. helm/clients/upstage_client.py +23 -0
  501. helm/clients/vertexai_client.py +39 -13
  502. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  503. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  504. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  505. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  506. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  507. helm/clients/vllm_client.py +4 -6
  508. helm/clients/yi_client.py +0 -3
  509. helm/common/audio_utils.py +111 -0
  510. helm/common/file_caches/local_file_cache.py +1 -1
  511. helm/common/file_caches/test_local_file_cache.py +1 -1
  512. helm/common/images_utils.py +2 -2
  513. helm/common/media_object.py +2 -2
  514. helm/common/multimodal_request_utils.py +26 -0
  515. helm/common/reeval_parameters.py +12 -0
  516. helm/common/request.py +6 -2
  517. helm/common/response_format.py +18 -0
  518. helm/common/test_media_object.py +1 -1
  519. helm/config/model_deployments.yaml +1112 -19
  520. helm/config/model_metadata.yaml +985 -44
  521. helm/config/tokenizer_configs.yaml +379 -3
  522. helm/proxy/cli.py +2 -2
  523. helm/proxy/example_queries.py +1 -1
  524. helm/proxy/server.py +11 -4
  525. helm/proxy/services/remote_service.py +1 -1
  526. helm/proxy/services/server_service.py +1 -1
  527. helm/proxy/services/test_remote_service.py +2 -2
  528. helm/proxy/services/test_service.py +1 -1
  529. helm/proxy/static/general.js +122 -0
  530. helm/proxy/static/help.html +99 -0
  531. helm/proxy/static/index.css +57 -0
  532. helm/proxy/static/index.html +40 -0
  533. helm/proxy/static/index.js +456 -0
  534. helm/proxy/static/info-icon.png +0 -0
  535. helm/proxy/test_retry.py +1 -1
  536. helm/proxy/token_counters/auto_token_counter.py +1 -1
  537. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  538. helm/tokenizers/caching_tokenizer.py +2 -30
  539. helm/tokenizers/http_model_tokenizer.py +1 -1
  540. helm/tokenizers/huggingface_tokenizer.py +2 -2
  541. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  542. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  543. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  544. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  545. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  546. helm/tokenizers/tokenizer.py +3 -1
  547. helm/tokenizers/yalm_tokenizer.py +3 -3
  548. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  549. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  550. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  551. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  552. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  553. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  554. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  555. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  556. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  557. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  558. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  559. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  560. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  561. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  562. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  563. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  564. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  565. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  566. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  567. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  568. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  569. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  570. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  571. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  572. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  573. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  574. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  575. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  576. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  577. helm/tokenizers/anthropic_tokenizer.py +0 -52
  578. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  579. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  580. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
@@ -2,9 +2,9 @@ import shutil
2
2
  import tempfile
3
3
 
4
4
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
5
- from .test_utils import get_tokenizer_service, TEST_PROMPT
6
- from .tokenizer_service import TokenizerService
7
- from .window_service_factory import WindowServiceFactory
5
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
6
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
7
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
8
 
9
9
 
10
10
  class TestOPTWindowService:
@@ -2,9 +2,9 @@ from tempfile import TemporaryDirectory
2
2
  from typing import List
3
3
 
4
4
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
5
- from .tokenizer_service import TokenizerService
6
- from .window_service_factory import WindowServiceFactory
7
- from .test_utils import get_tokenizer_service, TEST_PROMPT
5
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
6
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
7
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
8
8
 
9
9
 
10
10
  class TestPalmyraWindowService:
@@ -3,9 +3,9 @@ import tempfile
3
3
  from typing import List
4
4
 
5
5
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
6
- from .tokenizer_service import TokenizerService
7
- from .window_service_factory import WindowServiceFactory
8
- from .test_utils import get_tokenizer_service, TEST_PROMPT
6
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
7
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
9
9
 
10
10
 
11
11
  class TestT0ppWindowService:
@@ -3,9 +3,9 @@ import tempfile
3
3
  from typing import List
4
4
 
5
5
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
6
- from .tokenizer_service import TokenizerService
7
- from .window_service_factory import WindowServiceFactory
8
- from .test_utils import get_tokenizer_service, TEST_PROMPT
6
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
7
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
9
9
 
10
10
 
11
11
  class TestT511bWindowService:
@@ -3,9 +3,9 @@ import tempfile
3
3
  from typing import List
4
4
 
5
5
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
6
- from .tokenizer_service import TokenizerService
7
- from .window_service_factory import WindowServiceFactory
8
- from .test_utils import get_tokenizer_service, TEST_PROMPT
6
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
7
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
9
9
 
10
10
 
11
11
  class TestUL2WindowService:
@@ -4,7 +4,7 @@ from helm.common.authentication import Authentication
4
4
  from helm.common.cache_backend_config import CacheBackendConfig
5
5
  from helm.proxy.services.server_service import ServerService
6
6
  from helm.benchmark.metrics.metric_service import MetricService
7
- from .tokenizer_service import TokenizerService
7
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
8
8
 
9
9
 
10
10
  TEST_PROMPT: str = (
@@ -2,9 +2,9 @@ import shutil
2
2
  import tempfile
3
3
 
4
4
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
5
- from .test_utils import get_tokenizer_service, TEST_PROMPT
6
- from .tokenizer_service import TokenizerService
7
- from .window_service_factory import WindowServiceFactory
5
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
6
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
7
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
8
 
9
9
 
10
10
  class TestYaLMWindowService:
@@ -1,4 +1,4 @@
1
- from .local_window_service import LocalWindowService
1
+ from helm.benchmark.window_services.local_window_service import LocalWindowService
2
2
 
3
3
 
4
4
  class YaLMWindowService(LocalWindowService):
@@ -11,8 +11,8 @@ from helm.common.request import (
11
11
  GeneratedOutput,
12
12
  Token,
13
13
  )
14
- from .client import CachingClient, truncate_sequence, cleanup_str
15
- from .ai21_utils import AI21RequestError, handle_failed_request
14
+ from helm.clients.client import CachingClient, truncate_sequence, cleanup_str
15
+ from helm.clients.ai21_utils import AI21RequestError, handle_failed_request
16
16
 
17
17
  try:
18
18
  from ai21 import AI21Client as AISDKClient
@@ -186,7 +186,7 @@ class AI21ChatClient(CachingClient):
186
186
  completions: List[GeneratedOutput] = []
187
187
 
188
188
  for choice in response["choices"]:
189
- completions.append(GeneratedOutput(text=choice["message"]["content"], logprob=0.0, tokens=[]))
189
+ completions.append(GeneratedOutput(text=choice["message"]["content"] or "", logprob=0.0, tokens=[]))
190
190
 
191
191
  return RequestResult(
192
192
  success=True,
@@ -4,7 +4,7 @@ from helm.common.cache import CacheConfig
4
4
  from helm.common.media_object import TEXT_TYPE
5
5
  from helm.common.optional_dependencies import handle_module_not_found_error
6
6
  from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
7
- from .client import CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
7
+ from helm.clients.client import CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
8
8
 
9
9
  try:
10
10
  from aleph_alpha_client import Client, CompletionRequest, CompletionResponse, Image, Prompt
File without changes
@@ -0,0 +1,118 @@
1
+ import threading
2
+ from typing import Any, Dict, List, Optional, Tuple, TypedDict
3
+
4
+ import numpy as np
5
+ from transformers import AutoModel, PreTrainedModel
6
+
7
+ from helm.clients.client import CachingClient
8
+ from helm.common.cache import CacheConfig
9
+ from helm.common.media_object import TEXT_TYPE
10
+ from helm.common.request import (
11
+ GeneratedOutput,
12
+ Request,
13
+ RequestResult,
14
+ wrap_request_time,
15
+ )
16
+ from helm.common.audio_utils import get_array_from_audio_file
17
+ from helm.proxy.retry import NonRetriableException
18
+
19
+
20
+ _LOCK = threading.Lock()
21
+ _PRE_TRAINED_MODEL: Optional[PreTrainedModel] = None
22
+
23
+
24
+ def _get_pre_trained_model(model_name: str, **kwargs) -> PreTrainedModel:
25
+ global _LOCK
26
+ global _PRE_TRAINED_MODEL
27
+ with _LOCK:
28
+ if _PRE_TRAINED_MODEL is None:
29
+ _PRE_TRAINED_MODEL = AutoModel.from_pretrained(model_name, **kwargs)
30
+ return _PRE_TRAINED_MODEL
31
+
32
+
33
+ class DivaLlamaRequest(TypedDict):
34
+ """Cache key for DivaLlamaClient"""
35
+
36
+ model: str
37
+ media_objects: List[Dict[str, Any]]
38
+
39
+
40
+ class DivaLlamaClient(CachingClient):
41
+ SAMPLE_RATE = 16000
42
+
43
+ def __init__(
44
+ self,
45
+ cache_config: CacheConfig,
46
+ **kwargs,
47
+ ):
48
+ super().__init__(cache_config)
49
+ self.pre_trained_model = _get_pre_trained_model("WillHeld/DiVA-llama-3-v0-8b", trust_remote_code=True, **kwargs)
50
+
51
+ @staticmethod
52
+ def _get_generate_input(request: Request) -> Tuple[np.ndarray, Optional[str]]:
53
+ if request.prompt:
54
+ raise NonRetriableException("request.prompt must be empty for DivaLlamaClient")
55
+ if request.embedding:
56
+ raise NonRetriableException("request.embedding must be empty for DivaLlamaClient")
57
+ if request.messages:
58
+ raise NonRetriableException("request.messages must be empty for DivaLlamaClient")
59
+ if request.multimodal_prompt is None:
60
+ raise NonRetriableException("request.multimodal_prompt must not be None for DivaLlamaClient")
61
+ text_input: Optional[str] = None
62
+ audio_input: Optional[np.ndarray] = None
63
+ for media_object in request.multimodal_prompt.media_objects:
64
+ if media_object.is_type("audio"):
65
+ if audio_input is not None:
66
+ raise NonRetriableException(
67
+ "Only one audio object allowed in request.multimodal_prompt.media_objects"
68
+ )
69
+ assert media_object.location
70
+ audio_input = get_array_from_audio_file(media_object.location, DivaLlamaClient.SAMPLE_RATE)
71
+ elif media_object.is_type(TEXT_TYPE):
72
+ if text_input is not None:
73
+ raise NonRetriableException(
74
+ "Only one text object allowed in request.multimodal_prompt.media_objects"
75
+ )
76
+ assert media_object.text is not None
77
+ text_input = media_object.text
78
+ else:
79
+ raise NonRetriableException(f"Unsupported media content type type: {media_object.content_type}")
80
+ if audio_input is None:
81
+ raise NonRetriableException(
82
+ "Expected a single audio object allowed in request.multimodal_prompt.media_objects"
83
+ )
84
+ return audio_input, text_input
85
+
86
+ def make_request(self, request: Request) -> RequestResult:
87
+ assert request.multimodal_prompt is not None
88
+ raw_request: DivaLlamaRequest = {
89
+ "model": request.model,
90
+ "media_objects": [media_object.to_dict() for media_object in request.multimodal_prompt.media_objects],
91
+ }
92
+
93
+ try:
94
+
95
+ def do_it() -> Dict[str, Any]:
96
+ with _LOCK:
97
+ audio_input, text_input = DivaLlamaClient._get_generate_input(request)
98
+ if text_input is None:
99
+ return {"completions": self.pre_trained_model.generate([audio_input])}
100
+ else:
101
+ return {"completions": self.pre_trained_model.generate([audio_input], [text_input])}
102
+
103
+ cache_key = CachingClient.make_cache_key(raw_request, request)
104
+ response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
105
+ except Exception as e: # Do something if error is encountered.
106
+ error: str = f"HuggingFace error: {e}"
107
+ return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
108
+
109
+ generated_output = GeneratedOutput(text=response["completions"][0], logprob=0, tokens=[])
110
+
111
+ return RequestResult(
112
+ success=True,
113
+ cached=cached,
114
+ request_time=response["request_time"],
115
+ request_datetime=response.get("request_datetime"),
116
+ completions=[generated_output],
117
+ embedding=[],
118
+ )
@@ -0,0 +1,198 @@
1
+ from threading import Lock
2
+ import torch
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ from dataclasses import dataclass
6
+ from transformers import AutoTokenizer
7
+ import whisper
8
+ from helm.clients.audio_language.llama_omni.model.builder import load_pretrained_model as load_llama_omni
9
+ from helm.clients.audio_language.llama_omni.model.language_model.omni_speech2s_llama import OmniSpeech2SLlamaForCausalLM
10
+ from helm.clients.audio_language.llama_omni.conversation import conv_templates, Conversation
11
+ from helm.clients.audio_language.llama_omni.preprocess import tokenizer_speech_token
12
+
13
+ from helm.common.cache import CacheConfig
14
+ from helm.common.gpu_utils import get_torch_device_name
15
+ from helm.common.hierarchical_logger import hlog, htrack_block
16
+ from helm.common.media_object import TEXT_TYPE
17
+ from helm.common.request import Request, RequestResult, GeneratedOutput, Token
18
+ from helm.common.request import wrap_request_time
19
+ from helm.clients.client import CachingClient, generate_uid_for_multimodal_prompt
20
+
21
+
22
+ @dataclass(frozen=True)
23
+ class LoadedLlamaOmniModelProcessor:
24
+ """Loaded model and processor for Qwen."""
25
+
26
+ model: OmniSpeech2SLlamaForCausalLM
27
+ tokenizer: AutoTokenizer
28
+
29
+
30
+ _models_lock: Lock = Lock()
31
+ _models: Dict[str, Optional[LoadedLlamaOmniModelProcessor]] = {
32
+ "ICTNLP/Llama-3.1-8B-Omni": None,
33
+ }
34
+
35
+
36
+ class LlamaOmniAudioLMClient(CachingClient):
37
+ """
38
+ From https://github.com/ictnlp/LLaMA-Omni,
39
+ LLaMA-Omni is the audio multimodal version based on the LLaMA-3.1-8B large language model,
40
+ developed by ICTNLP group. LLaMA-Omni accepts audio, text as inputs, and outputs text.
41
+
42
+ Paper: https://arxiv.org/abs/2409.06666
43
+ """
44
+
45
+ END_OF_TEXT_TOKEN: str = "<|im_end|>"
46
+ CONV_MODE: str = "llama_3"
47
+ PAD_ID: int = 128004
48
+ MEL_NUM: int = 128
49
+
50
+ def __init__(self, cache_config: CacheConfig):
51
+ super().__init__(cache_config=cache_config)
52
+ self._device: str = get_torch_device_name()
53
+
54
+ def _get_model(self, helm_model_name: str) -> LoadedLlamaOmniModelProcessor:
55
+ global _models_lock
56
+ global _models
57
+
58
+ model_name: str
59
+ if helm_model_name == "llama-3.1-8b-omni":
60
+ model_name = "ICTNLP/Llama-3.1-8B-Omni"
61
+ else:
62
+ raise ValueError(f"Unhandled model name: {helm_model_name}")
63
+
64
+ # Ensure that only one thread is loading the model at a time
65
+ with _models_lock:
66
+ loaded_model_processor = _models[model_name]
67
+ if loaded_model_processor is None:
68
+ hlog(f"Loading model {model_name} and caching in memory...")
69
+ # Follow the official LLaMA-Omni model loading pattern:
70
+ # https://github.com/ictnlp/LLaMA-Omni/blob/main/omni_speech/infer/run.sh
71
+ tokenizer, model, _ = load_llama_omni(model_name, None, s2s=True)
72
+ _models[model_name] = LoadedLlamaOmniModelProcessor(model, tokenizer)
73
+ loaded_model_processor = _models[model_name]
74
+
75
+ assert loaded_model_processor is not None
76
+ return loaded_model_processor
77
+
78
+ def _load_local_audio(self, media_object) -> torch.Tensor:
79
+ assert media_object.is_local_file, "LLaMA-Omni only supports local audio file input"
80
+ audio_media = whisper.load_audio(media_object.location)
81
+ audio_media = whisper.pad_or_trim(audio_media)
82
+ audio_media = whisper.log_mel_spectrogram(audio_media, n_mels=self.MEL_NUM).permute(1, 0)
83
+ return audio_media
84
+
85
+ def make_request(self, request: Request) -> RequestResult:
86
+ assert request.multimodal_prompt is not None, "Multimodal prompt is required"
87
+
88
+ loaded_model_processor: LoadedLlamaOmniModelProcessor = self._get_model(request.model_engine)
89
+ model = loaded_model_processor.model
90
+ tokenizer = loaded_model_processor.tokenizer
91
+
92
+ # The generation configs are taken from the official LLaMA-Omni repository
93
+ # https://github.com/ictnlp/LLaMA-Omni/blob/main/omni_speech/infer/infer.py#L116
94
+ generation_args = {
95
+ "max_new_tokens": 25,
96
+ "do_sample": False,
97
+ "use_cache": False,
98
+ "pad_token_id": self.PAD_ID,
99
+ "streaming_unit_gen": False,
100
+ "top_p": None,
101
+ }
102
+
103
+ input_text_query: Dict[str, str]
104
+ input_audio_query: Dict[str, Any]
105
+ prompt_text: str = ""
106
+
107
+ for media_object in request.multimodal_prompt.media_objects:
108
+ if media_object.is_type("audio") and media_object.location:
109
+ input_audio_query = {"audio": self._load_local_audio(media_object)}
110
+ elif media_object.is_type(TEXT_TYPE):
111
+ if media_object.text is None:
112
+ raise ValueError("MediaObject of text type has missing text field value")
113
+ input_text_query = {"text": "<speech>\n" + media_object.text}
114
+ prompt_text += media_object.text
115
+ else:
116
+ raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
117
+
118
+ completions: List[GeneratedOutput] = []
119
+ request_time: float = 0
120
+ request_datetime: Optional[int] = None
121
+ all_cached: bool = True
122
+
123
+ with htrack_block(f"Generating for prompt: {prompt_text}"):
124
+ for completion_index in range(request.num_completions):
125
+ try:
126
+
127
+ def do_it() -> Dict[str, Any]:
128
+ conv: Conversation = conv_templates[self.CONV_MODE].copy()
129
+ conv.append_message(conv.roles[0], input_text_query["text"])
130
+ conv.append_message(conv.roles[1], None)
131
+ query: str = conv.get_prompt()
132
+ # LLama-Omni requires a batch input
133
+ text_inputs = (
134
+ tokenizer_speech_token(query, tokenizer, return_tensors="pt").unsqueeze(0).to(self._device)
135
+ )
136
+ audio_inputs = (
137
+ input_audio_query["audio"].to(dtype=torch.float16, device=self._device).unsqueeze(0)
138
+ )
139
+ speech_length = torch.LongTensor([audio_inputs.shape[1]])
140
+ pred, _ = model.generate(
141
+ text_inputs,
142
+ audio_inputs,
143
+ speech_length,
144
+ None,
145
+ None,
146
+ None,
147
+ None,
148
+ None,
149
+ None,
150
+ None,
151
+ None,
152
+ False,
153
+ None,
154
+ None,
155
+ **generation_args,
156
+ )
157
+ completion = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)
158
+ tokens: List[str] = tokenizer.tokenize(completion)
159
+ return {"output": (completion, tokens)}
160
+
161
+ # Include the prompt and model name in the cache key
162
+ cache_key = CachingClient.make_cache_key(
163
+ raw_request={
164
+ "completion_index": completion_index,
165
+ "model": request.model,
166
+ "prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
167
+ **generation_args,
168
+ },
169
+ request=request,
170
+ )
171
+ result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
172
+ except RuntimeError as model_error:
173
+ return RequestResult(
174
+ success=False, cached=False, error=str(model_error), completions=[], embedding=[]
175
+ )
176
+
177
+ text, tokens = result["output"]
178
+
179
+ # Tokenize truncated text to get the list of tokens
180
+ completions.append(
181
+ GeneratedOutput(
182
+ text=text, logprob=0, tokens=[Token(text=str(token), logprob=0) for token in tokens]
183
+ )
184
+ )
185
+
186
+ request_time += result["request_time"]
187
+ # Use the datetime from the first completion because that's when the request was fired
188
+ request_datetime = request_datetime or result.get("request_datetime")
189
+ all_cached = all_cached and cached
190
+
191
+ return RequestResult(
192
+ success=True,
193
+ cached=all_cached,
194
+ request_time=request_time,
195
+ request_datetime=request_datetime,
196
+ completions=completions,
197
+ embedding=[],
198
+ )
@@ -0,0 +1,188 @@
1
+ from threading import Lock
2
+ import librosa
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ from dataclasses import dataclass
6
+ from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
7
+
8
+ from helm.common.cache import CacheConfig
9
+ from helm.common.gpu_utils import get_torch_device_name
10
+ from helm.common.hierarchical_logger import hlog, htrack_block
11
+ from helm.common.media_object import TEXT_TYPE
12
+ from helm.common.request import Request, RequestResult, GeneratedOutput, Token
13
+ from helm.common.request import wrap_request_time
14
+ from helm.clients.client import CachingClient, generate_uid_for_multimodal_prompt
15
+
16
+
17
+ @dataclass(frozen=True)
18
+ class LoadedQwenModelProcessor:
19
+ """Loaded model and processor for Qwen."""
20
+
21
+ model: Qwen2AudioForConditionalGeneration
22
+ tokenizer: AutoProcessor
23
+
24
+
25
+ _models_lock: Lock = Lock()
26
+ _models: Dict[str, Optional[LoadedQwenModelProcessor]] = {
27
+ "Qwen/Qwen2-Audio-7B-Instruct": None,
28
+ }
29
+
30
+
31
+ class Qwen2AudioLMClient(CachingClient):
32
+ """
33
+ From https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct,
34
+ Qwen2-Audio-Instruct (Qwen2 Large Audio Language Model) is the audio multimodal version of the large model series,
35
+ Qwen2 (abbr. Tongyi Qianwen), proposed by Alibaba Cloud. Qwen2-Audio-Instruct accepts audio, text as inputs,
36
+ outputs text.
37
+ Alibaba released Qwen-Audio and Qwen-Audio-Instruct, which is an instruction-following model based on Qwen-Audio.
38
+ For now, we have integrated Qwen2-Audio-Instruct for instruction-following tasks.
39
+
40
+ Paper: https://arxiv.org/abs/2407.10759
41
+ """
42
+
43
+ END_OF_TEXT_TOKEN: str = "<|im_end|>"
44
+
45
+ def __init__(self, cache_config: CacheConfig):
46
+ super().__init__(cache_config=cache_config)
47
+ self._device: str = get_torch_device_name()
48
+
49
+ def _get_model(self, helm_model_name: str) -> LoadedQwenModelProcessor:
50
+ global _models_lock
51
+ global _models
52
+
53
+ model_name: str
54
+ if helm_model_name == "qwen2-audio-7b-instruct":
55
+ model_name = "Qwen/Qwen2-Audio-7B-Instruct"
56
+ else:
57
+ raise ValueError(f"Unhandled model name: {helm_model_name}")
58
+
59
+ # Ensure that only one thread is loading the model at a time
60
+ with _models_lock:
61
+ loaded_model_processor = _models[model_name]
62
+ if loaded_model_processor is None:
63
+ hlog(f"Loading model {model_name} and caching in memory...")
64
+ model = Qwen2AudioForConditionalGeneration.from_pretrained(
65
+ model_name,
66
+ device_map=self._device,
67
+ ).eval()
68
+ tokenizer = AutoProcessor.from_pretrained(
69
+ model_name,
70
+ )
71
+ _models[model_name] = LoadedQwenModelProcessor(model, tokenizer)
72
+ loaded_model_processor = _models[model_name]
73
+
74
+ assert loaded_model_processor is not None
75
+ return loaded_model_processor
76
+
77
+ def make_request(self, request: Request) -> RequestResult:
78
+ assert request.multimodal_prompt is not None, "Multimodal prompt is required"
79
+
80
+ loaded_model_processor: LoadedQwenModelProcessor = self._get_model(request.model_engine)
81
+ model = loaded_model_processor.model
82
+ tokenizer = loaded_model_processor.tokenizer
83
+
84
+ input_query: List[Dict[str, Any]] = []
85
+ query: List[Dict[str, str]] = []
86
+ prompt_text: str = ""
87
+
88
+ input_query.append({"role": "system", "content": "You are a helpful assistant."})
89
+ prompt_text += "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
90
+ for media_num, media_object in enumerate(request.multimodal_prompt.media_objects):
91
+ if media_object.is_type("audio") and media_object.location:
92
+ assert media_object.is_local_file, "Only local audio files are supported"
93
+ query.append({"type": "audio", "audio_url": media_object.location})
94
+
95
+ prompt_text += f"<|im_start|>user\nAudio {media_num+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
96
+ elif media_object.is_type(TEXT_TYPE):
97
+ if media_object.text is None:
98
+ raise ValueError("MediaObject of text type has missing text field value")
99
+ query.append({"type": "text", "text": media_object.text})
100
+ prompt_text += media_object.text
101
+ else:
102
+ raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
103
+ prompt_text += "<|im_end|>\n<|im_start|>assistant\n"
104
+
105
+ input_query.append({"role": "user", "content": query})
106
+ completions: List[GeneratedOutput] = []
107
+ request_time: float = 0
108
+ request_datetime: Optional[int] = None
109
+ all_cached: bool = True
110
+
111
+ with htrack_block(f"Generating for prompt: {prompt_text}"):
112
+ for completion_index in range(request.num_completions):
113
+ try:
114
+
115
+ def do_it() -> Dict[str, Any]:
116
+ inputs = tokenizer.apply_chat_template(input_query, add_generation_prompt=True, tokenize=False)
117
+ audios: List[Any] = []
118
+ # Refer to the official Qwen2-Audio documentation for the format of the input query
119
+ # https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct
120
+ for message in input_query:
121
+ if isinstance(message["content"], list):
122
+ for element in message["content"]:
123
+ if element["type"] == "audio":
124
+ audios.append(
125
+ librosa.load(
126
+ element["audio_url"],
127
+ sr=tokenizer.feature_extractor.sampling_rate,
128
+ )[0]
129
+ )
130
+ inputs = tokenizer(
131
+ text=inputs,
132
+ audios=audios,
133
+ sampling_rate=tokenizer.feature_extractor.sampling_rate,
134
+ return_tensors="pt",
135
+ padding=True,
136
+ )
137
+ input_length = inputs.input_ids.size(1)
138
+ # Qwen2-Audio-Instruct counts input into the max_length,
139
+ # so we need to add the length of the prompt
140
+ inputs = inputs.to(self._device)
141
+ pred = model.generate(**inputs, max_length=request.max_tokens + input_length)[:, input_length:]
142
+
143
+ completion = tokenizer.decode(
144
+ pred.cpu()[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
145
+ )
146
+ # The processor of Qwen2-Audio-Instruct consists of an AutoTokenizer and a WhisperFeatureExtractor
147
+ tokens: List[str] = tokenizer.tokenizer.tokenize(completion)
148
+ return {"output": (completion, tokens)}
149
+
150
+ # Include the prompt and model name in the cache key
151
+ cache_key = CachingClient.make_cache_key(
152
+ raw_request={
153
+ "completion_index": completion_index,
154
+ "model": request.model,
155
+ "prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
156
+ "max_tokens": request.max_tokens,
157
+ },
158
+ request=request,
159
+ )
160
+ result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
161
+ except RuntimeError as model_error:
162
+ return RequestResult(
163
+ success=False, cached=False, error=str(model_error), completions=[], embedding=[]
164
+ )
165
+
166
+ text, tokens = result["output"]
167
+ hlog(f"Generated: {text}")
168
+
169
+ # Tokenize truncated text to get the list of tokens
170
+ completions.append(
171
+ GeneratedOutput(
172
+ text=text, logprob=0, tokens=[Token(text=str(token), logprob=0) for token in tokens]
173
+ )
174
+ )
175
+
176
+ request_time += result["request_time"]
177
+ # Use the datetime from the first completion because that's when the request was fired
178
+ request_datetime = request_datetime or result.get("request_datetime")
179
+ all_cached = all_cached and cached
180
+
181
+ return RequestResult(
182
+ success=True,
183
+ cached=all_cached,
184
+ request_time=request_time,
185
+ request_datetime=request_datetime,
186
+ completions=completions,
187
+ embedding=[],
188
+ )