crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  11. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  12. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  13. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  14. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  15. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  16. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  17. helm/benchmark/adaptation/request_state.py +6 -1
  18. helm/benchmark/adaptation/scenario_state.py +6 -2
  19. helm/benchmark/annotation/annotator.py +43 -0
  20. helm/benchmark/annotation/annotator_factory.py +61 -0
  21. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  22. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  23. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  24. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  25. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  26. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  27. helm/benchmark/annotation_executor.py +124 -0
  28. helm/benchmark/augmentations/data_augmenter.py +0 -2
  29. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  30. helm/benchmark/augmentations/perturbation.py +8 -2
  31. helm/benchmark/augmentations/perturbation_description.py +1 -1
  32. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  33. helm/benchmark/augmentations/test_perturbation.py +11 -7
  34. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  35. helm/benchmark/config_registry.py +7 -1
  36. helm/benchmark/executor.py +46 -16
  37. helm/benchmark/huggingface_registration.py +20 -7
  38. helm/benchmark/metrics/basic_metrics.py +169 -664
  39. helm/benchmark/metrics/bbq_metrics.py +3 -4
  40. helm/benchmark/metrics/bias_metrics.py +6 -6
  41. helm/benchmark/metrics/classification_metrics.py +11 -8
  42. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  43. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  44. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  45. helm/benchmark/metrics/common_metric_specs.py +167 -0
  46. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  47. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  48. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  49. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  50. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  51. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  52. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  53. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  54. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  55. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  56. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  57. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  58. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  59. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  60. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  61. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  62. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  63. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  64. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  65. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  66. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  68. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  69. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  70. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  71. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  72. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  73. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  74. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  75. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  76. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  77. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  78. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  79. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  80. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  81. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  82. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  83. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  84. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  85. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  86. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  87. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  88. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  89. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  90. helm/benchmark/metrics/metric.py +93 -172
  91. helm/benchmark/metrics/metric_name.py +0 -1
  92. helm/benchmark/metrics/metric_service.py +16 -0
  93. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  94. helm/benchmark/metrics/ranking_metrics.py +2 -2
  95. helm/benchmark/metrics/reference_metric.py +148 -0
  96. helm/benchmark/metrics/summac/model_summac.py +0 -2
  97. helm/benchmark/metrics/summarization_metrics.py +2 -2
  98. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  99. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  100. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  101. helm/benchmark/metrics/test_metric.py +2 -2
  102. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  103. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  104. helm/benchmark/metrics/toxicity_utils.py +23 -0
  105. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  106. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  107. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  108. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  109. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  110. helm/benchmark/model_deployment_registry.py +74 -0
  111. helm/benchmark/model_metadata_registry.py +36 -0
  112. helm/benchmark/multi_gpu_runner.py +133 -0
  113. helm/benchmark/presentation/create_plots.py +8 -7
  114. helm/benchmark/presentation/run_display.py +26 -10
  115. helm/benchmark/presentation/schema.py +15 -40
  116. helm/benchmark/presentation/summarize.py +119 -79
  117. helm/benchmark/presentation/table.py +8 -8
  118. helm/benchmark/presentation/test_contamination.py +2 -2
  119. helm/benchmark/presentation/test_run_entry.py +1 -2
  120. helm/benchmark/presentation/test_summarize.py +3 -3
  121. helm/benchmark/run.py +54 -26
  122. helm/benchmark/run_expander.py +214 -16
  123. helm/benchmark/run_spec.py +93 -0
  124. helm/benchmark/run_spec_factory.py +162 -0
  125. helm/benchmark/run_specs/__init__.py +0 -0
  126. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  127. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  128. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  129. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  130. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  131. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  132. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  133. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  134. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  135. helm/benchmark/runner.py +51 -57
  136. helm/benchmark/runner_config_registry.py +21 -0
  137. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  138. helm/benchmark/scenarios/bold_scenario.py +2 -2
  139. helm/benchmark/scenarios/code_scenario.py +1 -0
  140. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  141. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  142. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  143. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  144. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  145. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  146. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  147. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  148. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  149. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  150. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  151. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  152. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  153. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  154. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  155. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  156. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  157. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  158. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  159. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  160. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  162. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  163. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  164. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  165. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  166. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  167. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  168. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  169. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  170. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  171. helm/benchmark/scenarios/math_scenario.py +19 -2
  172. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  173. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  174. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  175. helm/benchmark/scenarios/scenario.py +4 -0
  176. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  177. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  178. helm/benchmark/scenarios/test_scenario.py +6 -3
  179. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  180. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  181. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  182. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  183. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  184. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  185. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  186. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  187. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  188. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  189. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  190. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  191. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  192. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  193. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  194. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  198. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  199. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  200. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  201. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  202. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  203. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  204. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  205. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
  206. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  207. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  208. helm/benchmark/server.py +24 -1
  209. helm/benchmark/slurm_runner.py +70 -49
  210. helm/benchmark/static/benchmarking.js +1 -1
  211. helm/benchmark/static/schema_classic.yaml +258 -1066
  212. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  213. helm/benchmark/static/schema_lite.yaml +2 -227
  214. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  215. helm/benchmark/static/schema_unitxt.yaml +428 -0
  216. helm/benchmark/static/schema_vlm.yaml +576 -0
  217. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  218. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  219. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  220. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  221. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  222. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  223. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  224. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  225. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  226. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  227. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  228. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  229. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  230. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  231. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  232. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  233. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  234. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  235. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  236. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  237. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  238. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  239. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  240. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  241. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  242. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  243. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  244. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  245. helm/benchmark/static_build/config.js +4 -0
  246. helm/benchmark/static_build/index.html +20 -0
  247. helm/benchmark/test_data_preprocessor.py +3 -3
  248. helm/benchmark/test_model_deployment_definition.py +14 -16
  249. helm/benchmark/test_run_expander.py +1 -1
  250. helm/benchmark/window_services/ai21_window_service.py +22 -33
  251. helm/benchmark/window_services/cohere_window_service.py +1 -63
  252. helm/benchmark/window_services/default_window_service.py +2 -44
  253. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  254. helm/benchmark/window_services/ice_window_service.py +0 -34
  255. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  256. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  257. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  258. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  259. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  260. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  261. helm/benchmark/window_services/local_window_service.py +21 -4
  262. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  263. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  264. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  265. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  266. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  267. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  268. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  269. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  270. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  271. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  272. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  273. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  274. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  275. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  276. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  277. helm/benchmark/window_services/test_utils.py +3 -2
  278. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  279. helm/benchmark/window_services/window_service.py +42 -0
  280. helm/benchmark/window_services/window_service_factory.py +4 -1
  281. helm/benchmark/window_services/yalm_window_service.py +0 -27
  282. helm/clients/__init__.py +0 -0
  283. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  284. helm/clients/aleph_alpha_client.py +112 -0
  285. helm/{proxy/clients → clients}/anthropic_client.py +203 -18
  286. helm/{proxy/clients → clients}/auto_client.py +59 -31
  287. helm/clients/bedrock_client.py +128 -0
  288. helm/clients/bedrock_utils.py +72 -0
  289. helm/{proxy/clients → clients}/client.py +65 -7
  290. helm/clients/clip_score_client.py +49 -0
  291. helm/clients/clip_scorers/__init__.py +0 -0
  292. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  293. helm/clients/clip_scorers/clip_scorer.py +50 -0
  294. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  295. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  296. helm/clients/gcs_client.py +82 -0
  297. helm/{proxy/clients → clients}/google_client.py +5 -5
  298. helm/clients/google_translate_client.py +35 -0
  299. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  300. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  301. helm/clients/image_generation/__init__.py +0 -0
  302. helm/clients/image_generation/adobe_vision_client.py +78 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  304. helm/clients/image_generation/cogview2/__init__.py +0 -0
  305. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  306. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  307. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  308. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  309. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  310. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  311. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  312. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  313. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  314. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  315. helm/clients/image_generation/cogview2_client.py +191 -0
  316. helm/clients/image_generation/dalle2_client.py +192 -0
  317. helm/clients/image_generation/dalle3_client.py +108 -0
  318. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  319. helm/clients/image_generation/dalle_mini/data.py +442 -0
  320. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  321. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  322. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  323. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  324. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  325. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  326. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  327. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  328. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  329. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  330. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  331. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  332. helm/clients/image_generation/dalle_mini_client.py +190 -0
  333. helm/clients/image_generation/deep_floyd_client.py +78 -0
  334. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  335. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  336. helm/clients/image_generation/lexica_client.py +86 -0
  337. helm/clients/image_generation/mindalle/__init__.py +0 -0
  338. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  339. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  340. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  341. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  342. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  343. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  344. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  345. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  346. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  347. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  348. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  349. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  350. helm/clients/image_generation/mindalle_client.py +115 -0
  351. helm/clients/image_generation/nudity_check_client.py +64 -0
  352. helm/clients/image_generation/together_image_generation_client.py +111 -0
  353. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  354. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  355. helm/clients/mistral_client.py +134 -0
  356. helm/clients/moderation_api_client.py +109 -0
  357. helm/clients/open_lm_client.py +43 -0
  358. helm/clients/openai_client.py +302 -0
  359. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  360. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  361. helm/clients/simple_client.py +64 -0
  362. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  363. helm/clients/test_client.py +100 -0
  364. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  365. helm/clients/test_simple_client.py +19 -0
  366. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  367. helm/{proxy/clients → clients}/together_client.py +12 -72
  368. helm/clients/vertexai_client.py +391 -0
  369. helm/clients/vision_language/__init__.py +0 -0
  370. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  371. helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
  372. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  373. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  374. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  375. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  376. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  377. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  378. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  379. helm/clients/vision_language/open_flamingo_client.py +155 -0
  380. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  381. helm/clients/vllm_client.py +46 -0
  382. helm/common/cache.py +16 -4
  383. helm/common/cache_backend_config.py +47 -0
  384. helm/common/clip_score_request.py +41 -0
  385. helm/common/file_caches/__init__.py +0 -0
  386. helm/common/file_caches/file_cache.py +16 -0
  387. helm/common/file_caches/local_file_cache.py +61 -0
  388. helm/common/file_caches/test_local_file_cache.py +25 -0
  389. helm/common/file_upload_request.py +27 -0
  390. helm/common/general.py +1 -1
  391. helm/common/image_generation_parameters.py +25 -0
  392. helm/common/images_utils.py +24 -1
  393. helm/common/key_value_store.py +35 -4
  394. helm/common/media_object.py +13 -0
  395. helm/common/moderations_api_request.py +71 -0
  396. helm/common/mongo_key_value_store.py +3 -3
  397. helm/common/multimodal_request_utils.py +31 -0
  398. helm/common/nudity_check_request.py +29 -0
  399. helm/common/request.py +15 -17
  400. helm/common/test_general.py +6 -0
  401. helm/common/tokenization_request.py +1 -1
  402. helm/config/model_deployments.yaml +1069 -546
  403. helm/config/model_metadata.yaml +753 -31
  404. helm/config/tokenizer_configs.yaml +142 -43
  405. helm/proxy/accounts.py +31 -4
  406. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  407. helm/proxy/critique/model_critique_client.py +8 -6
  408. helm/proxy/example_queries.py +29 -17
  409. helm/proxy/server.py +70 -5
  410. helm/proxy/services/remote_service.py +31 -0
  411. helm/proxy/services/server_service.py +96 -16
  412. helm/proxy/services/service.py +30 -0
  413. helm/proxy/services/test_remote_service.py +4 -3
  414. helm/proxy/services/test_service.py +0 -12
  415. helm/proxy/test_accounts.py +32 -0
  416. helm/proxy/token_counters/auto_token_counter.py +37 -37
  417. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  418. helm/proxy/token_counters/token_counter.py +3 -5
  419. helm/tokenizers/__init__.py +0 -0
  420. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  421. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  422. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  423. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  424. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  425. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  426. helm/tokenizers/simple_tokenizer.py +33 -0
  427. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  428. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  429. helm/tokenizers/test_simple_tokenizer.py +33 -0
  430. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  431. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  432. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  433. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  434. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  435. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  436. helm/benchmark/run_specs.py +0 -2762
  437. helm/benchmark/test_model_properties.py +0 -1570
  438. helm/benchmark/vlm_run_specs.py +0 -97
  439. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  440. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  441. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  442. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  443. helm/benchmark/window_services/t511b_window_service.py +0 -30
  444. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  445. helm/benchmark/window_services/ul2_window_service.py +0 -30
  446. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  447. helm/common/cache_utils.py +0 -14
  448. helm/proxy/clients/aleph_alpha_client.py +0 -95
  449. helm/proxy/clients/goose_ai_client.py +0 -99
  450. helm/proxy/clients/microsoft_client.py +0 -180
  451. helm/proxy/clients/openai_client.py +0 -206
  452. helm/proxy/clients/simple_client.py +0 -60
  453. helm/proxy/clients/test_client.py +0 -49
  454. helm/proxy/clients/vertexai_client.py +0 -115
  455. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  456. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  457. helm/proxy/token_counters/free_token_counter.py +0 -12
  458. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  459. helm/proxy/token_counters/openai_token_counter.py +0 -22
  460. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  461. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  462. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  463. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  464. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  465. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  466. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  467. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  468. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  469. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  470. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  471. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  472. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  473. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  474. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  475. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  476. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  477. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  478. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  479. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  480. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  481. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  482. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,267 @@
1
+ """
2
+ Based on: https://github.com/lucidrains/flamingo-pytorch
3
+ """
4
+
5
+ import torch
6
+ from einops import rearrange, repeat
7
+ from einops_exts import rearrange_many
8
+ from torch import einsum, nn
9
+
10
+
11
def exists(val):
    """Return True when *val* is set (i.e. not None)."""
    return not (val is None)
13
+
14
+
15
def FeedForward(dim, mult=4):
    """Pre-norm two-layer MLP: LayerNorm -> Linear(dim, dim*mult) -> GELU -> Linear back to dim.

    Factory function (hence the class-like name) returning an nn.Sequential;
    both projections are bias-free, matching the Flamingo reference code.
    """
    hidden = int(dim * mult)
    layers = [
        nn.LayerNorm(dim),
        nn.Linear(dim, hidden, bias=False),
        nn.GELU(),
        nn.Linear(hidden, dim, bias=False),
    ]
    return nn.Sequential(*layers)
23
+
24
+
25
class PerceiverAttention(nn.Module):
    """Perceiver-style cross-attention: a set of latent vectors queries over
    image features (plus the latents themselves) within each time step."""

    def __init__(self, *, dim, dim_head=64, heads=8):
        super().__init__()
        self.scale = dim_head**-0.5  # 1/sqrt(d_head) attention scaling
        self.heads = heads
        inner_dim = dim_head * heads

        # Separate pre-norms for the two input streams.
        self.norm_media = nn.LayerNorm(dim)
        self.norm_latents = nn.LayerNorm(dim)

        # Queries come from the latents; keys/values from media + latents (see forward).
        self.to_q = nn.Linear(dim, inner_dim, bias=False)
        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
        self.to_out = nn.Linear(inner_dim, dim, bias=False)

    def forward(self, x, latents):
        """
        Args:
            x (torch.Tensor): image features
                shape (b, T, n1, D)
            latent (torch.Tensor): latent features
                shape (b, T, n2, D)
        """
        x = self.norm_media(x)
        latents = self.norm_latents(latents)

        h = self.heads

        q = self.to_q(latents)
        # Keys/values span the concatenation of media tokens and latents along the
        # token axis, so latents can also exchange information among themselves.
        kv_input = torch.cat((x, latents), dim=-2)
        k, v = self.to_kv(kv_input).chunk(2, dim=-1)
        # Split heads: (b, t, n, h*d) -> (b, h, t, n, d) for all three tensors at once.
        q, k, v = rearrange_many((q, k, v), "b t n (h d) -> b h t n d", h=h)
        q = q * self.scale

        # attention
        sim = einsum("... i d, ... j d -> ... i j", q, k)
        # Subtract the (detached) row max before softmax for numerical stability.
        sim = sim - sim.amax(dim=-1, keepdim=True).detach()
        attn = sim.softmax(dim=-1)

        out = einsum("... i j, ... j d -> ... i d", attn, v)
        # Merge heads back: (b, h, t, n, d) -> (b, t, n, h*d).
        out = rearrange(out, "b h t n d -> b t n (h d)", h=h)
        return self.to_out(out)
66
+
67
+
68
class PerceiverResampler(nn.Module):
    """Resamples a variable number of image/frame tokens down to a fixed set of
    `num_latents` learned latent tokens via a stack of Perceiver attention +
    feed-forward blocks."""

    def __init__(
        self,
        *,
        dim,
        depth=6,
        dim_head=64,
        heads=8,
        num_latents=64,
        max_num_media=None,
        max_num_frames=None,
        ff_mult=4,
    ):
        super().__init__()
        # Learned latent queries, shared across batch and time.
        self.latents = nn.Parameter(torch.randn(num_latents, dim))
        # Optional learned positional embeddings: per-frame and per-media-timestep.
        # Only created when the corresponding maximum is provided.
        self.frame_embs = nn.Parameter(torch.randn(max_num_frames, dim)) if exists(max_num_frames) else None
        self.media_time_embs = nn.Parameter(torch.randn(max_num_media, 1, dim)) if exists(max_num_media) else None

        # `depth` pairs of (cross-attention, feed-forward), each used with a
        # residual connection in forward().
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(
                nn.ModuleList(
                    [
                        PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
                        FeedForward(dim=dim, mult=ff_mult),
                    ]
                )
            )

        self.norm = nn.LayerNorm(dim)

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): image features
                shape (b, T, F, v, D)
        Returns:
            shape (b, T, n, D) where n is self.num_latents
        """
        b, T, F, v = x.shape[:4]

        # frame and media time embeddings
        if exists(self.frame_embs):
            # Broadcast the first F frame embeddings over batch, time and spatial tokens.
            frame_embs = repeat(self.frame_embs[:F], "F d -> b T F v d", b=b, T=T, v=v)
            x = x + frame_embs
        x = rearrange(x, "b T F v d -> b T (F v) d")  # flatten the frame and spatial dimensions
        if exists(self.media_time_embs):
            x = x + self.media_time_embs[:T]

        # blocks
        # Expand the shared latents to one copy per (batch, time) position.
        latents = repeat(self.latents, "n d -> b T n d", b=b, T=T)
        for attn, ff in self.layers:
            latents = attn(x, latents) + latents
            latents = ff(latents) + latents
        return self.norm(latents)
123
+
124
+
125
+ # gated cross attention
126
class MaskedCrossAttention(nn.Module):
    """Cross-attention from text tokens to media (image) latents, with masking so
    each text token only attends to media that precede it in the sequence."""

    def __init__(
        self,
        *,
        dim,
        dim_visual,
        dim_head=64,
        heads=8,
        only_attend_immediate_media=True,
    ):
        super().__init__()
        self.scale = dim_head**-0.5  # 1/sqrt(d_head) attention scaling
        self.heads = heads
        inner_dim = dim_head * heads

        self.norm = nn.LayerNorm(dim)

        # Queries from text (dim); keys/values from visual latents (dim_visual).
        self.to_q = nn.Linear(dim, inner_dim, bias=False)
        self.to_kv = nn.Linear(dim_visual, inner_dim * 2, bias=False)
        self.to_out = nn.Linear(inner_dim, dim, bias=False)

        # whether for text to only attend to immediate preceding image, or all previous images
        self.only_attend_immediate_media = only_attend_immediate_media

    def forward(self, x, media, media_locations=None, use_cached_media=False):
        """
        Args:
            x (torch.Tensor): text features
                shape (B, T_txt, D_txt)
            media (torch.Tensor): image features
                shape (B, T_img, n, D_img) where n is the dim of the latents
            media_locations: boolean mask identifying the media tokens in x
                shape (B, T_txt)
            use_cached_media: bool
                If true, treat all of x as if they occur after the last media
                registered in media_locations. T_txt does not need to exactly
                equal media_locations.shape[1] in this case
        """

        # NOTE(review): this assert dereferences media_locations, so calling with
        # media_locations=None and use_cached_media=False raises AttributeError
        # rather than skipping masking — confirm callers always pass the mask here.
        if not use_cached_media:
            assert (
                media_locations.shape[1] == x.shape[1]
            ), f"media_location.shape is {media_locations.shape} but x.shape is {x.shape}"

        T_txt = x.shape[1]
        _, T_img, n = media.shape[:3]
        h = self.heads

        x = self.norm(x)

        q = self.to_q(x)
        # Flatten (media timestep, latent) into a single key/value token axis.
        media = rearrange(media, "b t n d -> b (t n) d")

        k, v = self.to_kv(media).chunk(2, dim=-1)
        q, k, v = rearrange_many((q, k, v), "b n (h d) -> b h n d", h=h)

        q = q * self.scale

        sim = einsum("... i d, ... j d -> ... i j", q, k)

        if exists(media_locations):
            # 1-indexed time of each media chunk (0 is reserved for "no media yet").
            media_time = torch.arange(T_img, device=x.device) + 1

            if use_cached_media:
                # text time is set to the last cached media location
                text_time = repeat(
                    torch.count_nonzero(media_locations, dim=1),
                    "b -> b i",
                    i=T_txt,
                )
            else:
                # at each boolean of True, increment the time counter (relative to media time)
                text_time = media_locations.cumsum(dim=-1)

            # text time must equal media time if only attending to most immediate image
            # otherwise, as long as text time is greater than media time (if attending to all previous images / media)
            mask_op = torch.eq if self.only_attend_immediate_media else torch.ge

            # Each media time covers n latent tokens, hence the (j n) repeat to
            # align with the flattened key axis above.
            text_to_media_mask = mask_op(
                rearrange(text_time, "b i -> b 1 i 1"),
                repeat(media_time, "j -> 1 1 1 (j n)", n=n),
            )
            sim = sim.masked_fill(~text_to_media_mask, -torch.finfo(sim.dtype).max)

        # Subtract the (detached) row max before softmax for numerical stability.
        sim = sim - sim.amax(dim=-1, keepdim=True).detach()
        attn = sim.softmax(dim=-1)

        if exists(media_locations) and self.only_attend_immediate_media:
            # any text without a preceding media needs to have attention zeroed out
            text_without_media_mask = text_time == 0
            text_without_media_mask = rearrange(text_without_media_mask, "b i -> b 1 i 1")
            attn = attn.masked_fill(text_without_media_mask, 0.0)

        out = einsum("... i j, ... j d -> ... i d", attn, v)
        out = rearrange(out, "b h n d -> b n (h d)")
        return self.to_out(out)
222
+
223
+
224
class GatedCrossAttentionBlock(nn.Module):
    """Flamingo gated cross-attention block: masked text-to-media cross-attention
    followed by a feed-forward layer, each behind a tanh gate initialized at 0
    so the block starts as an identity mapping."""

    def __init__(
        self,
        *,
        dim,
        dim_visual,
        dim_head=64,
        heads=8,
        ff_mult=4,
        only_attend_immediate_media=True,
    ):
        super().__init__()
        self.attn = MaskedCrossAttention(
            dim=dim,
            dim_visual=dim_visual,
            dim_head=dim_head,
            heads=heads,
            only_attend_immediate_media=only_attend_immediate_media,
        )
        # tanh(0) == 0, so both branches contribute nothing until trained.
        self.attn_gate = nn.Parameter(torch.tensor([0.0]))

        self.ff = FeedForward(dim, mult=ff_mult)
        self.ff_gate = nn.Parameter(torch.tensor([0.0]))

    def forward(
        self,
        x,
        media,
        media_locations=None,
        use_cached_media=False,
    ):
        attended = self.attn(
            x,
            media,
            media_locations=media_locations,
            use_cached_media=use_cached_media,
        )
        # Gated residual connections around attention, then feed-forward.
        x = x + attended * self.attn_gate.tanh()
        x = x + self.ff(x) * self.ff_gate.tanh()

        return x
@@ -0,0 +1,47 @@
1
+ """
2
+ Source: https://github.com/mlfoundations/open_flamingo
3
+ """
4
+
5
+
6
def extend_instance(obj, mixin):
    """Apply mixins to a class instance after creation.

    Rebinds obj.__class__ to a freshly-created type with the same name whose
    bases are (mixin, original class) — mixin first so its methods win in the
    MRO, which the forward() override logic relies on.
    """
    original_cls = obj.__class__
    obj.__class__ = type(original_cls.__name__, (mixin, original_cls), {})
13
+
14
+
15
def getattr_recursive(obj, att):
    """
    Return nested attribute of obj
    Example: getattr_recursive(obj, 'a.b.c') is equivalent to obj.a.b.c
    """
    # Empty path means "the object itself" (also the recursion base case).
    if att == "":
        return obj
    head, sep, tail = att.partition(".")
    if not sep:
        # No dot left: plain attribute lookup.
        return getattr(obj, att)
    # Resolve the first segment, then recurse on the remainder.
    return getattr_recursive(getattr(obj, head), tail)
27
+
28
+
29
def setattr_recursive(obj, att, val):
    """
    Set nested attribute of obj
    Example: setattr_recursive(obj, 'a.b.c', val) is equivalent to obj.a.b.c = val
    """
    # Split off the final segment; everything before it is the owner path.
    parent_path, sep, leaf = att.rpartition(".")
    if sep:
        obj = getattr_recursive(obj, parent_path)
    setattr(obj, leaf, val)
+
38
+
39
def apply_with_stopping_condition(module, apply_fn, apply_condition=None, stopping_condition=None, **other_args):
    """Pre-order recursive application of ``apply_fn`` over ``module`` and its children.

    Args:
        module: root object; must expose ``.children()`` yielding sub-modules
            (e.g. ``torch.nn.Module``).
        apply_fn: callable invoked as ``apply_fn(module, **other_args)``.
        apply_condition: optional predicate; ``apply_fn`` runs only where it is
            truthy. ``None`` means "apply everywhere".
        stopping_condition: optional predicate; when truthy for a module, that
            module and its entire subtree are skipped. ``None`` means "never stop".
        **other_args: forwarded verbatim to ``apply_fn`` at every call.
    """
    # Fix: the original called both predicates unconditionally, so leaving them
    # at their default of None raised TypeError. Treating None as "never stop" /
    # "always apply" is backward-compatible for all callers that passed both.
    if stopping_condition is not None and stopping_condition(module):
        return
    if apply_condition is None or apply_condition(module):
        apply_fn(module, **other_args)
    for child in module.children():
        apply_with_stopping_condition(
            child, apply_fn, apply_condition=apply_condition, stopping_condition=stopping_condition, **other_args
        )
@@ -0,0 +1,155 @@
1
+ from threading import Lock
2
+ from typing import List, Optional, Tuple
3
+
4
+ import torch
5
+ from huggingface_hub import hf_hub_download
6
+
7
+ from helm.common.cache import CacheConfig
8
+ from helm.common.hierarchical_logger import hlog, htrack_block
9
+ from helm.common.images_utils import open_image
10
+ from helm.common.gpu_utils import get_torch_device_name
11
+ from helm.common.media_object import TEXT_TYPE
12
+ from helm.common.optional_dependencies import handle_module_not_found_error
13
+ from helm.common.request import Request, RequestResult, GeneratedOutput, Token
14
+ from helm.common.request import wrap_request_time
15
+ from helm.clients.vision_language.open_flamingo import create_model_and_transforms
16
+ from helm.clients.client import CachingClient, generate_uid_for_multimodal_prompt
17
+
18
+ try:
19
+ from PIL import Image
20
+ except ModuleNotFoundError as e:
21
+ handle_module_not_found_error(e, ["images"])
22
+
23
+
24
class OpenFlamingoClient(CachingClient):
    """
    OpenFlamingo is an open source implementation of DeepMind's Flamingo models.
    Implementation following:
    https://github.com/mlfoundations/open_flamingo
    https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b
    """

    END_OF_CHUNK_TOKEN: str = "<|endofchunk|>"
    IMAGE_TOKEN: str = "<image>"

    # Guards the one-time lazy model initialization across threads.
    _model_lock: Lock = Lock()

    def __init__(
        self,
        cache_config: CacheConfig,
        checkpoint_path: Optional[str] = None,
        tokenizer_name: Optional[str] = None,
        cross_attn_every_n_layers: int = 4,
    ):
        super().__init__(cache_config)
        self._device: str = get_torch_device_name()
        self._checkpoint_path: Optional[str] = checkpoint_path
        self._tokenizer_name: Optional[str] = tokenizer_name
        self._cross_attn_every_n_layers: int = cross_attn_every_n_layers

        # Model
        # The model is only initialized when the first request is made
        # This is to avoid loading the model if it is not used
        self._model: Optional[torch.nn.Module] = None

    def _get_model(self):
        """Load the OpenFlamingo model, image processor and tokenizer (once).

        Raises:
            ValueError: when checkpoint path or tokenizer name were not configured.
        """
        if not self._checkpoint_path:
            raise ValueError("OpenFlamingoClient requires a checkpoint path")
        if not self._tokenizer_name:
            raise ValueError("OpenFlamingoClient requires a tokenizer name")
        with htrack_block("Initializing OpenFlamingo model"):
            with self._model_lock:
                self._model, self.image_processor, self.tokenizer = create_model_and_transforms(
                    clip_vision_encoder_path="ViT-L-14",
                    clip_vision_encoder_pretrained="openai",
                    lang_encoder_path=self._tokenizer_name,
                    tokenizer_path=self._tokenizer_name,
                    cross_attn_every_n_layers=self._cross_attn_every_n_layers,
                )
                self.tokenizer.padding_side = "left"
                # Download the fine-tuned checkpoint from the Hugging Face Hub
                # and load weights non-strictly on top of the assembled model.
                checkpoint_path = hf_hub_download(self._checkpoint_path, "checkpoint.pt")
                self._model.load_state_dict(torch.load(checkpoint_path), strict=False)
                self._model = self._model.to(self._device)
                hlog(f"Loaded model to {self._device}.")

    def make_request(self, request: Request) -> RequestResult:
        """Run (cached) multimodal generation for ``request``.

        Builds an interleaved <image>/text prompt from the multimodal prompt,
        runs beam-search generation, and strips the prompt from the outputs.
        """
        assert request.multimodal_prompt is not None, "Multimodal prompt is required"

        # Load model if needed
        if self._model is None:
            self._get_model()

        # Build the prompt
        prompt_text: str = ""
        images: List[Image.Image] = []
        for media_object in request.multimodal_prompt.media_objects:
            if media_object.is_type("image") and media_object.location:
                images.append(open_image(media_object.location))
                # Each image is represented in the text stream by a sentinel token.
                prompt_text += self.IMAGE_TOKEN
            elif media_object.is_type(TEXT_TYPE):
                if media_object.text is None:
                    raise ValueError("MediaObject of text type has missing text field value")
                prompt_text += media_object.text
            else:
                raise ValueError(f"Unrecognized MediaObject type {media_object.type}")

        # Preprocess
        # Stack images into shape expected by the model: (b=1, T_img, F=1, C, H, W).
        vision_x: torch.Tensor = torch.cat([self.image_processor(image).unsqueeze(0) for image in images], dim=0)
        vision_x = vision_x.unsqueeze(1).unsqueeze(0)
        lang_x = self.tokenizer([prompt_text], return_tensors="pt")

        # Generate
        try:
            generation_args = {
                "max_new_tokens": request.max_tokens,
                "n": request.num_completions,
            }

            def do_it():
                # num_beams == num_return_sequences == n: one beam per completion.
                tensors = self._model.generate(
                    vision_x=vision_x.to(self._device),
                    lang_x=lang_x["input_ids"].to(self._device),
                    attention_mask=lang_x["attention_mask"].to(self._device),
                    max_new_tokens=generation_args["max_new_tokens"],
                    num_beams=generation_args["n"],
                    num_return_sequences=generation_args["n"],
                )
                generated_completions: List[Tuple[str, List[str]]] = []
                for tensor in tensors:
                    generated_text: str = self.tokenizer.decode(tensor)
                    raw_tokens: List[str] = self.tokenizer.tokenize(generated_text)
                    generated_completions.append((generated_text, raw_tokens))

                return {"output": generated_completions}

            cache_key = CachingClient.make_cache_key(
                raw_request={
                    "model": request.model,
                    "prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
                    **generation_args,
                },
                request=request,
            )
            result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
        except RuntimeError as ex:
            return RequestResult(success=False, cached=False, error=str(ex), completions=[], embedding=[])

        completions: List[GeneratedOutput] = []
        for text, tokens in result["output"]:
            # Remove the prompt from the generated text
            # NOTE(review): when the decoded text is shorter than the prompt this
            # keeps only the LAST CHARACTER (text[-1]) — looks suspicious; confirm
            # whether the intent was to keep the full text instead.
            text = (
                text[len(prompt_text) :].replace(self.END_OF_CHUNK_TOKEN, "").strip()
                if len(text) >= len(prompt_text)
                else text[-1]
            )
            completions.append(
                GeneratedOutput(text=text, logprob=0, tokens=[Token(text=token, logprob=0) for token in tokens])
            )

        return RequestResult(
            success=True,
            cached=cached,
            request_time=result["request_time"],
            completions=completions,
            embedding=[],
        )
@@ -0,0 +1,171 @@
1
+ from threading import Lock
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ from dataclasses import dataclass
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer
6
+ from transformers.generation import GenerationConfig
7
+
8
+ from helm.common.cache import CacheConfig
9
+ from helm.common.gpu_utils import get_torch_device_name
10
+ from helm.common.hierarchical_logger import hlog, htrack_block
11
+ from helm.common.media_object import TEXT_TYPE
12
+ from helm.common.request import Request, RequestResult, GeneratedOutput, Token
13
+ from helm.common.request import wrap_request_time
14
+ from helm.clients.client import CachingClient, generate_uid_for_multimodal_prompt
15
+
16
+
17
@dataclass(frozen=True)
class LoadedQwenModelProcessor:
    """Loaded model and processor for Qwen."""

    # Hugging Face causal LM (Qwen-VL or Qwen-VL-Chat), loaded with
    # trust_remote_code and already moved to the target device.
    model: AutoModelForCausalLM
    # Matching tokenizer for the same checkpoint.
    tokenizer: AutoTokenizer
+
24
+
25
# Guards lazy, one-time model loading across threads (see QwenVLMClient._get_model).
_models_lock: Lock = Lock()
# Process-wide cache of loaded models keyed by Hugging Face model name;
# each entry stays None until the model is first requested.
_models: Dict[str, Optional[LoadedQwenModelProcessor]] = {
    "Qwen/Qwen-VL": None,
    "Qwen/Qwen-VL-Chat": None,
}
+ }
30
+
31
+
32
class QwenVLMClient(CachingClient):
    """
    From https://huggingface.co/Qwen/Qwen-VL,
    Qwen-VL (Qwen Large Vision Language Model) is the visual multimodal version of the large model series,
    Qwen (abbr. Tongyi Qianwen), proposed by Alibaba Cloud. Qwen-VL accepts image, text, and bounding box
    as inputs, outputs text and bounding box.
    Alibaba released Qwen-VL and Qwen-VL-Chat, which is a chatbot model based on Qwen-VL.

    Paper: https://arxiv.org/abs/2308.12966
    """

    END_OF_TEXT_TOKEN: str = "<|endoftext|>"

    def __init__(self, cache_config: CacheConfig):
        super().__init__(cache_config=cache_config)
        self._device: str = get_torch_device_name()

    def _get_model(self, helm_model_name: str) -> LoadedQwenModelProcessor:
        """Return the (cached) model+tokenizer for a HELM engine name, loading it on first use.

        Raises:
            ValueError: for engine names other than "qwen-vl" / "qwen-vl-chat".
        """
        global _models_lock
        global _models

        # Map HELM engine name to the Hugging Face repository name.
        model_name: str
        if helm_model_name == "qwen-vl-chat":
            model_name = "Qwen/Qwen-VL-Chat"
        elif helm_model_name == "qwen-vl":
            model_name = "Qwen/Qwen-VL"
        else:
            raise ValueError(f"Unhandled model name: {helm_model_name}")

        # Ensure that only one thread is loading the model at a time
        with _models_lock:
            loaded_model_processor = _models[model_name]
            if loaded_model_processor is None:
                hlog(f"Loading model {model_name} and caching in memory...")
                model = AutoModelForCausalLM.from_pretrained(
                    model_name, device_map=self._device, trust_remote_code=True, bf16=True
                ).eval()
                if model_name == "Qwen/Qwen-VL-Chat":
                    # Chat variant ships its own generation config (sampling params etc.).
                    model.generation_config = GenerationConfig.from_pretrained(model_name, trust_remote_code=True)
                tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
                _models[model_name] = LoadedQwenModelProcessor(model, tokenizer)
                loaded_model_processor = _models[model_name]

        assert loaded_model_processor is not None
        return loaded_model_processor

    def make_request(self, request: Request) -> RequestResult:
        """Run (cached) multimodal generation, one model call per requested completion."""
        assert request.multimodal_prompt is not None, "Multimodal prompt is required"

        loaded_model_processor: LoadedQwenModelProcessor = self._get_model(request.model_engine)
        model = loaded_model_processor.model
        tokenizer = loaded_model_processor.tokenizer

        generation_args = {
            "max_length": request.max_tokens,
        }

        # `query` is the Qwen list-format input; `prompt_text` mirrors it as a flat
        # string, used for cache keys/logging and for prompt-stripping below.
        query: List[Dict[str, str]] = []
        prompt_text: str = ""

        image_index: int = 1
        for media_object in request.multimodal_prompt.media_objects:
            if media_object.is_type("image") and media_object.location:
                query.append({"image": media_object.location})
                # Qwen's expected textual representation of an inline image.
                prompt_text += f"Picture {image_index}: <img>{media_object.location}</img>\n"
                image_index += 1
            elif media_object.is_type(TEXT_TYPE):
                if media_object.text is None:
                    raise ValueError("MediaObject of text type has missing text field value")

                query.append({"text": media_object.text})
                prompt_text += media_object.text
            else:
                raise ValueError(f"Unrecognized MediaObject type {media_object.type}")

        completions: List[GeneratedOutput] = []
        request_time: float = 0
        request_datetime: Optional[int] = None
        all_cached: bool = True

        with htrack_block(f"Generating for prompt: {prompt_text}"):
            for completion_index in range(request.num_completions):
                try:

                    def do_it() -> Dict[str, Any]:
                        # Chat variant uses the model.chat() helper; base variant
                        # goes through tokenize -> generate -> decode.
                        if request.model_engine == "qwen-vl-chat":
                            completion, _ = model.chat(tokenizer, query=tokenizer.from_list_format(query), history=None)
                        else:
                            inputs = tokenizer(tokenizer.from_list_format(query), return_tensors="pt")
                            inputs = inputs.to(self._device)
                            pred = model.generate(**inputs, **generation_args)
                            completion = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)

                        tokens: List[str] = tokenizer.tokenize(completion)
                        return {"output": (completion, tokens)}

                    # Include the prompt and model name in the cache key
                    cache_key = CachingClient.make_cache_key(
                        raw_request={
                            "completion_index": completion_index,
                            "model": request.model,
                            "prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
                            **generation_args,
                        },
                        request=request,
                    )
                    result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
                except RuntimeError as model_error:
                    return RequestResult(
                        success=False, cached=False, error=str(model_error), completions=[], embedding=[]
                    )

                text, tokens = result["output"]

                # Truncate the output text as the original Qwen includes the prompt in the output sequence
                if request.model_engine == "qwen-vl":
                    text = text[len(prompt_text) :]
                    text = text.replace(self.END_OF_TEXT_TOKEN, "")
                    hlog(f"Truncated: {text}")

                # Tokenize truncated text to get the list of tokens
                completions.append(
                    GeneratedOutput(
                        text=text, logprob=0, tokens=[Token(text=str(token), logprob=0) for token in tokens]
                    )
                )

                request_time += result["request_time"]
                # Use the datetime from the first completion because that's when the request was fired
                request_datetime = request_datetime or result.get("request_datetime")
                all_cached = all_cached and cached

        return RequestResult(
            success=True,
            cached=all_cached,
            request_time=request_time,
            request_datetime=request_datetime,
            completions=completions,
            embedding=[],
        )
@@ -0,0 +1,46 @@
1
+ from typing import Any, Dict, Optional
2
+
3
+ from helm.common.cache import CacheConfig
4
+ from helm.common.request import Request
5
+ from helm.clients.openai_client import OpenAIClient
6
+ from helm.tokenizers.tokenizer import Tokenizer
7
+
8
+
9
class VLLMClient(OpenAIClient):
    """Sends request to a vLLM server using the OpenAI-compatible API.

    See: https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server"""

    def __init__(
        self,
        tokenizer: Tokenizer,
        tokenizer_name: str,
        cache_config: CacheConfig,
        base_url: Optional[str] = None,
    ):
        # vLLM's OpenAI-compatible server does not check credentials,
        # so a placeholder API key and no organization are used.
        super().__init__(
            tokenizer=tokenizer,
            tokenizer_name=tokenizer_name,
            cache_config=cache_config,
            api_key="EMPTY",
            org_id=None,
            base_url=base_url,
        )
        self.tokenizer = tokenizer
        self.tokenizer_name = tokenizer_name

    def _is_chat_model_engine(self, model_engine: str) -> bool:
        """Only vLLM completion models are supported for now, so never treat as chat."""
        return False

    def _get_model_for_request(self, request: Request) -> str:
        """vLLM expects the full model name including the creator organization,
        unlike OpenAI which only uses the model engine."""
        return request.model

    def _to_raw_completion_request(self, request: Request) -> Dict[str, Any]:
        raw_request = super()._to_raw_completion_request(request)
        # vLLM errors with "best_of must be 1 when using greedy sampling",
        # so clamp any larger value down to 1.
        if raw_request.get("best_of", 1) > 1:
            raw_request["best_of"] = 1
        return raw_request