crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of crfm-helm was flagged as potentially problematic.

Files changed (499)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
  2. crfm_helm-0.5.1.dist-info/RECORD +654 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  11. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  12. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  13. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  14. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  15. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  16. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  17. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  18. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  19. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  20. helm/benchmark/adaptation/request_state.py +6 -1
  21. helm/benchmark/adaptation/scenario_state.py +6 -2
  22. helm/benchmark/annotation/annotator.py +43 -0
  23. helm/benchmark/annotation/annotator_factory.py +61 -0
  24. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  25. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  26. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  27. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  28. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  29. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  30. helm/benchmark/annotation_executor.py +124 -0
  31. helm/benchmark/augmentations/data_augmenter.py +0 -2
  32. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  33. helm/benchmark/augmentations/perturbation.py +25 -3
  34. helm/benchmark/augmentations/perturbation_description.py +1 -1
  35. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  36. helm/benchmark/augmentations/test_perturbation.py +41 -7
  37. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  38. helm/benchmark/config_registry.py +7 -1
  39. helm/benchmark/executor.py +46 -16
  40. helm/benchmark/huggingface_registration.py +20 -7
  41. helm/benchmark/metrics/basic_metrics.py +169 -664
  42. helm/benchmark/metrics/bbq_metrics.py +3 -4
  43. helm/benchmark/metrics/bias_metrics.py +6 -6
  44. helm/benchmark/metrics/classification_metrics.py +11 -8
  45. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  46. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  47. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  48. helm/benchmark/metrics/common_metric_specs.py +167 -0
  49. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  50. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  51. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  52. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  53. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  54. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  55. helm/benchmark/metrics/efficiency_metrics.py +213 -0
  56. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  57. helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
  58. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  59. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  60. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  61. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  62. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  63. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  64. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  65. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  66. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  68. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  69. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  70. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  71. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  72. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  73. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  74. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  75. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  76. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  77. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  78. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  79. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  80. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  81. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  82. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  83. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  84. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  85. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  86. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  87. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  88. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  89. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  90. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  91. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  92. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  93. helm/benchmark/metrics/metric.py +93 -172
  94. helm/benchmark/metrics/metric_name.py +0 -1
  95. helm/benchmark/metrics/metric_service.py +16 -0
  96. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  97. helm/benchmark/metrics/ranking_metrics.py +2 -2
  98. helm/benchmark/metrics/reference_metric.py +148 -0
  99. helm/benchmark/metrics/summac/model_summac.py +0 -2
  100. helm/benchmark/metrics/summarization_metrics.py +2 -2
  101. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  102. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  103. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  104. helm/benchmark/metrics/test_metric.py +2 -2
  105. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  106. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  107. helm/benchmark/metrics/toxicity_utils.py +23 -0
  108. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  109. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  110. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  111. helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
  112. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  113. helm/benchmark/model_deployment_registry.py +74 -0
  114. helm/benchmark/model_metadata_registry.py +41 -1
  115. helm/benchmark/multi_gpu_runner.py +133 -0
  116. helm/benchmark/presentation/create_plots.py +8 -7
  117. helm/benchmark/presentation/run_display.py +26 -10
  118. helm/benchmark/presentation/schema.py +15 -40
  119. helm/benchmark/presentation/summarize.py +119 -79
  120. helm/benchmark/presentation/table.py +8 -8
  121. helm/benchmark/presentation/test_contamination.py +2 -2
  122. helm/benchmark/presentation/test_run_entry.py +1 -2
  123. helm/benchmark/presentation/test_summarize.py +3 -3
  124. helm/benchmark/run.py +54 -26
  125. helm/benchmark/run_expander.py +205 -35
  126. helm/benchmark/run_spec.py +93 -0
  127. helm/benchmark/run_spec_factory.py +163 -0
  128. helm/benchmark/run_specs/__init__.py +0 -0
  129. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  130. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  131. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  132. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  133. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  134. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  135. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  136. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  137. helm/benchmark/run_specs/vlm_run_specs.py +757 -0
  138. helm/benchmark/runner.py +51 -57
  139. helm/benchmark/runner_config_registry.py +21 -0
  140. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  141. helm/benchmark/scenarios/bold_scenario.py +2 -2
  142. helm/benchmark/scenarios/code_scenario.py +1 -0
  143. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  144. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  145. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  146. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  147. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  148. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  149. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  150. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  151. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  152. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  153. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  154. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  155. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  156. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  157. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  158. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  159. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  160. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  162. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  163. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  164. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  165. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  166. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  167. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  168. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  169. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  170. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  171. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  172. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  173. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  174. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  175. helm/benchmark/scenarios/math_scenario.py +19 -2
  176. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  177. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  178. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  179. helm/benchmark/scenarios/scenario.py +4 -0
  180. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  181. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  182. helm/benchmark/scenarios/test_scenario.py +6 -3
  183. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  184. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  185. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  186. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  187. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  188. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  189. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  190. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  191. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  192. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  193. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  194. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  198. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  199. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  200. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  201. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  202. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  203. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  204. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  205. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  206. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  207. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  208. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  209. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  210. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  211. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  212. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  213. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  214. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  215. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  216. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  217. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  218. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  219. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
  220. helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
  221. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  222. helm/benchmark/server.py +24 -1
  223. helm/benchmark/slurm_runner.py +70 -49
  224. helm/benchmark/static/benchmarking.js +1 -1
  225. helm/benchmark/static/schema_classic.yaml +258 -1066
  226. helm/benchmark/static/schema_image2structure.yaml +304 -0
  227. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  228. helm/benchmark/static/schema_lite.yaml +2 -227
  229. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  230. helm/benchmark/static/schema_unitxt.yaml +428 -0
  231. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  232. helm/benchmark/static/schema_vlm.yaml +823 -0
  233. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  234. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  235. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  236. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  237. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  238. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  239. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  240. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  241. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  242. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  243. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  244. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  245. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  246. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  247. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  248. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  249. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  250. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  251. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  252. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  253. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  254. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  255. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  256. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  257. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  258. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  259. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  260. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  261. helm/benchmark/static_build/config.js +4 -0
  262. helm/benchmark/static_build/index.html +20 -0
  263. helm/benchmark/test_data_preprocessor.py +3 -3
  264. helm/benchmark/test_run_expander.py +1 -1
  265. helm/benchmark/window_services/ai21_window_service.py +22 -33
  266. helm/benchmark/window_services/cohere_window_service.py +1 -63
  267. helm/benchmark/window_services/default_window_service.py +2 -44
  268. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  269. helm/benchmark/window_services/ice_window_service.py +0 -34
  270. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  271. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  272. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  273. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  274. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  275. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  276. helm/benchmark/window_services/local_window_service.py +21 -4
  277. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  278. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  279. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  280. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  281. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  282. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  283. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  284. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  285. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  286. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  287. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  288. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  289. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  290. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  291. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  292. helm/benchmark/window_services/test_utils.py +3 -2
  293. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  294. helm/benchmark/window_services/window_service.py +42 -0
  295. helm/benchmark/window_services/window_service_factory.py +4 -1
  296. helm/benchmark/window_services/yalm_window_service.py +0 -27
  297. helm/clients/__init__.py +0 -0
  298. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  299. helm/clients/aleph_alpha_client.py +112 -0
  300. helm/{proxy/clients → clients}/anthropic_client.py +233 -18
  301. helm/{proxy/clients → clients}/auto_client.py +59 -31
  302. helm/clients/bedrock_client.py +128 -0
  303. helm/clients/bedrock_utils.py +72 -0
  304. helm/{proxy/clients → clients}/client.py +65 -7
  305. helm/clients/clip_score_client.py +49 -0
  306. helm/clients/clip_scorers/__init__.py +0 -0
  307. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  308. helm/clients/clip_scorers/clip_scorer.py +50 -0
  309. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  310. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  311. helm/clients/gcs_client.py +82 -0
  312. helm/{proxy/clients → clients}/google_client.py +5 -5
  313. helm/clients/google_translate_client.py +35 -0
  314. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  315. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  316. helm/clients/image_generation/__init__.py +0 -0
  317. helm/clients/image_generation/adobe_vision_client.py +78 -0
  318. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  319. helm/clients/image_generation/cogview2/__init__.py +0 -0
  320. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  321. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  322. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  323. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  324. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  325. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  326. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  327. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  328. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  329. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  330. helm/clients/image_generation/cogview2_client.py +191 -0
  331. helm/clients/image_generation/dalle2_client.py +192 -0
  332. helm/clients/image_generation/dalle3_client.py +108 -0
  333. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  334. helm/clients/image_generation/dalle_mini/data.py +442 -0
  335. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  336. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  337. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  338. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  339. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  340. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  341. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  342. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  343. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  344. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  345. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  346. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  347. helm/clients/image_generation/dalle_mini_client.py +190 -0
  348. helm/clients/image_generation/deep_floyd_client.py +78 -0
  349. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  350. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  351. helm/clients/image_generation/lexica_client.py +86 -0
  352. helm/clients/image_generation/mindalle/__init__.py +0 -0
  353. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  354. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  355. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  356. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  357. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  358. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  359. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  360. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  361. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  362. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  363. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  364. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  365. helm/clients/image_generation/mindalle_client.py +115 -0
  366. helm/clients/image_generation/nudity_check_client.py +64 -0
  367. helm/clients/image_generation/together_image_generation_client.py +111 -0
  368. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  369. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  370. helm/clients/mistral_client.py +134 -0
  371. helm/clients/moderation_api_client.py +109 -0
  372. helm/clients/open_lm_client.py +43 -0
  373. helm/clients/openai_client.py +301 -0
  374. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  375. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  376. helm/clients/simple_client.py +64 -0
  377. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  378. helm/clients/test_client.py +100 -0
  379. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  380. helm/clients/test_simple_client.py +19 -0
  381. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  382. helm/{proxy/clients → clients}/together_client.py +104 -73
  383. helm/clients/vertexai_client.py +400 -0
  384. helm/clients/vision_language/__init__.py +0 -0
  385. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  386. helm/clients/vision_language/huggingface_vlm_client.py +111 -0
  387. helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
  388. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  389. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  390. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  391. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  392. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  393. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  394. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  395. helm/clients/vision_language/open_flamingo_client.py +155 -0
  396. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  397. helm/clients/vllm_client.py +46 -0
  398. helm/common/cache.py +16 -4
  399. helm/common/cache_backend_config.py +47 -0
  400. helm/common/clip_score_request.py +41 -0
  401. helm/common/file_caches/__init__.py +0 -0
  402. helm/common/file_caches/file_cache.py +16 -0
  403. helm/common/file_caches/local_file_cache.py +61 -0
  404. helm/common/file_caches/test_local_file_cache.py +25 -0
  405. helm/common/file_upload_request.py +27 -0
  406. helm/common/general.py +1 -1
  407. helm/common/image_generation_parameters.py +25 -0
  408. helm/common/images_utils.py +33 -3
  409. helm/common/key_value_store.py +35 -4
  410. helm/common/media_object.py +13 -0
  411. helm/common/moderations_api_request.py +71 -0
  412. helm/common/mongo_key_value_store.py +3 -3
  413. helm/common/multimodal_request_utils.py +31 -0
  414. helm/common/nudity_check_request.py +29 -0
  415. helm/common/request.py +15 -17
  416. helm/common/test_general.py +6 -0
  417. helm/common/tokenization_request.py +1 -1
  418. helm/config/model_deployments.yaml +1159 -538
  419. helm/config/model_metadata.yaml +868 -41
  420. helm/config/tokenizer_configs.yaml +149 -43
  421. helm/proxy/accounts.py +31 -4
  422. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  423. helm/proxy/critique/model_critique_client.py +8 -6
  424. helm/proxy/example_queries.py +29 -17
  425. helm/proxy/server.py +70 -5
  426. helm/proxy/services/remote_service.py +31 -0
  427. helm/proxy/services/server_service.py +96 -16
  428. helm/proxy/services/service.py +30 -0
  429. helm/proxy/services/test_remote_service.py +4 -3
  430. helm/proxy/services/test_service.py +0 -12
  431. helm/proxy/test_accounts.py +32 -0
  432. helm/proxy/token_counters/auto_token_counter.py +37 -37
  433. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  434. helm/proxy/token_counters/token_counter.py +3 -5
  435. helm/tokenizers/__init__.py +0 -0
  436. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  437. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  438. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  439. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  440. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  441. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  442. helm/tokenizers/simple_tokenizer.py +33 -0
  443. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  444. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  445. helm/tokenizers/test_simple_tokenizer.py +33 -0
  446. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  447. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  448. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  449. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  450. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  451. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  452. helm/benchmark/run_specs.py +0 -2762
  453. helm/benchmark/test_model_deployment_definition.py +0 -92
  454. helm/benchmark/test_model_properties.py +0 -1570
  455. helm/benchmark/vlm_run_specs.py +0 -97
  456. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  457. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  458. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  459. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  460. helm/benchmark/window_services/t511b_window_service.py +0 -30
  461. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  462. helm/benchmark/window_services/ul2_window_service.py +0 -30
  463. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  464. helm/common/cache_utils.py +0 -14
  465. helm/proxy/clients/aleph_alpha_client.py +0 -95
  466. helm/proxy/clients/goose_ai_client.py +0 -99
  467. helm/proxy/clients/microsoft_client.py +0 -180
  468. helm/proxy/clients/openai_client.py +0 -206
  469. helm/proxy/clients/simple_client.py +0 -60
  470. helm/proxy/clients/test_client.py +0 -49
  471. helm/proxy/clients/vertexai_client.py +0 -115
  472. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  473. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  474. helm/proxy/token_counters/free_token_counter.py +0 -12
  475. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  476. helm/proxy/token_counters/openai_token_counter.py +0 -22
  477. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  478. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  479. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  480. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  481. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  482. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
  483. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  484. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  485. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  486. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  487. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  488. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  489. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  490. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  491. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  492. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  493. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  494. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  495. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  496. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  497. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  498. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  499. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
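
Two package-level moves dominate this file list: helm/proxy/clients → helm/clients and helm/proxy/tokenizers → helm/tokenizers. As a minimal sketch of the migration for downstream code (the module names are taken from the entries above; the class names follow HELM's usual naming and should be verified against 0.5.1):

    # 0.4.0: client and tokenizer modules lived under helm.proxy
    # from helm.proxy.clients.together_client import TogetherClient
    # from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

    # 0.5.1: they are top-level packages
    from helm.clients.together_client import TogetherClient
    from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer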
helm/{proxy/clients → clients}/vision_language/idefics_client.py

@@ -1,5 +1,5 @@
 from threading import Lock
-from typing import Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import torch
 from dataclasses import dataclass
@@ -8,17 +8,14 @@ from transformers import IdeficsForVisionText2Text, AutoProcessor, IdeficsProces
 from helm.common.cache import CacheConfig
 from helm.common.images_utils import open_image
 from helm.common.gpu_utils import get_torch_device_name
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
-from helm.common.request import Request, RequestResult, Sequence, Token
-from helm.common.tokenization_request import (
-    TokenizationRequest,
-    TokenizationRequestResult,
-)
+from helm.common.request import Request, RequestResult, GeneratedOutput, Token
+from helm.common.tokenization_request import TokenizationRequest
 from helm.common.request import wrap_request_time
-from helm.proxy.clients.client import CachingClient, generate_uid_for_multimodal_prompt
-from helm.proxy.tokenizers.tokenizer import Tokenizer
+from helm.clients.client import CachingClient, generate_uid_for_multimodal_prompt
+from helm.tokenizers.tokenizer import Tokenizer
 
 try:
     from PIL import Image
@@ -54,6 +51,8 @@ class IDEFICSClient(CachingClient):
     END_OF_UTTERANCE_TOKEN: str = "<end_of_utterance>"
     BAD_WORD_TOKENS: List[str] = ["<image>", "<fake_token_around_image>"]
 
+    ASSISTANT_PREFIX: str = "Assistant: "
+
     def __init__(self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig):
         super().__init__(cache_config=cache_config)
         self.tokenizer = tokenizer
@@ -69,8 +68,8 @@ class IDEFICSClient(CachingClient):
         loaded_model_processor = _models[checkpoint]
         if loaded_model_processor is None:
             hlog(f"Loading model {checkpoint} and caching in memory...")
-            model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16).to(
-                self._device
+            model = IdeficsForVisionText2Text.from_pretrained(
+                checkpoint, torch_dtype=torch.bfloat16, device_map="auto"
             )
             processor = AutoProcessor.from_pretrained(checkpoint)
             _models[checkpoint] = LoadedIDEFICSModelProcessor(model, processor)
@@ -89,7 +88,7 @@ class IDEFICSClient(CachingClient):
 
         input_args: Dict[str, Union[str, bool]] = {"return_tensors": "pt"}
         generation_args = {
-            "max_length": request.max_tokens,
+            "max_new_tokens": request.max_tokens,
             "bad_words_ids": processor.tokenizer(self.BAD_WORD_TOKENS, add_special_tokens=False).input_ids,
         }
 
@@ -112,43 +111,49 @@
                 raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
         prompt_text: str = request.multimodal_prompt.text.replace(self.END_OF_UTTERANCE_TOKEN, " ")
 
-        try:
-
-            def do_it():
-                inputs = processor(multimodal_prompt, **input_args).to(self._device)
-                generated_ids = model.generate(**inputs, **generation_args)
-                generated_text: str = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-                assert generated_text.startswith(
-                    prompt_text
-                ), f"Generated text: {generated_text} does not start with prompt: {prompt_text}"
-
-                # Remove the prompt from the generated text
-                generated_text = generated_text[len(prompt_text) :].strip()
-                return {"output": generated_text}
-
-            # Include the prompt and model name in the cache key
-            cache_key = CachingClient.make_cache_key(
-                raw_request={
-                    "model": request.model,
-                    "prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
-                    **generation_args,
-                },
-                request=request,
-            )
-            result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
-        except RuntimeError as e:
-            return RequestResult(success=False, cached=False, error=str(e), completions=[], embedding=[])
-
-        # TODO: Support multiple completions and figure out how get the log probs
-        # TODO: Does it make sense to support echo? Include these params in the cache key.
-        # TODO: Together might support this model so use the TogetherClient
-        tokenization_result: TokenizationRequestResult = self.tokenizer.tokenize(
-            TokenizationRequest(result["output"], tokenizer=self.tokenizer_name)
-        )
-        tokens: List[Token] = [
-            Token(text=str(text), logprob=0, top_logprobs={}) for text in tokenization_result.raw_tokens
-        ]
-        completions: List[Sequence] = [Sequence(text=result["output"], logprob=0, tokens=tokens)]
+        completions: List[GeneratedOutput] = []
+        with htrack_block(f"Generating for prompt: {prompt_text}"):
+            try:
+
+                def do_it() -> Dict[str, Any]:
+                    inputs = processor([multimodal_prompt] * request.num_completions, **input_args).to(self._device)
+                    generated_ids = model.generate(**inputs, **generation_args)
+                    generated_text: List[str] = processor.batch_decode(generated_ids, skip_special_tokens=True)
+                    return {"output": generated_text}
+
+                # Include the prompt and model name in the cache key
+                cache_key = CachingClient.make_cache_key(
+                    raw_request={
+                        "n": request.num_completions,
+                        "model": request.model,
+                        "prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
+                        **generation_args,
+                    },
+                    request=request,
+                )
+                result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+            except RuntimeError as model_error:
+                return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])
+
+            for text in result["output"]:
+                hlog(f"Generated text: {text}")
+
+                # Truncate the output text as IDEFICS outputs the entire sequence including the prompt
+                if "instruct" in request.model:
+                    assert self.ASSISTANT_PREFIX in text, f"Expected {self.ASSISTANT_PREFIX} in the output: {text}"
+                    text = text.rpartition(self.ASSISTANT_PREFIX)[-1]
+                else:
+                    # Best we can do is to remove the text portion of the prompt from the output
+                    text = text[len(prompt_text) :]
+
+                # Tokenize truncated text to get the list of tokens
+                hlog(f"Truncated: {text}")
+                tokenization_result = self.tokenizer.tokenize(
+                    TokenizationRequest(text, tokenizer=self.tokenizer_name, encode=False)
+                )
+                tokens: List[Token] = [Token(text=str(text), logprob=0) for text in tokenization_result.raw_tokens]
+                completions.append(GeneratedOutput(text=text, logprob=0, tokens=tokens))
+
         return RequestResult(
             success=True,
             cached=cached,
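
Besides the helm.proxy.clients → helm.clients move, the hunk above records two API renames in helm.common.request between 0.4.0 and 0.5.1: completions are now GeneratedOutput objects rather than Sequence, and Token no longer carries a top_logprobs field. A minimal sketch of constructing a completion against the new types, using only the keyword arguments visible in the diff:

    from helm.common.request import GeneratedOutput, Token

    # Token(text=..., logprob=...) replaces Token(text=..., logprob=..., top_logprobs=...);
    # GeneratedOutput replaces Sequence and keeps the same text/logprob/tokens fields.
    tokens = [Token(text="Hello", logprob=0), Token(text="!", logprob=0)]
    completion = GeneratedOutput(text="Hello!", logprob=0, tokens=tokens)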
helm/clients/vision_language/open_flamingo/__init__.py (new file)

@@ -0,0 +1,2 @@
+from .src.flamingo import Flamingo
+from .src.factory import create_model_and_transforms
helm/clients/vision_language/open_flamingo/src/factory.py (new file)

@@ -0,0 +1,147 @@
+"""
+Source: https://github.com/mlfoundations/open_flamingo
+"""
+
+from typing import Optional
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from helm.common.general import handle_module_not_found_error
+from .flamingo import Flamingo
+from .flamingo_lm import FlamingoLMMixin
+from .utils import extend_instance
+
+
+def create_model_and_transforms(
+    clip_vision_encoder_path: str,
+    clip_vision_encoder_pretrained: str,
+    lang_encoder_path: str,
+    tokenizer_path: str,
+    cross_attn_every_n_layers: int = 1,
+    use_local_files: bool = False,
+    decoder_layers_attr_name: str = None,
+    freeze_lm_embeddings: bool = False,
+    cache_dir: Optional[str] = None,
+    **flamingo_kwargs,
+):
+    """
+    Initialize a Flamingo model from a pretrained vision encoder and language encoder.
+    Appends special tokens to the tokenizer and freezes backbones.
+
+    Args:
+        clip_vision_encoder_path (str): path to pretrained clip model (e.g. "ViT-B-32")
+        clip_vision_encoder_pretrained (str): name of pretraining dataset for clip model (e.g. "laion2b_s32b_b79k")
+        lang_encoder_path (str): path to pretrained language encoder
+        tokenizer_path (str): path to pretrained tokenizer
+        cross_attn_every_n_layers (int, optional): determines how often to add a cross-attention layer. Defaults to 1.
+        use_local_files (bool, optional): whether to use local files. Defaults to False.
+        decoder_layers_attr_name (str, optional): name of the decoder layers attribute. Defaults to None.
+        freeze_lm_embeddings (bool, optional): whether to freeze LM input embeddings when configuring Perceiver.
+        cache_dir (str, optional): path to cache directory for downloading OpenClip/HF weights.
+    Returns:
+        Flamingo: Flamingo model from pretrained vision and language encoders
+        Image processor: Pipeline to preprocess input images
+        Tokenizer: A tokenizer for the language model
+    """
+    try:
+        import open_clip
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["vlm"])
+
+    vision_encoder, _, image_processor = open_clip.create_model_and_transforms(
+        clip_vision_encoder_path,
+        pretrained=clip_vision_encoder_pretrained,
+        cache_dir=cache_dir,
+    )
+    # set the vision encoder to output the visual features
+    vision_encoder.visual.output_tokens = True
+
+    text_tokenizer = AutoTokenizer.from_pretrained(
+        tokenizer_path,
+        local_files_only=use_local_files,
+        trust_remote_code=True,
+        cache_dir=cache_dir,
+    )
+    # add Flamingo special tokens to the tokenizer
+    text_tokenizer.add_special_tokens({"additional_special_tokens": ["<|endofchunk|>", "<image>"]})
+    if text_tokenizer.pad_token is None:
+        # Issue: GPT models don't have a pad token, which we use to
+        # modify labels for the loss.
+        text_tokenizer.add_special_tokens({"pad_token": "<PAD>"})
+
+    lang_encoder = AutoModelForCausalLM.from_pretrained(
+        lang_encoder_path,
+        local_files_only=use_local_files,
+        trust_remote_code=True,
+        cache_dir=cache_dir,
+    )
+
+    # hacks for MPT-1B, which doesn't have a get_input_embeddings method
+    if "mpt-1b-redpajama-200b" in lang_encoder_path:
+
+        class EmbeddingFnMixin:
+            def get_input_embeddings(self):
+                return self.transformer.wte
+
+            def set_input_embeddings(self, new_embeddings):
+                self.transformer.wte = new_embeddings
+
+        extend_instance(lang_encoder, EmbeddingFnMixin)
+
+    # convert LM to FlamingoLM
+    extend_instance(lang_encoder, FlamingoLMMixin)
+
+    if decoder_layers_attr_name is None:
+        decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
+    lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)
+    lang_encoder.resize_token_embeddings(len(text_tokenizer))
+
+    model = Flamingo(
+        vision_encoder,
+        lang_encoder,
+        text_tokenizer.encode("<|endofchunk|>")[-1],
+        text_tokenizer.encode("<image>")[-1],
+        vis_dim=open_clip.get_model_config(clip_vision_encoder_path)["vision_cfg"]["width"],
+        cross_attn_every_n_layers=cross_attn_every_n_layers,
+        **flamingo_kwargs,
+    )
+
+    # Freeze all parameters
+    model.requires_grad_(False)
+    assert sum(p.numel() for p in model.parameters() if p.requires_grad) == 0
+
+    # Unfreeze perceiver, gated_cross_attn_layers, and LM input embeddings
+    model.perceiver.requires_grad_(True)
+    model.lang_encoder.gated_cross_attn_layers.requires_grad_(True)
+    if not freeze_lm_embeddings:
+        model.lang_encoder.get_input_embeddings().requires_grad_(True)
+        # TODO: investigate also training the output embeddings when untied
+
+    print(
+        f"Flamingo model initialized with {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable parameters"
+    )
+
+    return model, image_processor, text_tokenizer
+
+
+def _infer_decoder_layers_attr_name(model):
+    for k in __KNOWN_DECODER_LAYERS_ATTR_NAMES:
+        if k.lower() in model.__class__.__name__.lower():
+            return __KNOWN_DECODER_LAYERS_ATTR_NAMES[k]
+
+    raise ValueError(
+        "We require the attribute name for the nn.ModuleList in the decoder storing the transformer block layers. "
+        "Please supply this string manually."
+    )
+
+
+__KNOWN_DECODER_LAYERS_ATTR_NAMES = {
+    "opt": "model.decoder.layers",
+    "gptj": "transformer.h",
+    "gpt-j": "transformer.h",
+    "pythia": "gpt_neox.layers",
+    "llama": "model.layers",
+    "gptneoxforcausallm": "gpt_neox.layers",
+    "mpt": "transformer.blocks",
+    "mosaicgpt": "transformer.blocks",
+}
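
For orientation, the sketch below calls create_model_and_transforms as defined above. The CLIP encoder pair ("ViT-B-32" / "laion2b_s32b_b79k") comes from the function's own docstring examples; the language-model and tokenizer paths are assumed Hugging Face checkpoint names, not something this diff prescribes:

    from helm.clients.vision_language.open_flamingo import create_model_and_transforms

    # "mosaicml/mpt-1b-redpajama-200b" is an assumed checkpoint name; note that the
    # factory special-cases paths containing "mpt-1b-redpajama-200b" (see above).
    model, image_processor, tokenizer = create_model_and_transforms(
        clip_vision_encoder_path="ViT-B-32",
        clip_vision_encoder_pretrained="laion2b_s32b_b79k",
        lang_encoder_path="mosaicml/mpt-1b-redpajama-200b",
        tokenizer_path="mosaicml/mpt-1b-redpajama-200b",
        cross_attn_every_n_layers=1,
    )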
helm/clients/vision_language/open_flamingo/src/flamingo.py (new file)

@@ -0,0 +1,337 @@
+"""
+Source: https://github.com/mlfoundations/open_flamingo
+"""
+
+import torch
+from einops import rearrange
+from torch import nn
+from .helpers import PerceiverResampler
+from torch.distributed.fsdp.wrap import (
+    enable_wrap,
+    wrap,
+)
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from torch.distributed.fsdp import (
+    FullyShardedDataParallel as FSDP,
+)
+
+from .utils import apply_with_stopping_condition
+
+
+class Flamingo(nn.Module):
+    def __init__(
+        self,
+        vision_encoder: nn.Module,
+        lang_encoder: nn.Module,
+        eoc_token_id: int,
+        media_token_id: int,
+        vis_dim: int,
+        cross_attn_every_n_layers: int = 1,
+        gradient_checkpointing: bool = False,
+    ):
+        """
+        Args:
+            vision_encoder (nn.Module): HF CLIPModel
+            lang_encoder (nn.Module): HF causal language model
+            eoc_token_id (int): Token id for <|endofchunk|>
+            media_token_id (int): Token id for <image>
+            vis_dim (int): Dimension of the visual features.
+                Visual features are projected to match this shape along the last dimension.
+            cross_attn_every_n_layers (int, optional): How often to apply cross attention after transformer layer. Defaults to 1.
+        """
+        super().__init__()
+        self.eoc_token_id = eoc_token_id
+        self.media_token_id = media_token_id
+        self.vis_dim = vis_dim
+        if hasattr(lang_encoder.config, "d_model"):
+            self.lang_dim = lang_encoder.config.d_model  # mpt uses d_model
+        else:
+            self.lang_dim = lang_encoder.config.hidden_size
+
+        self.vision_encoder = vision_encoder.visual
+        self.perceiver = PerceiverResampler(dim=self.vis_dim)
+        self.lang_encoder = lang_encoder
+        self.lang_encoder.init_flamingo(
+            media_token_id=media_token_id,
+            lang_hidden_size=self.lang_dim,
+            vis_hidden_size=self.vis_dim,
+            cross_attn_every_n_layers=cross_attn_every_n_layers,
+            gradient_checkpointing=gradient_checkpointing,
+        )
+        self._use_gradient_checkpointing = gradient_checkpointing
+        self.perceiver._use_gradient_checkpointing = gradient_checkpointing
+
+    def forward(
+        self,
+        vision_x: torch.Tensor,
+        lang_x: torch.Tensor,
+        attention_mask: torch.Tensor = None,
+        labels: torch.Tensor = None,
+        clear_conditioned_layers: bool = True,
+        past_key_values=None,
+        use_cache: bool = False,
+    ):
+        """
+        Forward pass of Flamingo.
+
+        Args:
+            vision_x (torch.Tensor): Vision input
+                shape (B, T_img, F, C, H, W) with F=1
+            lang_x (torch.Tensor): Language input ids
+                shape (B, T_txt)
+            attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
+            labels (torch.Tensor, optional): Labels. Defaults to None.
+            clear_conditioned_layers: if True, clear the conditioned layers
+                once the foward pass is completed. Set this to false if the
+                same set of images will be reused in another subsequent
+                forward pass.
+            past_key_values: pre-computed values to pass to language model.
+                See past_key_values documentation in Hugging Face
+                CausalLM models.
+            use_cache: whether to use cached key values. See use_cache
+                documentation in Hugging Face CausalLM models.
+        """
+        assert (
+            self.lang_encoder.initialized_flamingo
+        ), "Flamingo layers are not initialized. Please call `init_flamingo` first."
+
+        assert (
+            self.lang_encoder._use_cached_vision_x or vision_x is not None
+        ), "Must provide either vision_x or have precached media using cache_media()."
+
+        if self.lang_encoder._use_cached_vision_x:
+            # Case: use cached; vision_x should be cached and other
+            # vision-related inputs should not be provided.
+            assert (
+                vision_x is None
+            ), "Expect vision_x to be None when media has been cached using cache_media(). Try uncache_media() first."
+            assert self.lang_encoder.is_conditioned()
+
+        else:
+            # Case: do not use caching (i.e. this is a standard forward pass);
+            self._encode_vision_x(vision_x=vision_x)
+            self._condition_media_locations(input_ids=lang_x)
+
+        output = self.lang_encoder(
+            input_ids=lang_x,
+            attention_mask=attention_mask,
+            labels=labels,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+        )
+
+        if clear_conditioned_layers:
+            self.lang_encoder.clear_conditioned_layers()
+
+        return output
+
+    def generate(
+        self,
+        vision_x: torch.Tensor,
+        lang_x: torch.Tensor,
+        attention_mask: torch.Tensor = None,
+        **kwargs,
+    ):
+        """
+        Generate text conditioned on vision and language inputs.
+
+        Args:
+            vision_x (torch.Tensor): Vision input
+                shape (B, T_img, F, C, H, W)
+                images in the same chunk are collated along T_img, and frames are collated along F
+                currently only F=1 is supported (single-frame videos)
+            lang_x (torch.Tensor): Language input
+                shape (B, T_txt)
+            **kwargs: see generate documentation in Hugging Face CausalLM models. Some notable kwargs:
+                max_length (int, optional): Maximum length of the output. Defaults to None.
+                attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
+                num_beams (int, optional): Number of beams. Defaults to 1.
+                max_new_tokens (int, optional): Maximum new tokens. Defaults to None.
+                temperature (float, optional): Temperature. Defaults to 1.0.
+                top_k (int, optional): Top k. Defaults to 50.
+                top_p (float, optional): Top p. Defaults to 1.0.
+                no_repeat_ngram_size (int, optional): No repeat ngram size. Defaults to 0.
+                length_penalty (float, optional): Length penalty. Defaults to 1.0.
+                num_return_sequences (int, optional): Number of return sequences. Defaults to 1.
+                do_sample (bool, optional): Do sample. Defaults to False.
+                early_stopping (bool, optional): Early stopping. Defaults to False.
+        Returns:
+            torch.Tensor: lang_x with generated tokens appended to it
+        """
+        num_beams = kwargs.pop("num_beams", 1)
+        if num_beams > 1:
+            vision_x = vision_x.repeat_interleave(num_beams, dim=0)
+
+        self.lang_encoder._use_cached_vision_x = True
+        self._encode_vision_x(vision_x=vision_x)
+
+        eos_token_id = kwargs.pop("eos_token_id", self.eoc_token_id)
+        output = self.lang_encoder.generate(
+            input_ids=lang_x,
+            attention_mask=attention_mask,
+            eos_token_id=eos_token_id,
+            num_beams=num_beams,
+            **kwargs,
+        )
+
+        self.lang_encoder.clear_conditioned_layers()
+        self.lang_encoder._use_cached_vision_x = False
+        return output
+
+    def _encode_vision_x(self, vision_x: torch.Tensor):
+        """
+        Compute media tokens from vision input by passing it through vision encoder and conditioning language model.
+        Args:
+            vision_x (torch.Tensor): Vision input
+                shape (B, T_img, F, C, H, W)
+                Images in the same chunk are collated along T_img, and frames are collated along F
+                Currently only F=1 is supported (single-frame videos)
+
+        rearrange code based on https://github.com/dhansmair/flamingo-mini
+        """
+
+        assert vision_x.ndim == 6, "vision_x should be of shape (b, T_img, F, C, H, W)"
+        b, T, F = vision_x.shape[:3]
+        assert F == 1, "Only single frame supported"
+
+        vision_x = rearrange(vision_x, "b T F c h w -> (b T F) c h w")
+        with torch.no_grad():
+            vision_x = self.vision_encoder(vision_x)[1]
+        vision_x = rearrange(vision_x, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
+        vision_x = self.perceiver(vision_x)
+
+        for layer in self.lang_encoder._get_decoder_layers():
+            layer.condition_vis_x(vision_x)
+
+    def wrap_fsdp(self, wrapper_kwargs, device_id):
+        """
+        Manually wraps submodules for FSDP and move other parameters to device_id.
+
+        Why manually wrap?
+        - all parameters within the FSDP wrapper must have the same requires_grad.
+            We have a mix of frozen and unfrozen parameters.
+        - model.vision_encoder.visual needs to be individually wrapped or encode_vision_x errors
+            See: https://github.com/pytorch/pytorch/issues/82461#issuecomment-1269136344
+
+        The rough wrapping structure is:
+        - FlamingoModel
+            - FSDP(FSDP(vision_encoder))
+            - FSDP(FSDP(perceiver))
+            - lang_encoder
+                - FSDP(FSDP(input_embeddings))
+                - FlamingoLayers
+                    - FSDP(FSDP(gated_cross_attn_layer))
+                    - FSDP(FSDP(decoder_layer))
+                - FSDP(FSDP(output_embeddings))
+                - other parameters
+
+        Known issues:
+        - Our FSDP strategy is not compatible with tied embeddings. If the LM embeddings are tied,
+            train with DDP or set the --freeze_lm_embeddings flag to true.
+        - With FSDP + gradient ckpting, one can increase the batch size with seemingly no upper bound.
+            Although the training curves look okay, we found that downstream performance dramatically
+            degrades if the batch size is unreasonably large (e.g., 100 MMC4 batch size for OPT-125M).
+
+        FAQs about our FSDP wrapping strategy:
+        Why double wrap?
+        As of torch==2.0.1, FSDP's _post_forward_hook and _post_backward_hook
+        only free gathered parameters if the module is NOT FSDP root.
+
+        Why unfreeze the decoder_layers?
+        See https://github.com/pytorch/pytorch/issues/95805
+        As of torch==2.0.1, FSDP's _post_backward_hook is only registed if the flat param
+        requires_grad=True. We need the postback to fire to avoid OOM.
+        To effectively freeze the decoder layers, we exclude them from the optimizer.
+
+        What is assumed to be frozen v. unfrozen?
+        We assume that the model is being trained under normal Flamingo settings
+        with these lines being called in factory.py:
+        ```
+        # Freeze all parameters
+        model.requires_grad_(False)
+        assert sum(p.numel() for p in model.parameters() if p.requires_grad) == 0
+
+        # Unfreeze perceiver, gated_cross_attn_layers, and LM input embeddings
+        model.perceiver.requires_grad_(True)
+        model.lang_encoder.gated_cross_attn_layers.requires_grad_(True)
+        [optional] model.lang_encoder.get_input_embeddings().requires_grad_(True)
+        ```
+        """
+        # unfreeze the decoder layers
+        for block in self.lang_encoder.old_decoder_blocks:
+            block.requires_grad_(True)
+
+        # wrap in FSDP
+        with enable_wrap(wrapper_cls=FSDP, **wrapper_kwargs):
+            self.perceiver = wrap(wrap(self.perceiver))
+            self.lang_encoder.old_decoder_blocks = nn.ModuleList(
+                wrap(wrap(block)) for block in self.lang_encoder.old_decoder_blocks
+            )
+            self.lang_encoder.gated_cross_attn_layers = nn.ModuleList(
+                wrap(wrap(layer)) if layer is not None else None for layer in self.lang_encoder.gated_cross_attn_layers
+            )
+            self.lang_encoder.init_flamingo_layers(self._use_gradient_checkpointing)
+            self.lang_encoder.set_input_embeddings(wrap(wrap(self.lang_encoder.get_input_embeddings())))
+            self.lang_encoder.set_output_embeddings(wrap(wrap(self.lang_encoder.get_output_embeddings())))
+            self.vision_encoder = wrap(wrap(self.vision_encoder))  # frozen
+
+        # manually move non-FSDP managed parameters to device_id
+        # these are all in lang_encoder
+        apply_with_stopping_condition(
+            module=self.lang_encoder,
+            apply_fn=lambda m: m.to(device_id),
+            apply_condition=lambda m: len(list(m.children())) == 0,
+            stopping_condition=lambda m: isinstance(m, FSDP),
+        )
+
+        # exclude the original decoder layers from the optimizer
+        for block in self.lang_encoder.old_decoder_blocks:
+            for p in block.parameters():
+                p.exclude_from_optimizer = True
+
+        # set up clip_grad_norm_ function
+        def clip_grad_norm_(max_norm):
+            self.perceiver.clip_grad_norm_(max_norm)
+            for layer in self.lang_encoder.gated_cross_attn_layers:
+                if layer is not None:
+                    layer.clip_grad_norm_(max_norm)
+            self.lang_encoder.get_input_embeddings().clip_grad_norm_(max_norm)
+
+        self.clip_grad_norm_ = clip_grad_norm_
+
+    def _condition_media_locations(self, input_ids: torch.Tensor):
+        """
+        Compute the media token locations from lang_x and condition the language model on these.
+        Args:
+            input_ids (torch.Tensor): Language input
+                shape (B, T_txt)
+        """
+        media_locations = input_ids == self.media_token_id
+
+        for layer in self.lang_encoder._get_decoder_layers():
+            layer.condition_media_locations(media_locations)
+
+    def cache_media(self, input_ids: torch.Tensor, vision_x: torch.Tensor):
+        """
+        Pre-cache a prompt/sequence of images / text for log-likelihood evaluations.
+        All subsequent calls to forward() will generate attending to the LAST
+        image in vision_x.
+        This is not meant to be used to cache things for generate().
+        Args:
+            input_ids (torch.Tensor): Language input
+                shape (B, T_txt)
+            vision_x (torch.Tensor): Vision input
+                shape (B, T_img, F, C, H, W)
+                Images in the same chunk are collated along T_img, and frames are collated along F
+                Currently only F=1 is supported (single-frame videos)
+        """
+        self._encode_vision_x(vision_x=vision_x)
+        self._condition_media_locations(input_ids=input_ids)
+        self.lang_encoder._use_cached_vision_x = True
+
+    def uncache_media(self):
+        """
+        Clear all conditioning.
+        """
+        self.lang_encoder.clear_conditioned_layers()
+        self.lang_encoder._use_cached_vision_x = False
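
Continuing the factory sketch above, a hedged example of Flamingo.generate using the tensor shapes documented in its docstring (vision_x is (B, T_img, F, C, H, W) with F=1). The 224x224 resolution and the prompt text are illustrative assumptions, not values fixed by this diff:

    import torch

    # One image (T_img=1), single frame (F=1); 224x224 is an assumed CLIP input size.
    vision_x = torch.randn(1, 1, 1, 3, 224, 224)
    # The prompt uses the <image> special token that the factory registered.
    lang_inputs = tokenizer(["<image>A photo of"], return_tensors="pt")

    generated = model.generate(
        vision_x=vision_x,
        lang_x=lang_inputs["input_ids"],
        attention_mask=lang_inputs["attention_mask"],
        max_new_tokens=20,
    )
    print(tokenizer.decode(generated[0]))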