PyPI - llama-stack - Versions diffs - 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl - Mend

llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (738) hide show

llama_stack/models/llama/llama4/tokenizer.py ADDED Viewed

@@ -0,0 +1,263 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from collections.abc import Collection, Iterator, Sequence, Set
+from pathlib import Path
+from typing import (
+    Literal,
+    cast,
+)
+import tiktoken
+from llama_stack.log import get_logger
+from llama_stack.models.llama.tokenizer_utils import load_bpe_file
+# The tiktoken tokenizer can handle <=400k chars without
+# pyo3_runtime.PanicException.
+TIKTOKEN_MAX_ENCODE_CHARS = 400_000
+# https://github.com/openai/tiktoken/issues/195
+# Here we iterate over subsequences and split if we exceed the limit
+# of max consecutive non-whitespace or whitespace characters.
+MAX_NO_WHITESPACES_CHARS = 25_000
+# Module-level cache for the shared Tokenizer; populated lazily by Tokenizer.get_instance().
+_INSTANCE = None
+def get_reserved_special_tokens(name, count, start_index=0):
+    """
+    Builds `count` placeholder token strings of the form
+    `<|{name}_reserved_special_token_{i}|>`, with `i` running from
+    `start_index` up to and including `start_index + count - 1`.
+    """
+    return [f"<|{name}_reserved_special_token_{i}|>" for i in range(start_index, start_index + count)]
+# 200005, ..., 200079
+# 14 explicitly named entries followed by 61 generated placeholders (75 entries in total).
+LLAMA4_TEXT_POST_TRAIN_SPECIAL_TOKENS = [
+    "<|header_start|>",
+    "<|header_end|>",
+    "<|eom|>",
+    "<|eot|>",
+    "<|step|>",
+    "<|text_post_train_reserved_special_token_0|>",
+    "<|text_post_train_reserved_special_token_1|>",
+    "<|text_post_train_reserved_special_token_2|>",
+    "<|text_post_train_reserved_special_token_3|>",
+    "<|text_post_train_reserved_special_token_4|>",
+    "<|text_post_train_reserved_special_token_5|>",
+    "<|python_start|>",
+    "<|python_end|>",
+    "<|finetune_right_pad|>",
+] + get_reserved_special_tokens(
+    "text_post_train", 61, 8
+)  # <|text_post_train_reserved_special_token_8|>, ..., <|text_post_train_reserved_special_token_68|>
+# NOTE(review): placeholder indices 6 and 7 never appear in this list (the generated run starts at 8);
+# presumably those slots are the ones now named <|python_start|> / <|python_end|> - confirm.
+# 200080, ..., 201133
+# 13 explicitly named entries followed by 1041 generated placeholders (1054 entries in total).
+LLAMA4_VISION_SPECIAL_TOKENS = [
+    "<|image_start|>",
+    "<|image_end|>",
+    "<|vision_reserved_special_token_0|>",
+    "<|vision_reserved_special_token_1|>",
+    "<|tile_x_separator|>",
+    "<|tile_y_separator|>",
+    "<|vision_reserved_special_token_2|>",
+    "<|vision_reserved_special_token_3|>",
+    "<|vision_reserved_special_token_4|>",
+    "<|vision_reserved_special_token_5|>",
+    "<|image|>",
+    "<|vision_reserved_special_token_6|>",
+    "<|patch|>",
+] + get_reserved_special_tokens(
+    "vision", 1041, 7
+)  # <|vision_reserved_special_token_7|>, ..., <|vision_reserved_special_token_1047|>
+# 201134, ..., 201143
+# 10 entries: eight reserved placeholders plus the thinking start/end markers.
+LLAMA4_REASONING_SPECIAL_TOKENS = [
+    "<|reasoning_reserved_special_token_0|>",
+    "<|reasoning_reserved_special_token_1|>",
+    "<|reasoning_reserved_special_token_2|>",
+    "<|reasoning_reserved_special_token_3|>",
+    "<|reasoning_reserved_special_token_4|>",
+    "<|reasoning_reserved_special_token_5|>",
+    "<|reasoning_reserved_special_token_6|>",
+    "<|reasoning_reserved_special_token_7|>",
+    "<|reasoning_thinking_start|>",
+    "<|reasoning_thinking_end|>",
+]
+# All Llama 4 specific special tokens, concatenated in ID order: text post-train, vision, reasoning.
+LLAMA4_SPECIAL_TOKENS = (
+    LLAMA4_TEXT_POST_TRAIN_SPECIAL_TOKENS + LLAMA4_VISION_SPECIAL_TOKENS + LLAMA4_REASONING_SPECIAL_TOKENS
+)
+# Tokenizer.__init__ places these five before LLAMA4_SPECIAL_TOKENS, so they receive the first
+# special-token IDs after the base vocabulary.
+# NOTE(review): the numeric ID ranges quoted in the comments on the lists above presumably assume a
+# base vocabulary of 200000 entries - confirm against the tokenizer model file.
+BASIC_SPECIAL_TOKENS = [
+    "<|begin_of_text|>",
+    "<|end_of_text|>",
+    "<|fim_prefix|>",
+    "<|fim_middle|>",
+    "<|fim_suffix|>",
+]
+# Module logger, created via the project's get_logger helper.
+logger = get_logger(name=__name__, category="models::llama")
+class Tokenizer:
+    """
+    Tokenizing and encoding/decoding text using the Tiktoken tokenizer.
+    Special tokens are given IDs directly after the base BPE vocabulary, in the order
+    BASIC_SPECIAL_TOKENS, LLAMA4_SPECIAL_TOKENS, then generic `<|reserved_special_token_i|>`
+    fillers, so that exactly `num_reserved_special_tokens` special IDs exist.
+    """
+    # Maps every special-token string to its integer ID; filled in by __init__.
+    special_tokens: dict[str, int]
+    # Total number of special-token IDs; the named tokens are padded with generic fillers up to this count.
+    num_reserved_special_tokens = 2048
+    # Pre-tokenization split regex handed to tiktoken as `pat_str`.
+    O200K_PATTERN = r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+"""  # noqa: E501
+    @classmethod
+    def get_instance(cls):
+        """
+        Returns the shared Tokenizer, creating it on first use from the
+        `tokenizer.model` file located next to this module and caching it
+        in the module-level `_INSTANCE`.
+        """
+        global _INSTANCE
+        if _INSTANCE is None:
+            _INSTANCE = Tokenizer(Path(__file__).parent / "tokenizer.model")
+        return _INSTANCE
+    def __init__(self, model_path: Path):
+        """
+        Initializes the Tokenizer with a Tiktoken model.
+        Args:
+            model_path (Path): The path to the Tiktoken model file.
+        Raises:
+            FileNotFoundError: If `model_path` does not exist.
+            AssertionError: If the named special tokens contain duplicates or
+                exceed `num_reserved_special_tokens`.
+        """
+        if not model_path.exists():
+            raise FileNotFoundError(f"Tokenizer model file not found: {model_path}")
+        mergeable_ranks = load_bpe_file(model_path)
+        num_base_tokens = len(mergeable_ranks)
+        special_tokens = BASIC_SPECIAL_TOKENS + LLAMA4_SPECIAL_TOKENS
+        # Each token string must map to exactly one ID, and all of them must fit in the reserved space.
+        assert len(set(special_tokens)) == len(special_tokens)
+        assert len(special_tokens) <= self.num_reserved_special_tokens
+        # Pad the remaining reserved slots with generic placeholder names.
+        reserved_tokens = [
+            f"<|reserved_special_token_{i}|>" for i in range(self.num_reserved_special_tokens - len(special_tokens))
+        ]
+        special_tokens = special_tokens + reserved_tokens
+        # Special-token IDs start right after the last base-vocabulary rank.
+        self.special_tokens = {token: num_base_tokens + i for i, token in enumerate(special_tokens)}
+        self.model = tiktoken.Encoding(
+            name=model_path.name,
+            pat_str=self.O200K_PATTERN,
+            mergeable_ranks=mergeable_ranks,
+            special_tokens=self.special_tokens,
+        )
+        # Vocabulary size: base tokens plus the full (padded) special-token list.
+        self.n_words: int = num_base_tokens + len(special_tokens)
+        # BOS / EOS token IDs
+        self.bos_id: int = self.special_tokens["<|begin_of_text|>"]
+        self.eos_id: int = self.special_tokens["<|end_of_text|>"]
+        self.pad_id: int = self.special_tokens["<|finetune_right_pad|>"]
+        self.eot_id: int = self.special_tokens["<|eot|>"]
+        self.eom_id: int = self.special_tokens["<|eom|>"]
+        self.thinking_start_id: int = self.special_tokens["<|reasoning_thinking_start|>"]
+        self.thinking_end_id: int = self.special_tokens["<|reasoning_thinking_end|>"]
+        # IDs collected as stop tokens: end-of-text, end-of-message and end-of-turn.
+        self.stop_tokens = [
+            self.eos_id,
+            self.special_tokens["<|eom|>"],
+            self.special_tokens["<|eot|>"],
+        ]
+    def encode(
+        self,
+        s: str,
+        *,
+        bos: bool,
+        eos: bool,
+        allowed_special: Literal["all"] | Set[str] | None = None,
+        disallowed_special: Literal["all"] | Collection[str] = (),
+    ) -> list[int]:
+        """
+        Encodes a string into a list of token IDs.
+        Args:
+            s (str): The input string to be encoded.
+            bos (bool): Whether to prepend the beginning-of-sequence token.
+            eos (bool): Whether to append the end-of-sequence token.
+            allowed_special ("all"|set[str]): allowed special tokens in string
+            disallowed_special ("all"|set[str]): special tokens that raise an error when in string
+        Returns:
+            list[int]: A list of token IDs.
+        Raises:
+            AssertionError: If `s` is not exactly of type `str`.
+        By default, setting disallowed_special=() encodes a string by ignoring
+        special tokens. Specifically:
+        - Setting `disallowed_special` to () will cause all text corresponding
+          to special tokens to be encoded as natural text (instead of raising
+          an error).
+        - Setting `allowed_special` to "all" will treat all text corresponding
+          to special tokens to be encoded as special tokens.
+        """
+        # None means "no special tokens allowed"; tiktoken is given an empty set in that case.
+        if allowed_special is None:
+            allowed_special = set()
+        assert type(s) is str
+        # Lazily cut the input twice: first into windows of at most TIKTOKEN_MAX_ENCODE_CHARS
+        # characters, then each window into pieces whose whitespace / non-whitespace runs
+        # stay within MAX_NO_WHITESPACES_CHARS. Each piece is encoded independently below.
+        substrs = (
+            substr
+            for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS)
+            for substr in self._split_whitespaces_or_nonwhitespaces(
+                s[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
+            )
+        )
+        t: list[int] = []
+        for substr in substrs:
+            t.extend(
+                self.model.encode(
+                    substr,
+                    allowed_special=allowed_special,
+                    disallowed_special=disallowed_special,
+                )
+            )
+        if bos:
+            t.insert(0, self.bos_id)
+        if eos:
+            t.append(self.eos_id)
+        return t
+    def decode(self, t: Sequence[int]) -> str:
+        """
+        Decodes a list of token IDs into a string.
+        Args:
+            t (List[int]): The list of token IDs to be decoded.
+        Returns:
+            str: The decoded string.
+        """
+        # Typecast is safe here. Tiktoken doesn't do anything list-related with the sequence.
+        return self.model.decode(cast(list[int], t))
+    @staticmethod
+    def _split_whitespaces_or_nonwhitespaces(s: str, max_consecutive_slice_len: int) -> Iterator[str]:
+        """
+        Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
+        consecutive whitespaces or consecutive non-whitespaces.
+        The yielded pieces concatenate back to `s`. A piece may span several alternating runs;
+        a cut is only made when a single run grows past the limit. An empty `s` yields one
+        empty string.
+        """
+        # Length of the run (all-whitespace or all-non-whitespace) that ends at the current character.
+        current_slice_len = 0
+        current_slice_is_space = s[0].isspace() if len(s) > 0 else False
+        # Start offset of the piece that will be yielded next.
+        slice_start = 0
+        for i in range(len(s)):
+            is_now_space = s[i].isspace()
+            # XOR is true exactly when the character class flips, i.e. a new run begins.
+            if current_slice_is_space ^ is_now_space:
+                current_slice_len = 1
+                current_slice_is_space = is_now_space
+            else:
+                current_slice_len += 1
+                if current_slice_len > max_consecutive_slice_len:
+                    # Run is too long: emit everything before this character and start a new piece here.
+                    yield s[slice_start:i]
+                    slice_start = i
+                    current_slice_len = 1
+        # Emit the remaining tail (the whole string if no cut was needed).
+        yield s[slice_start:]

llama_stack/models/llama/llama4/vision/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.

llama_stack/models/llama/llama4/vision/embedding.py ADDED Viewed

@@ -0,0 +1,210 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import math
+from collections.abc import Callable
+from typing import Any
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairscale.nn.model_parallel.layers import ColumnParallelLinear, RowParallelLinear
+from ..args import VisionArgs
+from .encoder import VisionEncoder
+class PixelShuffle(nn.Module):
+    """
+    Module wrapper around `pixel_shuffle_op`: lays a flat patch sequence out as a square grid,
+    applies the shuffle with the stored ratio, and flattens the grid back into a sequence.
+    """
+    def __init__(self, ps_ratio):
+        super().__init__()
+        # Ratio forwarded to pixel_shuffle_op on every call.
+        self.ps_ratio = ps_ratio
+    def forward(self, x):
+        # x: [B, N, C], N = number of patches
+        assert self.ps_ratio is not None, "ps_ratio is required for pixel shuffle"
+        assert x.dim() == 3, "pixel shuffle requires encoded patches [B, N, C]"
+        # The patch axis is treated as a square hh x ww grid.
+        # NOTE(review): assumes N is a perfect square - confirm with the encoder's output length.
+        hh = ww = int(math.sqrt(x.shape[1]))
+        x = x.reshape(x.shape[0], hh, ww, -1)
+        x = pixel_shuffle_op(x, ps_ratio=self.ps_ratio)
+        # Collapse the two grid axes back into a single patch axis.
+        pixel_shuffle_patches = x.reshape(x.shape[0], -1, x.shape[-1])
+        return pixel_shuffle_patches
+def pixel_shuffle_op(input_x, ps_ratio):
+    """
+    Rearranges a 4-D tensor of size (n, w, h, c) into one of size
+    (n, int(w * ps_ratio), int(h * ps_ratio), int(c / ps_ratio**2)) using two
+    view + permute steps, trading elements between the channel axis and the two grid axes.
+    """
+    n, w, h, c = input_x.size()
+    # Step 1: rescale the h axis by ps_ratio and the channel axis by 1 / ps_ratio.
+    input_x = input_x.view(n, w, int(h * ps_ratio), int(c / ps_ratio))
+    # Swap the two grid axes so the w axis can be rescaled the same way.
+    input_x = input_x.permute(0, 2, 1, 3).contiguous()
+    # Step 2: rescale the w axis by ps_ratio and the channel axis by another 1 / ps_ratio.
+    input_x = input_x.view(
+        n,
+        int(h * ps_ratio),
+        int(w * ps_ratio),
+        int(c / (ps_ratio * ps_ratio)),
+    )
+    # Swap the grid axes back to their original order.
+    input_x = input_x.permute(0, 2, 1, 3).contiguous()
+    return input_x
+class SimpleMLP(torch.nn.Module):
+    """
+    Two-layer MLP built from fairscale model-parallel linear layers, with the activation
+    applied after each layer and dropout in between.
+    """
+    def __init__(
+        self,
+        dim: int,
+        hidden_dim: int,
+        bias: bool = True,
+        dropout: float = 0.0,
+        act_layer: Callable = nn.GELU,
+    ):
+        super().__init__()
+        # layers
+        self.c_fc = ColumnParallelLinear(
+            dim,
+            hidden_dim,
+            bias=bias,
+            gather_output=False,
+        )
+        # Both size arguments are hidden_dim here, so this layer does not map back to `dim`.
+        self.c_proj = RowParallelLinear(
+            hidden_dim,
+            hidden_dim,
+            bias=bias,
+            input_is_parallel=True,
+        )
+        self.non_linearity = act_layer()
+        # Dropout probability, applied functionally in forward().
+        self.dropout = dropout
+    def forward(self, x):
+        hidden = self.c_fc(x)
+        hidden = self.non_linearity(hidden)
+        # Only active while the module is in training mode.
+        hidden = F.dropout(hidden, p=self.dropout, training=self.training)
+        # The activation is applied a second time, to the output of c_proj.
+        return self.non_linearity(self.c_proj(hidden))
+class PixelShuffleMLP(torch.nn.Module):
+    """
+    Adapter that pixel-shuffles encoded patches, passes them through a SimpleMLP and then
+    through an optional extra linear layer (`fc`, identity unless `add_fc` is set).
+    """
+    def __init__(
+        self,
+        ps_ratio: float,
+        input_dim: int,
+        output_dim: int = 4096,
+        add_fc: bool = False,
+    ):
+        super().__init__()
+        self.pixel_shuffle = PixelShuffle(ps_ratio)
+        # The MLP input width is input_dim divided by ps_ratio squared, matching the channel
+        # count produced by pixel_shuffle_op.
+        self.mlp = SimpleMLP(
+            int(input_dim // (ps_ratio**2)),
+            output_dim,
+            bias=False,
+            dropout=0.0,
+            act_layer=nn.GELU,
+        )
+        # Pass-through by default; replaced by a linear layer only when add_fc is True.
+        self.fc = nn.Identity()
+        if add_fc:
+            self.fc = ColumnParallelLinear(
+                output_dim,
+                output_dim,
+                bias=False,
+            )
+    def forward(self, encoded_patches: torch.Tensor) -> torch.Tensor:
+        encoded_patches = self.pixel_shuffle(encoded_patches)
+        return self.fc(self.mlp(encoded_patches))
+class VisionEmbeddings(torch.nn.Module):
+    """
+    Encodes images with a VisionEncoder, projects the result with a PixelShuffleMLP adapter
+    and scatters the projected patches into a zero tensor shaped after the reference hidden states.
+    """
+    def __init__(self, args: VisionArgs):
+        super().__init__()
+        self.args = args
+        image_size = args.image_size
+        patch_size = args.patch_size
+        self.vision_encoder = VisionEncoder(
+            image_size=(image_size.height, image_size.width),
+            patch_size=(patch_size.height, patch_size.width),
+            dim=args.dim,
+            layers=args.n_layers,
+            heads=args.n_heads,
+            mlp_ratio=args.mlp_ratio,
+        )
+        # The encoder is converted to bfloat16 right after construction.
+        self.vision_encoder = self.vision_encoder.to(torch.bfloat16)
+        self.vision_adapter = PixelShuffleMLP(
+            ps_ratio=args.pixel_shuffle_ratio,
+            input_dim=args.dim,
+            output_dim=args.output_dim,
+        )
+        self.output_dim = args.output_dim
+        # Run load_hook before state dicts are loaded into this module.
+        self._register_load_state_dict_pre_hook(self.load_hook)
+    def load_hook(
+        self,
+        state_dict: dict[str, Any],
+        prefix: str,
+        local_metadata: dict[str, Any],
+        strict: bool = True,
+        missing_keys: list[str] = None,
+        unexpected_keys: list[str] = None,
+        error_msgs: list[str] = None,
+        return_state_dict: bool = False,
+    ) -> None:
+        """
+        State-dict pre-load hook: every entry under `prefix` that is a 1-D tensor of length 0
+        is reshaped, in place in `state_dict`, to the shape this module currently holds for
+        the same key. Only `state_dict` and `prefix` are used; the other parameters are ignored.
+        """
+        original_sd = self.state_dict()
+        for k in state_dict:
+            if k.startswith(prefix) and len(state_dict[k].shape) == 1 and state_dict[k].shape[0] == 0:
+                # Look up the module's own entry by the key with the prefix stripped.
+                state_dict[k] = state_dict[k].reshape(original_sd[k[len(prefix) :]].shape)
+    def _get_empty_sequence(self, h):
+        # Zero tensor with h's first two dimensions, device and dtype, but output_dim as the last dimension.
+        return torch.zeros(
+            h.shape[0],
+            h.shape[1],
+            self.output_dim,
+            device=h.device,
+            dtype=h.dtype,
+        )
+    # image_batch is batched; each batch sample contains a list of images. so this is List[List[torch.Tensor]]
+    # each image is a tensor of shape [num_tiles, C, H, W]
+    def forward(
+        self,
+        image_batch: list[list[torch.Tensor]],
+        image_mask: torch.Tensor,
+        h_ref: torch.Tensor,
+    ) -> torch.Tensor:
+        # Flatten the per-sample lists into one list of image tensors, stack them along the first
+        # axis, insert a new axis at position 1, and match h_ref's dtype and device.
+        images_flattened = [image for sample in image_batch for image in sample]
+        images_flattened = torch.vstack(images_flattened).unsqueeze(1).to(h_ref.dtype).to(h_ref.device)
+        embedding = self.vision_encoder(images_flattened)
+        projected_embedding = self.vision_adapter(embedding)
+        # Zero-initialised destination that scatter_embeddings fills at the masked positions.
+        h_image = self._get_empty_sequence(h_ref)
+        return scatter_embeddings(image_batch, image_mask, h_image, projected_embedding)
+def scatter_embeddings(image_batch, image_mask, h_image, encoded_patches_proj):
+    """
+    Writes projected patch embeddings into `h_image` (modified in place and returned) at the
+    positions selected by `image_mask`, one batch sample at a time.
+    Raises AssertionError if the projections contain NaN, if their first dimension does not
+    equal the total chunk count in `image_batch`, or if a sample's mask selects more positions
+    than that sample has patch rows.
+    """
+    # If dynamic transform is used and the batch contains 2 images (where image_1 has 2 chunks and image_2 has 3 chunks),
+    # `num_images_per_sequence` now records the number of chunks per image as `[2, 3]`.
+    # `encoded_patches_proj.split` will then split the image chunks into 2 groups: `[image_1_chunks, image_2_chunks]`.
+    # As computed below, each entry is the sum of image.size(0) over all images of one sample.
+    num_images_per_sequence = [sum(image.size(0) for image in sample_images) for sample_images in image_batch]
+    assert not torch.isnan(encoded_patches_proj).any()
+    assert sum(num_images_per_sequence) == encoded_patches_proj.size(0), (
+        f"{sum(num_images_per_sequence)=} != {encoded_patches_proj.shape=}"
+    )
+    encoded_patches_list = encoded_patches_proj.split(num_images_per_sequence, dim=0)
+    # NOTE(review): indexes encoded_patches_list by batch position, so image_batch presumably has
+    # one entry per row of h_image - confirm with the caller.
+    for index in range(h_image.size(0)):
+        encoded_patches_per_sample = encoded_patches_list[index]
+        sample_image_mask = image_mask[index]
+        # Nothing to scatter for this sample.
+        if encoded_patches_per_sample.numel() == 0:
+            continue
+        # Flatten all leading dimensions so each row is one patch embedding.
+        encoded_patches_per_sample = encoded_patches_per_sample.contiguous().view(
+            -1, encoded_patches_per_sample.size(-1)
+        )
+        n_tokens_to_fill = sample_image_mask.sum()
+        assert n_tokens_to_fill <= encoded_patches_per_sample.size(0)
+        # Broadcast the mask across the embedding dimension and fill the selected positions with
+        # the first n_tokens_to_fill patch rows.
+        # NOTE(review): expand(-1, D) presumably relies on the mask having a trailing size-1 dimension - confirm.
+        h_image[index].masked_scatter_(
+            sample_image_mask.expand(-1, h_image.size(-1)),
+            encoded_patches_per_sample[:n_tokens_to_fill],
+        )
+    return h_image

llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl

llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl