PyPI - llama-stack - Versions diffs - 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl - Mend

llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (738) hide show

llama_stack/models/llama/datatypes.py ADDED Viewed

@@ -0,0 +1,164 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import base64
+from enum import Enum, StrEnum
+from io import BytesIO
+from typing import Annotated, Any, Literal
+from pydantic import BaseModel, ConfigDict, Field, field_serializer, field_validator
+# The goal is for this set of types to be relevant for all Llama models.
+# That isn't the current state yet -- e.g., BuiltinTool is somewhat specific to
+# the llama3 series of models.
+class Role(Enum):
+    """Speaker roles a chat message can carry; each member's value equals its name."""
+    system = "system"
+    user = "user"
+    assistant = "assistant"
+    tool = "tool"
+class BuiltinTool(Enum):
+    """Names of the tools treated as built in, as opposed to tools identified by a free-form string."""
+    brave_search = "brave_search"
+    wolfram_alpha = "wolfram_alpha"
+    photogen = "photogen"
+    code_interpreter = "code_interpreter"
+# Scalar values (including None).
+Primitive = str | int | float | bool | None
+# NOTE(review): despite the name, this alias is not recursive -- it admits a primitive,
+# a flat list of primitives, or a flat str-keyed dict of primitives, and nothing nested deeper.
+RecursiveType = Primitive | list[Primitive] | dict[str, Primitive]
+class ToolCall(BaseModel):
+    """One tool invocation: an id, the tool's name, and its arguments as a string.
+    `tool_name` is coerced to a `BuiltinTool` member when the supplied string is the
+    value of one; any other string is kept as given.
+    """
+    call_id: str
+    tool_name: BuiltinTool | str
+    arguments: str
+    @field_validator("tool_name", mode="before")
+    @classmethod
+    def validate_field(cls, v):
+        # Anything that is not a string is handed on untouched.
+        if not isinstance(v, str):
+            return v
+        try:
+            builtin = BuiltinTool(v)
+        except ValueError:
+            # Not the value of a built-in tool: keep the custom name.
+            return v
+        return builtin
+class ToolPromptFormat(Enum):
+    """Prompt format for calling custom / zero shot tools.
+    :cvar json: JSON format for calling tools. It takes the form:
+        {
+            "type": "function",
+            "function" : {
+                "name": "function_name",
+                "description": "function_description",
+                "parameters": {...}
+            }
+        }
+    :cvar function_tag: Function tag format, pseudo-XML. This looks like:
+        <function=function_name>(parameters)</function>
+    :cvar python_list: Python list. The output is a valid Python expression that can be
+        evaluated to a list. Each element in the list is a function call. Example:
+        ["function_name(param1, param2)", "function_name(param1, param2)"]
+    """
+    # Each member's value is the same string as its name.
+    json = "json"
+    function_tag = "function_tag"
+    python_list = "python_list"
+class StopReason(Enum):
+    """Reasons a generated message can end."""
+    # NOTE(review): the per-member meanings below are inferred from the names -- confirm.
+    end_of_turn = "end_of_turn"  # presumably: the model finished its turn
+    end_of_message = "end_of_message"  # presumably: message ended but the turn continues
+    out_of_tokens = "out_of_tokens"  # presumably: the token budget ran out first
+class ToolDefinition(BaseModel):
+    """Description of a tool: its name, an optional description, and optional schemas.
+    `tool_name` is coerced to a `BuiltinTool` member when the supplied string is the
+    value of one; any other string is kept as given.
+    """
+    tool_name: BuiltinTool | str
+    description: str | None = None
+    input_schema: dict[str, Any] | None = None
+    output_schema: dict[str, Any] | None = None
+    @field_validator("tool_name", mode="before")
+    @classmethod
+    def validate_field(cls, v):
+        # Anything that is not a string is handed on untouched.
+        if not isinstance(v, str):
+            return v
+        try:
+            builtin = BuiltinTool(v)
+        except ValueError:
+            # Not the value of a built-in tool: keep the custom name.
+            return v
+        return builtin
+class RawMediaItem(BaseModel):
+    """An image content item whose payload is raw bytes or an in-memory byte stream.
+    When serialized, `data` is emitted as a base64 string; when validated, a string
+    input is treated as base64 and decoded back to bytes.
+    """
+    type: Literal["image"] = "image"
+    data: bytes | BytesIO
+    # BytesIO is not a type pydantic knows natively, hence arbitrary_types_allowed.
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    @field_serializer("data")
+    def serialize_data(self, data: bytes | BytesIO | None, _info):
+        """Return `data` base64-encoded as a UTF-8 string (None passes through)."""
+        if data is None:
+            return None
+        # base64.b64encode needs a bytes-like object and a BytesIO is not one, so take
+        # the stream's full contents; getvalue() ignores the current read position.
+        if isinstance(data, BytesIO):
+            data = data.getvalue()
+        return base64.b64encode(data).decode("utf-8")
+    @field_validator("data", mode="before")
+    @classmethod
+    def validate_data(cls, v):
+        """Decode a base64 string into bytes; leave any other input for pydantic to validate."""
+        if isinstance(v, str):
+            return base64.b64decode(v)
+        return v
+class RawTextItem(BaseModel):
+    """A text content item; `type` is fixed to "text" and serves as the union discriminator."""
+    type: Literal["text"] = "text"
+    text: str
+# A single content item, told apart by its `type` field ("text" or "image").
+RawContentItem = Annotated[RawTextItem | RawMediaItem, Field(discriminator="type")]
+# Content is a bare string, one content item, or a list of content items.
+RawContent = str | RawContentItem | list[RawContentItem]
+class RawMessage(BaseModel):
+    """A chat message: a role, its content, optional context, and assistant output fields."""
+    role: Literal["user"] | Literal["system"] | Literal["tool"] | Literal["assistant"]
+    content: RawContent
+    # This is for RAG but likely should be absorbed into content
+    context: RawContent | None = None
+    # These are for the output message coming from the assistant
+    stop_reason: StopReason | None = None
+    # default_factory gives every message its own empty list.
+    tool_calls: list[ToolCall] = Field(default_factory=list)
+class GenerationResult(BaseModel):
+    """Record for one token of one batch item during batched generation."""
+    # presumably the token id and its decoded text -- TODO confirm against the producer
+    token: int
+    text: str
+    logprobs: list[float] | None = None
+    # whether this record belongs to the "input" or the "output" side
+    source: Literal["input"] | Literal["output"]
+    # index within the batch
+    batch_idx: int
+    # whether generation for this item is already finished. note that tokens can
+    # get returned even afterwards since other items in the batch can still be generating tokens
+    finished: bool
+    # because a batch is parallel processed, useful decoding for one item can correspond to processing
+    # pad tokens or tokens beyond EOS for other items. we could have decided to return None for this case
+    # but it's more convenient to return a list of GenerationResult and filter out the ignored tokens
+    ignore_token: bool
+class QuantizationMode(StrEnum):
+    """Quantization mode selector; being a StrEnum, members are also plain strings."""
+    none = "none"
+    fp8_mixed = "fp8_mixed"
+    int4_mixed = "int4_mixed"

llama_stack/models/llama/hadamard_utils.py ADDED Viewed

@@ -0,0 +1,86 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import math
+import re
+import torch
+from torch import nn
+def hadamard_transform(x: torch.Tensor) -> torch.Tensor:
+    """Apply the Hadamard transform along the last dimension of `x`.
+    The last dimension must have a power-of-two size n. The transform is computed in
+    log2(n) passes; each pass replaces adjacent pairs by their sum and difference and
+    scales the result by 1/sqrt(2). The output has the same shape as the input.
+    """
+    *_, size = x.shape
+    num_passes = int(math.log2(size))
+    assert size == 1 << num_passes, "n must be a power of 2"
+    scale = 0.5**0.5
+    # Add a trailing axis of length 1: each pass halves axis -2 and doubles axis -1.
+    out = x[..., None]
+    for _ in range(num_passes):
+        even = out[..., ::2, :]
+        odd = out[..., 1::2, :]
+        out = torch.cat((even + odd, even - odd), dim=-1)
+        out *= scale
+    # Axis -2 has shrunk to length 1; dropping it restores the input shape.
+    return out.squeeze(-2)
+class HadamardModule(torch.nn.Module):
+    """Module that applies `hadamard_transform` to groups of the input's last dimension.
+    Args:
+        group_size: Length of each group of the last dimension that is transformed
+            on its own. When it equals the last dimension, the input is transformed as is.
+    """
+    def __init__(self, group_size: int) -> None:
+        super().__init__()
+        self.group_size = group_size
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        last_dim = x.shape[-1]
+        if self.group_size == last_dim:
+            # The whole last dimension is a single group; no regrouping needed.
+            return hadamard_transform(x)
+        # NOTE(review): nothing here checks that the last dimension is a multiple of group_size.
+        input_shape = x.shape
+        grouped = x.reshape(-1, last_dim // self.group_size, self.group_size)
+        return hadamard_transform(grouped).reshape(input_shape)
+def add_hadamard_transform_for_spinquant(model: torch.nn.Module, prefix: str = "") -> None:
+    """
+    Adds a Hadamard transform to the last linear layer of each feedforward network (FFN) in the model.
+    This function recursively traverses the model's children and looks for layers whose dotted name
+    contains "layers.<digits>.feed_forward.w2", where <digits> is one or more digits. When such a layer
+    is found, it is replaced in place with a sequential module that consists of a HadamardModule
+    followed by the original layer. The HadamardModule's group size is the layer's `in_features`.
+    See `SpinQuant <https://arxiv.org/abs/2405.16406>_` paper for more details.
+    Args:
+        model: An instance of 'torch.nn.Module' (e.g., Transformer model).
+        prefix: Dotted name of `model` within the root module; prepended to each child's name.
+    Returns:
+        None
+    """
+    # Dots are escaped so they match only the literal separator between module names;
+    # unescaped, "layersX1.feed_forward.w2" would also have matched.
+    pattern_last_linear_ffn = r"layers\.\d+\.feed_forward\.w2"
+    for module_name, module in model.named_children():
+        # One name is used for both the match and the recursion, so they cannot drift apart.
+        child_full_name = f"{prefix}.{module_name}" if prefix else module_name
+        if re.search(pattern_last_linear_ffn, child_full_name):
+            new_module = nn.Sequential(HadamardModule(group_size=module.in_features), module)
+            setattr(model, module_name, new_module)
+        else:
+            add_hadamard_transform_for_spinquant(module, child_full_name)

llama_stack/models/llama/llama3/args.py ADDED Viewed

@@ -0,0 +1,74 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from dataclasses import dataclass
+from enum import Enum
+class QuantizationScheme(Enum):
+    """Supported quantization schemes; currently a single member whose value equals its name."""
+    int4_weight_int8_dynamic_activation = "int4_weight_int8_dynamic_activation"
+@dataclass
+class QuantizationArgs:
+    """Quantization settings populated from loosely-typed keyword arguments.
+    The explicit __init__ below takes the place of the dataclass-generated one. It
+    converts `scheme` to a `QuantizationScheme`, copies any other keyword that names an
+    existing attribute, and silently ignores the rest. Fields that are not passed keep
+    their class-level defaults.
+    Raises:
+        ValueError: if `scheme` is neither None nor a valid `QuantizationScheme` value.
+    """
+    scheme: QuantizationScheme | None = None
+    group_size: int | None = None
+    spinquant: bool = False
+    def __init__(self, **kwargs):
+        for k, v in kwargs.items():
+            if k == "scheme":
+                # The field is Optional, so an explicit None means "no scheme" rather
+                # than an invalid enum value.
+                setattr(self, k, None if v is None else QuantizationScheme(v))
+            elif hasattr(self, k):
+                setattr(self, k, v)
+@dataclass
+class LoRAArgs:
+    """LoRA settings; both fields are required (no defaults)."""
+    rank: int
+    scale: float
+@dataclass
+class ModelArgs:
+    """Model hyperparameters populated from loosely-typed keyword arguments.
+    The explicit __init__ below takes the place of the dataclass-generated one. It
+    builds `lora_args` / `quantization_args` from mappings, copies any other keyword
+    that names an existing attribute, and silently ignores the rest. Fields that are
+    not passed keep their class-level defaults. `n_kv_heads` falls back to `n_heads`
+    when left as None.
+    Raises:
+        AssertionError: if the head counts and `dim` are not mutually consistent.
+    """
+    dim: int = 4096
+    n_layers: int = 32
+    n_heads: int = 32
+    n_kv_heads: int | None = None
+    vocab_size: int = -1
+    multiple_of: int = 256  # make SwiGLU hidden layer size multiple of large power of 2
+    ffn_dim_multiplier: float | None = None
+    norm_eps: float = 1e-5
+    rope_theta: float = 500000
+    use_scaled_rope: bool = False
+    max_batch_size: int = 32
+    max_seq_len: int = 2048
+    # vision model params
+    vision_chunk_size: int = -1  # image resolution for image models
+    vision_max_num_chunks: int = 4
+    vision_num_cross_attention_layers: int = -1
+    quantization_args: QuantizationArgs | None = None
+    lora_args: LoRAArgs | None = None
+    def __init__(self, **kwargs):
+        for k, v in kwargs.items():
+            if k == "lora_args":
+                # Both nested fields are Optional: accept None or a ready-made instance
+                # as well as a mapping of constructor arguments.
+                setattr(self, k, v if v is None or isinstance(v, LoRAArgs) else LoRAArgs(**v))
+            elif k == "quantization_args":
+                setattr(self, k, v if v is None or isinstance(v, QuantizationArgs) else QuantizationArgs(**v))
+            elif hasattr(self, k):
+                setattr(self, k, v)
+        if self.n_kv_heads is None:
+            self.n_kv_heads = self.n_heads
+        assert self.n_kv_heads <= self.n_heads, "n_kv_heads must not exceed n_heads"
+        assert self.n_heads % self.n_kv_heads == 0, "n_heads must be a multiple of n_kv_heads"
+        assert self.dim % self.n_heads == 0, "dim must be a multiple of n_heads"

llama_stack/models/llama/llama3/chat_format.py ADDED Viewed

@@ -0,0 +1,286 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import io
+import json
+import uuid
+from dataclasses import dataclass
+from typing import Any
+from PIL import Image as PIL_Image
+from ..datatypes import (
+    BuiltinTool,
+    RawContent,
+    RawMediaItem,
+    RawMessage,
+    RawTextItem,
+    Role,
+    StopReason,
+    ToolCall,
+    ToolPromptFormat,
+)
+from .tokenizer import Tokenizer
+from .tool_utils import ToolUtils
+@dataclass
+class VisionInput:
+    """Images of a prompt together with their token-range masks."""
+    # Pairs of token positions, one pair per image token, as built by create_vision_mask.
+    mask: list[list[int]]
+    images: list[PIL_Image.Image]
+@dataclass
+class LLMInput:
+    """Encoded model input: token ids plus optional vision data."""
+    tokens: list[int]
+    # None when the encoded content carried no images.
+    vision: VisionInput | None = None
+def role_str(role: Role) -> str:
+    """Return the name written into a message header for `role`.
+    Every role is written as its own value except the tool role, which is written
+    as "ipython".
+    """
+    header_names = {member: member.value for member in Role}
+    header_names[Role.tool] = "ipython"  # special
+    return header_names[role]
+class ChatFormat:
+    """Converts between raw messages/content and Llama 3 token sequences.
+    Encoding turns `RawContent` / `RawMessage` objects into token ids plus any decoded
+    images. Decoding turns assistant output text back into a `RawMessage`, extracting
+    at most one tool call.
+    """
+    # Header text for each role, as it appears in decoded text.
+    possible_headers: dict[Role, str]
+    def __init__(self, tokenizer: Tokenizer):
+        self.tokenizer = tokenizer
+        self.possible_headers = {role: f"<|start_header_id|>{role_str(role)}<|end_header_id|>\n\n" for role in Role}
+        # Id of the image placeholder token, looked up once for reuse.
+        self.vision_token = self.tokenizer.special_tokens["<|image|>"]
+    def _encode_header(self, role: str) -> list[int]:
+        """Return the tokens of a message header: start marker, role name, end marker, blank line."""
+        tokens = []
+        tokens.append(self.tokenizer.special_tokens["<|start_header_id|>"])
+        # The "tool" role is written as "ipython" in the header.
+        tokens.extend(self.tokenizer.encode("ipython" if role == "tool" else role, bos=False, eos=False))
+        tokens.append(self.tokenizer.special_tokens["<|end_header_id|>"])
+        tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False))
+        return tokens
+    def encode_content(self, content: RawContent) -> LLMInput:
+        """Encode standalone content (no message header), requesting a beginning-of-text token."""
+        tokens, images = self._encode_content(content, bos=True)
+        return self._model_input_from_tokens_images(tokens, images)
+    def _encode_content(self, content: RawContent, bos: bool = False) -> tuple[list[int], list[PIL_Image.Image]]:
+        """Encode a string, a content item, or a list of items into tokens and images.
+        When `bos` is true, a beginning-of-text token is emitted once, before the first
+        text or image item. Each image item contributes one image placeholder token and
+        one RGB-converted PIL image.
+        """
+        tokens = []
+        images = []
+        added_bos = False
+        def _process(c):
+            # Both flags are shared with the enclosing call so BOS is emitted only once.
+            nonlocal added_bos, bos
+            if isinstance(c, str) or isinstance(c, RawTextItem):
+                if isinstance(c, RawTextItem):
+                    c = c.text
+                tokens.extend(self.tokenizer.encode(c, bos=False if added_bos else bos, eos=False))
+                added_bos = True
+            elif isinstance(c, RawMediaItem):
+                # Note: this rebinds the enclosing `bos`, not a local copy.
+                bos = False if added_bos else bos
+                if bos:
+                    tokens.append(self.tokenizer.special_tokens["<|begin_of_text|>"])
+                    added_bos = True
+                tokens.append(self.vision_token)
+                bytes_io = io.BytesIO(c.data) if isinstance(c.data, bytes) else c.data
+                image = PIL_Image.open(bytes_io)
+                image = image.convert("RGB")
+                images.append(image)
+        if isinstance(content, list):
+            for c in content:
+                _process(c)
+        else:
+            _process(content)
+        return tokens, images
+    def encode_message(
+        self, message: RawMessage, tool_prompt_format: ToolPromptFormat
+    ) -> tuple[list[int], list[PIL_Image.Image]]:
+        """Encode one message: header, content, optional context and tool calls, end marker.
+        The end marker is <|eom_id|> for an assistant message whose stop reason is
+        end_of_message, and <|eot_id|> otherwise.
+        """
+        tokens = self._encode_header(message.role)
+        images = []
+        def _process_content(c):
+            # Append the encoding of `c` to this message's running token and image lists.
+            toks, imgs = self._encode_content(c)
+            tokens.extend(toks)
+            images.extend(imgs)
+        # Only the first tool call decides whether the python tag is emitted.
+        if (
+            message.role == "assistant"
+            and len(message.tool_calls) > 0
+            and message.tool_calls[0].tool_name == BuiltinTool.code_interpreter
+        ):
+            tokens.append(self.tokenizer.special_tokens["<|python_tag|>"])
+        _process_content(message.content)
+        if message.role == "user" and message.context is not None:
+            # This is RAG context; why is it here in the chat format? I don't think
+            # this is needed and can be moved upwards
+            _process_content("\n\n")
+            _process_content(message.context)
+        if message.role == "assistant":
+            for t in message.tool_calls:
+                content = ToolUtils.encode_tool_call(t, tool_prompt_format)
+                _process_content(content)
+        eom = False
+        if message.role == "assistant":
+            eom = message.stop_reason == StopReason.end_of_message
+        tokens.append(self.tokenizer.special_tokens["<|eom_id|>" if eom else "<|eot_id|>"])
+        return tokens, images
+    def encode_dialog_prompt(
+        self,
+        messages: list[RawMessage],
+        tool_prompt_format: ToolPromptFormat | None = None,
+    ) -> LLMInput:
+        """Encode a whole dialog, ending with an open assistant header for the model to complete.
+        `tool_prompt_format` defaults to `ToolPromptFormat.json` when not given.
+        """
+        tool_prompt_format = tool_prompt_format or ToolPromptFormat.json
+        tokens = []
+        images = []
+        tokens.append(self.tokenizer.special_tokens["<|begin_of_text|>"])
+        for message in messages:
+            toks, imgs = self.encode_message(message, tool_prompt_format)
+            tokens.extend(toks)
+            images.extend(imgs)
+        # Add the start of an assistant message for the model to complete.
+        tokens.extend(self._encode_header("assistant"))
+        return self._model_input_from_tokens_images(tokens, images)
+    # TODO(this should be generic, not only for assistant messages)
+    def decode_assistant_message(self, tokens: list[int], stop_reason: StopReason) -> RawMessage:
+        """Decode generated token ids to text and parse that text as an assistant message."""
+        content = self.tokenizer.decode(tokens)
+        return self.decode_assistant_message_from_content(content, stop_reason)
+    def decode_assistant_message_from_content(self, content: str, stop_reason: StopReason) -> RawMessage:
+        """Parse assistant output text into a `RawMessage`.
+        Strips surrounding spaces, a leading assistant header, a leading <|python_tag|>
+        and a trailing <|eot_id|> / <|eom_id|>. A trailing end marker overrides the
+        passed-in `stop_reason`. If a tool call is recognized, the returned message has
+        empty content and exactly one `ToolCall` with a freshly generated id.
+        """
+        content = content.strip(" ")
+        header_str = self.possible_headers[Role.assistant]
+        if content.startswith(header_str):
+            content = content[len(header_str) :]
+        ipython = content.startswith("<|python_tag|>")
+        if ipython:
+            content = content[len("<|python_tag|>") :]
+        if content.endswith("<|eot_id|>"):
+            content = content[: -len("<|eot_id|>")]
+            stop_reason = StopReason.end_of_turn
+        elif content.endswith("<|eom_id|>"):
+            content = content[: -len("<|eom_id|>")]
+            stop_reason = StopReason.end_of_message
+        tool_name: str | BuiltinTool | None = None
+        tool_arguments: dict[str, Any] = {}
+        # The custom-tool format is tried first; the builtin format only if that finds nothing.
+        custom_tool_info = ToolUtils.maybe_extract_custom_tool_call(content)
+        if custom_tool_info is not None:
+            # Type guard: ensure custom_tool_info is a tuple of correct types
+            if isinstance(custom_tool_info, tuple) and len(custom_tool_info) == 2:
+                extracted_tool_name, extracted_tool_arguments = custom_tool_info
+                # Handle both dict and str return types from the function
+                if isinstance(extracted_tool_arguments, dict):
+                    tool_name, tool_arguments = extracted_tool_name, extracted_tool_arguments
+                else:
+                    # If it's a string, treat it as a query parameter
+                    tool_name, tool_arguments = extracted_tool_name, {"query": extracted_tool_arguments}
+            else:
+                tool_name, tool_arguments = None, {}
+            # Sometimes when agent has custom tools alongside builtin tools
+            # Agent responds for builtin tool calls in the format of the custom tools
+            # This code tries to handle that case
+            if tool_name is not None and tool_name in BuiltinTool.__members__:
+                tool_name = BuiltinTool[tool_name]
+                if isinstance(tool_arguments, dict):
+                    # NOTE(review): an empty arguments dict raises IndexError here; only the
+                    # first argument value is kept and it is renamed to "query".
+                    tool_arguments = {
+                        "query": list(tool_arguments.values())[0],
+                    }
+        else:
+            builtin_tool_info = ToolUtils.maybe_extract_builtin_tool_call(content)
+            if builtin_tool_info is not None:
+                tool_name, query = builtin_tool_info
+                tool_arguments = {
+                    "query": query,
+                }
+                if tool_name in BuiltinTool.__members__:
+                    tool_name = BuiltinTool[tool_name]
+            elif ipython:
+                # Text after <|python_tag|> with no recognizable call is treated as code to run.
+                tool_name = BuiltinTool.code_interpreter
+                tool_arguments = {
+                    "code": content,
+                }
+        tool_calls = []
+        # NOTE(review): tool_arguments is only ever assigned dicts above, so the
+        # `is not None` part of this check is always true.
+        if tool_name is not None and tool_arguments is not None:
+            call_id = str(uuid.uuid4())
+            tool_calls.append(
+                ToolCall(
+                    call_id=call_id,
+                    tool_name=tool_name,
+                    arguments=json.dumps(tool_arguments),
+                )
+            )
+            # The text was consumed as a tool call, so no free-form content remains.
+            content = ""
+        return RawMessage(
+            role="assistant",
+            content=content,
+            stop_reason=stop_reason,
+            tool_calls=tool_calls,
+        )
+    def _model_input_from_tokens_images(self, tokens: list[int], images: list[PIL_Image.Image]) -> LLMInput:
+        """Bundle tokens and images into an `LLMInput`, building vision masks when images exist."""
+        vision_input = None
+        if len(images) > 0:
+            # The mask is computed on the original tokens, before the id rewrite below.
+            vision_input = VisionInput(
+                mask=create_vision_mask(tokens, self.vision_token),
+                images=images,
+            )
+        return LLMInput(
+            # NOTE(review): 128256 is a hard-coded id substituted for the image token --
+            # presumably the id the model expects for images; confirm and name the constant.
+            tokens=[128256 if token == self.vision_token else token for token in tokens],
+            vision=vision_input,
+        )
+def create_vision_mask(
+    tokens: list[int],
+    vision_token: int,
+) -> list[list[int]]:
+    """Build one [start, end] token range per occurrence of `vision_token` in `tokens`.
+    With no occurrence the result is empty. A single occurrence yields [position, -1].
+    Otherwise each range runs from its vision token to the next one, and the last runs
+    to len(tokens). A vision token immediately followed by another takes over the end
+    of the range that follows it, so a run of adjacent vision tokens shares one end.
+    """
+    positions = [idx for idx, tok in enumerate(tokens) if tok == vision_token]
+    if not positions:
+        return []
+    if len(positions) == 1:
+        # only one image present, unmask until end of sequence
+        return [[positions[0], -1]]
+    # Pair every position with the next one; the last is paired with the sequence length.
+    ends = positions[1:] + [len(tokens)]
+    masks = [[start, end] for start, end in zip(positions, ends)]
+    # Walk backwards so the end of a later range can be copied onto an adjacent earlier one.
+    carried_end = len(tokens)
+    for mask in reversed(masks):
+        start, end = mask
+        if end - start == 1:
+            mask[1] = carried_end
+        carried_end = mask[1]
+    return masks

llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl

llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl