PyPI - llama-stack - Versions diffs - 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl - Mend

llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (738) hide show

llama_stack/models/llama/llama4/prompts.py ADDED Viewed

@@ -0,0 +1,279 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import textwrap
+from io import BytesIO
+from pathlib import Path
+from llama_stack.models.llama.llama4.prompt_templates.system_prompts import (
+    PythonListCustomToolGenerator,
+)
+from ..datatypes import RawMediaItem, RawMessage, RawTextItem
+from ..prompt_format import (
+    Llama4UseCase,
+    TextCompletionContent,
+    UseCase,
+)
+THIS_DIR = Path(__file__).parent
+def usecases(base_model: bool = False) -> list[UseCase | str]:
+    """Build the ordered content of the Llama 4 prompt-format documentation.
+
+    The returned list interleaves plain strings (markdown headings and
+    explanatory text) with ``Llama4UseCase`` entries describing example dialogs.
+
+    Args:
+        base_model: When true, a "# Llama 4 Base Model" section with three
+            text-completion examples is inserted before the instruct section.
+
+    Returns:
+        The strings and use-case objects, in the order they were appended.
+
+    Raises:
+        OSError: If one of the three example images under
+            ``THIS_DIR.parent / "resources"`` cannot be opened.
+    """
+    # The example images are read once as raw bytes; each use case below wraps
+    # the bytes in its own BytesIO.
+    with open(THIS_DIR.parent / "resources/small_dog.jpg", "rb") as f:
+        img_small_dog = f.read()
+    with open(THIS_DIR.parent / "resources/dog.jpg", "rb") as f:
+        img_dog = f.read()
+    with open(THIS_DIR.parent / "resources/pasta.jpeg", "rb") as f:
+        img_pasta = f.read()
+    out = []
+    # Introductory sections that are emitted for both base and instruct models.
+    out.extend(
+        [
+            # NOTE(review): the `<|patch|>` bullet below ends with a stray "/" — looks
+            # like a typo for "."; left untouched because it is runtime text.
+            textwrap.dedent(
+                """
+                # Llama 4 - Prompt Formats
+                ## Tokens
+                Here is a list of special tokens that are supported by Llama 4:
+                - `<|begin_of_text|>`: Specifies the start of the prompt
+                - `<|end_of_text|>`: Model will cease to generate more tokens. This token is generated only by the base models.
+                - `<|header_start|>` and `<|header_end|>`: These tokens enclose the role for a particular message. The possible roles are: [system, user and assistant].
+                - `<|eot|>`: End of turn. Represents when the model has determined that it has finished interacting with the user message that initiated its response. This is used in two scenarios:
+                    - at the end of a direct interaction between the model and the user
+                    - at the end of multiple interactions between the model and any available tools
+                    This token signals to the executor that the model has finished generating a response.
+                - `<|image_start|>` and `<|image_end|>`: These tokens enclose the image data in the prompt.
+                - `<|patch|>`: This token represents a piece of the tile/
+                - `<|tile_y_separator|>` and `<|tile_x_separator|>`: These tokens are used to separate the y and x tiles of an image
+                - `<|image|>`: In the new architecture, this token now separates the regular sized image information from a downsized version of it that fits in a single tile. The longer side is used for calculating the scale factor and the rest is padded to fit the tile.
+                """
+            ),
+            # NOTE(review): the `assistant` bullet mentions a `tool` prompt, but only
+            # system/user/assistant are listed as roles — confirm the intended wording.
+            textwrap.dedent(
+                """
+                    There are 3 different roles that are supported by Llama 4
+                    - `system`: Sets the context in which to interact with the AI model. It typically includes rules, guidelines, or necessary information that helps the model respond effectively.
+                    - `user`: Represents the human interacting with the model. It includes the inputs, commands, and questions to the model.
+                    - `assistant`: Represents the response generated by the AI model based on the context provided in the `system`, `tool` and `user` prompts.
+                    """
+            ),
+        ]
+    )
+    # Text-completion examples are only included when documenting the base model.
+    if base_model:
+        out.extend(
+            [
+                "# Llama 4 Base Model",
+                Llama4UseCase(
+                    title="Text completion - Paris information",
+                    description="Text completion for Llama 4 base model uses this format.",
+                    dialogs=[TextCompletionContent(content="The capital of France is Paris")],
+                ),
+                Llama4UseCase(
+                    title="Text completion - The color of the sky",
+                    description="Text completion for Llama 4 base model uses this format.",
+                    dialogs=[
+                        TextCompletionContent(content="The color of the sky is blue but sometimes it can also be")
+                    ],
+                    notes="",
+                ),
+                Llama4UseCase(
+                    title="Text completion - Translation example",
+                    description="Text completion for Llama 4 base model uses this format.",
+                    dialogs=[
+                        TextCompletionContent(
+                            # NOTE(review): "bannana" is misspelled and the continuation lines
+                            # carry source indentation; both are part of the prompt text as shipped.
+                            content="""apple is pomme,
+            bannana is banane,
+            cherry is"""
+                        )
+                    ],
+                    notes="",
+                ),
+            ]
+        )
+    # Instruct-model sections: chat, image prompts, then tool calling.
+    out.extend(
+        [
+            "# Llama 4 Instruct Model",
+            Llama4UseCase(
+                title="Simple User and assistant conversation",
+                description="Here is a regular multi-turn user assistant conversation and how its formatted.",
+                dialogs=[
+                    [
+                        RawMessage(role="system", content="You are a helpful assistant"),
+                        RawMessage(
+                            role="user",
+                            content="Answer who are you in the form of jeopardy?",
+                        ),
+                    ]
+                ],
+                notes="",
+                max_gen_len=512,
+            ),
+            "# Image prompt format",
+            Llama4UseCase(
+                title="Single image prompt format - small image",
+                description="This example passes an image that is smaller than the tile size, to show the tile separator tokens are not needed",
+                dialogs=[
+                    [
+                        RawMessage(
+                            role="user",
+                            content=[
+                                RawMediaItem(data=BytesIO(img_small_dog)),
+                                RawTextItem(text="Describe this image in two sentences"),
+                            ],
+                        )
+                    ]
+                ],
+                notes="""Notice the structure of the image section:
+        ```
+        <|image_start|><|image|><|patch|>...<|patch|><|image_end|>
+        ```
+        This is due to the image being smaller than the tile size.
+        """,
+                max_gen_len=512,
+            ),
+            Llama4UseCase(
+                title="Single image prompt format",
+                description="Here is an example of how to pass an image to the model",
+                dialogs=[
+                    [
+                        RawMessage(
+                            role="user",
+                            content=[
+                                RawMediaItem(data=BytesIO(img_dog)),
+                                RawTextItem(text="Describe this image in two sentences"),
+                            ],
+                        )
+                    ]
+                ],
+                notes="""With a bigger image, the image will include the tile separator tokens. Additionally, the image tag now separates a scaled down version of the image from the regular sized image.
+        ```
+        <|image_start|><|patch|>...<|patch|><|tile_x_separator|><|patch|>...<|patch|><|tile_y_separator|><|patch|>...<|patch|><|image|><|patch|>...<|patch|><|image_end|>
+        ```
+        """,
+                max_gen_len=1024,
+            ),
+            Llama4UseCase(
+                title="Multiple images prompt format",
+                description="Here is an example of how to pass an image to the model",
+                dialogs=[
+                    [
+                        RawMessage(
+                            role="user",
+                            content=[
+                                RawMediaItem(data=BytesIO(img_dog)),
+                                RawMediaItem(data=BytesIO(img_pasta)),
+                                RawTextItem(text="Describe these images in two sentences"),
+                            ],
+                        )
+                    ]
+                ],
+                notes="With multiple images, each one is encapsulated in their corresponding image tags.",
+                max_gen_len=4096,
+            ),
+            "# Tool calling\nWe are continuing the format for zero shot function calling used in previous versions of Llama. All available functions can be provided either in the system message or in the user message.",
+            Llama4UseCase(
+                title="Zero shot function calling - system message",
+                dialogs=[
+                    [
+                        RawMessage(
+                            role="system",
+                            # The system prompt is rendered from the generator's first bundled example.
+                            content=PythonListCustomToolGenerator()
+                            .gen(PythonListCustomToolGenerator().data_examples()[0])
+                            .render(),
+                        ),
+                        RawMessage(
+                            role="user",
+                            content="What is the weather in SF and Seattle?",
+                        ),
+                    ]
+                ],
+                notes=textwrap.dedent(
+                    """
+                - The output supports multiple, and parallel tool calls natively
+                - JSON format for defining the functions in the system prompt is similar to Llama3.1
+                """
+                ),
+            ),
+            Llama4UseCase(
+                title="Zero shot function calling - user message",
+                description=textwrap.dedent(
+                    """
+        Similar to the above example, you can also provide information for all the available tools in the user message.
+        """
+                ),
+                dialogs=[
+                    [
+                        RawMessage(
+                            role="user",
+                            content="""Questions: Can you retrieve the details for the user with the ID 7890, who has black as their special request?
+Here is a list of functions in JSON format that you can invoke:
+[
+    {
+        "name": "get_user_info",
+        "description": "Retrieve details for a specific user by their unique identifier. Note that the provided function is in Python 3 syntax.",
+        "parameters": {
+            "type": "dict",
+            "required": [
+                "user_id"
+            ],
+            "properties": {
+                "user_id": {
+                "type": "integer",
+                "description": "The unique identifier of the user. It is used to fetch the specific user details from the database."
+            },
+            "special": {
+                "type": "string",
+                "description": "Any special information or parameters that need to be considered while fetching user details.",
+                "default": "none"
+                }
+            }
+        }
+    }
+]
+Should you decide to return the function call(s), put them in the format of [func1(params_name=params_value, params_name2=params_value2...), func2(params)]
+You SHOULD NOT include any other text in the response.""",
+                        ),
+                    ]
+                ],
+                notes=textwrap.dedent(
+                    """
+        - The tool call format for the model is the same whether your function calls are provided in the system or user message.
+        """
+                ),
+            ),
+            Llama4UseCase(
+                title="Tool calling with custom formats",
+                description=textwrap.dedent(
+                    """
+                Here is an example of how you could also write custom instructions for model to do zero shot tool calling.
+                In this example, we define a custom tool calling format using the `<function>` tag.
+                """
+                ),
+                dialogs=[
+                    [
+                        RawMessage(
+                            role="user",
+                            # NOTE(review): this prompt ends with `<|eot_id|>`, while the token list
+                            # at the top of this function documents `<|eot|>` — confirm it is intended.
+                            content="""You have access to the following functions:\nUse the function 'trending_songs' to 'Returns the trending songs on a Music site':\n{"name": "trending_songs", "description": "Returns the trending songs on a Music site", "parameters": {"genre": {"description": "The genre of the songs to return", "param_type": "str", "required": false}, "n": {"description": "The number of songs to return", "param_type": "int", "required": true}}}\n\nThink very carefully before calling functions.\nIf you choose to call a function ONLY reply in the following format with no prefix or suffix:\n\n<function=example_function_name>{"example_name": "example_value"}</function>
+Reminder:
+- If looking for real time information use relevant functions before falling back to brave_search
+- Function calls MUST follow the specified format, start with <function= and end with </function>
+- Required parameters MUST be specified
+- Only call one function at a time
+- Put the entire function call reply on one line<|eot_id|>""",
+                        ),
+                        RawMessage(
+                            role="user",
+                            content="Use tools to get latest trending songs",
+                        ),
+                    ]
+                ],
+            ),
+        ]
+    )
+    return out

llama_stack/models/llama/llama4/quantization/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.

llama_stack/models/llama/llama4/quantization/loader.py ADDED Viewed

@@ -0,0 +1,226 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import os
+from collections.abc import Callable
+import torch
+from fairscale.nn.model_parallel.initialize import get_model_parallel_rank
+from torch import Tensor, nn
+from torch.nn import functional as F
+from llama_stack.log import get_logger
+from ...datatypes import QuantizationMode
+from ..model import Transformer, TransformerBlock
+from ..moe import MoE
+log = get_logger(name=__name__, category="models::llama")
+def swiglu_wrapper_no_reduce(
+    self,
+    x: Tensor,
+):
+    from ...quantize_impls import ffn_swiglu
+    return ffn_swiglu(x, self.w1.weight, self.w3.weight, self.w2.weight)
+def experts_batched_swiglu_wrapper(
+    self,
+    x: Tensor,  # (e, g, D)
+    w1: Tensor,  # (e, D, F)
+    w3: Tensor,  # (e, D, F)
+    w2: Tensor,  # (e, F, D)
+) -> torch.Tensor:
+    from ...quantize_impls import bmm_nt
+    middle_out_egF = F.silu(bmm_nt(x, w1)) * bmm_nt(x, w3)  # noqa: N806
+    return bmm_nt(middle_out_egF, w2)
+def convert_to_quantized_model(
+    model: Transformer,
+    checkpoint_dir: str,
+    quantization_mode: str | None = None,
+    fp8_activation_scale_ub: float | None = 1200.0,
+    use_rich_progress: bool = True,
+) -> Transformer:
+    """Quantize the MoE expert weights of ``model`` in place and move the rest to CUDA.
+
+    Only ``TransformerBlock`` modules whose ``feed_forward`` is a ``MoE`` are
+    quantized; in ``fp8_mixed`` mode the first and last layers are also skipped.
+
+    Args:
+        model: Transformer whose modules are modified in place.
+        checkpoint_dir: Directory searched for a per-rank scales file
+            (``int4_scales_{rank}.pt`` or ``fp8_scales_{rank}.pt``).
+        quantization_mode: ``QuantizationMode.int4_mixed`` selects the int4
+            path; any other value (including ``None``) takes the fp8 path.
+        fp8_activation_scale_ub: Forwarded to ``load_fp8`` / ``quantize_fp8``;
+            not used on the int4 path.
+        use_rich_progress: Request a rich progress bar; it is only shown on
+            model-parallel rank 0.
+
+    Returns:
+        The same ``model`` object that was passed in.
+    """
+    # Imported lazily inside the function rather than at module import time.
+    from ...quantize_impls import (
+        Fp8ScaledWeights,
+        Int4ScaledWeights,
+        load_fp8,
+        load_int4,
+        quantize_fp8,
+        quantize_int4,
+    )
+    rank = get_model_parallel_rank()
+    def should_quantize_block(block: nn.Module) -> bool:
+        # Only transformer blocks with an MoE feed-forward are candidates.
+        if not isinstance(block, TransformerBlock):
+            return False
+        is_moe = isinstance(block.feed_forward, MoE)
+        if quantization_mode == QuantizationMode.fp8_mixed:
+            # skip quantization on first and last layers
+            return is_moe and not (block.layer_id == 0 or block.layer_id == (model.n_layers - 1))
+        return is_moe
+    # The rich UI is restricted to rank 0.
+    use_rich_progress = use_rich_progress and rank == 0
+    progress, log_status, update_status = logging_callbacks(use_rich_progress, rank, model, should_quantize_block)
+    # Pick `apply_quantization(key, weight)`: load precomputed scales when the
+    # per-rank scales file exists, otherwise quantize directly from the weight.
+    if quantization_mode == QuantizationMode.int4_mixed:
+        int4_scales_path = os.path.join(checkpoint_dir, f"int4_scales_{rank}.pt")
+        if os.path.isfile(int4_scales_path):
+            log_status(f"Rank {rank}: Loading int4 scales")
+            int4_scales = torch.load(int4_scales_path, weights_only=True)
+            def apply_quantization(key, weight):
+                scale = int4_scales[key]
+                return load_int4(
+                    weight,
+                    scale,
+                    output_device=torch.device("cuda"),
+                )
+        else:
+            log_status(f"Rank {rank}: Quantizing int4 weights from bf16")
+            def apply_quantization(_, weight):
+                return quantize_int4(weight, output_device=torch.device("cuda"))
+    else:
+        # Every mode other than int4_mixed (including None) uses the fp8 path.
+        fp8_scales_path = os.path.join(checkpoint_dir, f"fp8_scales_{rank}.pt")
+        if os.path.isfile(fp8_scales_path):
+            log_status(f"Rank {rank}: Loading fp8 scales")
+            fp8_scales = torch.load(fp8_scales_path, weights_only=True)
+            def apply_quantization(key, weight):
+                scale = fp8_scales[key]
+                return load_fp8(
+                    weight,
+                    scale,
+                    fp8_activation_scale_ub,
+                    output_device=torch.device("cuda"),
+                )
+        else:
+            log_status(f"Rank {rank}: Quantizing fp8 weights from bf16")
+            def apply_quantization(_, weight):
+                return quantize_fp8(weight, fp8_activation_scale_ub, output_device=torch.device("cuda"))
+    processed_blocks = 0
+    try:
+        if use_rich_progress:
+            progress.start()
+        for _, block in model.named_modules():
+            if not should_quantize_block(block):
+                continue
+            update_status(f"Rank {rank} - Layer {block.layer_id}")
+            # Quantize only routed experts, not shared
+            prefix = f"layers.{block.layer_id}.feed_forward"
+            moe = block.feed_forward
+            # Bind the wrapper to this experts instance so it is called as a method.
+            moe.experts.batched_swiglu = experts_batched_swiglu_wrapper.__get__(moe.experts)
+            for key in ("w1", "w3", "w2"):
+                param = getattr(moe.experts, key)
+                update_status(f"Rank {rank} - Layer {block.layer_id} - MoE {key}")
+                # Dims 1 and 2 are swapped and made contiguous before quantizing;
+                # the quantized result replaces the original attribute.
+                setattr(
+                    moe.experts,
+                    key,
+                    apply_quantization(
+                        f"{prefix}.experts.{key}",
+                        param.transpose(1, 2).contiguous(),
+                    ),
+                )
+            if quantization_mode == QuantizationMode.int4_mixed:
+                # Quantize shared experts
+                moe.shared_expert.forward = swiglu_wrapper_no_reduce.__get__(moe.shared_expert)
+                for key in ("w1", "w3", "w2"):
+                    param = getattr(moe.shared_expert, key)
+                    update_status(f"Rank {rank} - Layer {block.layer_id} - MoE shared expert {key}")
+                    param.weight = apply_quantization(f"{prefix}.shared_expert.{key}", param.weight)
+            processed_blocks += 1
+            update_status(message=None, completed=processed_blocks)
+        update_status(f"Rank {rank} - Moving parameters to CUDA")
+        param_count = 0
+        # Parameters that are already quantized weight objects are left alone;
+        # every other parameter has its data moved to CUDA.
+        for _, parameter in model.named_parameters():
+            if not isinstance(parameter, Fp8ScaledWeights) and not isinstance(parameter, Int4ScaledWeights):
+                parameter.data = parameter.to(device="cuda")
+                param_count += 1
+        update_status(f"Rank {rank} - Completed - moved {param_count} parameters to CUDA")
+    finally:
+        # Always stop the progress display, even if quantization raised.
+        if use_rich_progress:
+            progress.stop()
+    return model
+# fp8/int4 loading can be very slow so we add progress bars to make life slightly better
+def logging_callbacks(
+    use_rich_progress: bool,
+    rank: int,
+    model: Transformer,
+    should_quantize_block: Callable[[nn.Module], bool],
+):
+    console = None
+    if use_rich_progress:
+        from rich.console import Console
+        console = Console(highlight=False)
+    def log_status(message: str) -> None:
+        if use_rich_progress:
+            console.print(message)
+        elif rank == 0:  # Only log from rank 0 for non-rich logging
+            log.info(message)
+    total_blocks = sum(1 for _, block in model.named_modules() if should_quantize_block(block))
+    progress = None
+    if use_rich_progress:
+        from rich.progress import (
+            BarColumn,
+            Progress,
+            SpinnerColumn,
+            TextColumn,
+            TimeElapsedColumn,
+            TimeRemainingColumn,
+        )
+        progress = Progress(
+            SpinnerColumn(),
+            BarColumn(complete_style="green", finished_style="bright_green"),
+            TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+            TimeElapsedColumn(),
+            TextColumn("ETA:"),
+            TimeRemainingColumn(),
+            TextColumn("[bold]{task.fields[status]}"),
+            console=console,
+            expand=True,
+        )
+        task_id = progress.add_task("[blue]Converting layers...", total=total_blocks, status="Starting")
+    def update_status(message: str | None, completed: int | None = None) -> None:
+        if use_rich_progress:
+            if message is not None:
+                progress.update(task_id, status=message)
+            if completed is not None:
+                progress.update(task_id, completed=completed)
+        elif rank == 0 and completed and completed % 10 == 0:
+            log.info(f"Rank {rank}: {completed}/{total_blocks} blocks completed")
+    return progress, log_status, update_status

llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl

llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl