llama-stack 0.4.4__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (155)
  1. llama_stack/cli/stack/_list_deps.py +11 -7
  2. llama_stack/cli/stack/run.py +3 -25
  3. llama_stack/core/access_control/datatypes.py +78 -0
  4. llama_stack/core/configure.py +2 -2
  5. llama_stack/{distributions/meta-reference-gpu → core/connectors}/__init__.py +3 -1
  6. llama_stack/core/connectors/connectors.py +162 -0
  7. llama_stack/core/conversations/conversations.py +61 -58
  8. llama_stack/core/datatypes.py +54 -8
  9. llama_stack/core/library_client.py +60 -13
  10. llama_stack/core/prompts/prompts.py +43 -42
  11. llama_stack/core/routers/datasets.py +20 -17
  12. llama_stack/core/routers/eval_scoring.py +143 -53
  13. llama_stack/core/routers/inference.py +20 -9
  14. llama_stack/core/routers/safety.py +30 -42
  15. llama_stack/core/routers/vector_io.py +15 -7
  16. llama_stack/core/routing_tables/models.py +42 -3
  17. llama_stack/core/routing_tables/scoring_functions.py +19 -19
  18. llama_stack/core/routing_tables/shields.py +20 -17
  19. llama_stack/core/routing_tables/vector_stores.py +8 -5
  20. llama_stack/core/server/auth.py +192 -17
  21. llama_stack/core/server/fastapi_router_registry.py +40 -5
  22. llama_stack/core/server/server.py +24 -5
  23. llama_stack/core/stack.py +54 -10
  24. llama_stack/core/storage/datatypes.py +9 -0
  25. llama_stack/core/store/registry.py +1 -1
  26. llama_stack/core/utils/exec.py +2 -2
  27. llama_stack/core/utils/type_inspection.py +16 -2
  28. llama_stack/distributions/dell/config.yaml +4 -1
  29. llama_stack/distributions/dell/run-with-safety.yaml +4 -1
  30. llama_stack/distributions/nvidia/config.yaml +4 -1
  31. llama_stack/distributions/nvidia/run-with-safety.yaml +4 -1
  32. llama_stack/distributions/oci/config.yaml +4 -1
  33. llama_stack/distributions/open-benchmark/config.yaml +9 -1
  34. llama_stack/distributions/postgres-demo/config.yaml +1 -1
  35. llama_stack/distributions/starter/build.yaml +62 -0
  36. llama_stack/distributions/starter/config.yaml +22 -3
  37. llama_stack/distributions/starter/run-with-postgres-store.yaml +22 -3
  38. llama_stack/distributions/starter/starter.py +13 -1
  39. llama_stack/distributions/starter-gpu/build.yaml +62 -0
  40. llama_stack/distributions/starter-gpu/config.yaml +22 -3
  41. llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +22 -3
  42. llama_stack/distributions/template.py +10 -2
  43. llama_stack/distributions/watsonx/config.yaml +4 -1
  44. llama_stack/log.py +1 -0
  45. llama_stack/providers/inline/agents/meta_reference/__init__.py +1 -0
  46. llama_stack/providers/inline/agents/meta_reference/agents.py +57 -61
  47. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +49 -51
  48. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +94 -22
  49. llama_stack/providers/inline/agents/meta_reference/responses/types.py +2 -1
  50. llama_stack/providers/inline/agents/meta_reference/responses/utils.py +4 -1
  51. llama_stack/providers/inline/agents/meta_reference/safety.py +2 -2
  52. llama_stack/providers/inline/batches/reference/batches.py +2 -1
  53. llama_stack/providers/inline/eval/meta_reference/eval.py +40 -32
  54. llama_stack/providers/inline/post_training/huggingface/post_training.py +33 -38
  55. llama_stack/providers/inline/post_training/huggingface/utils.py +2 -5
  56. llama_stack/providers/inline/post_training/torchtune/post_training.py +28 -33
  57. llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +2 -4
  58. llama_stack/providers/inline/safety/code_scanner/code_scanner.py +12 -15
  59. llama_stack/providers/inline/safety/llama_guard/llama_guard.py +15 -18
  60. llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +11 -17
  61. llama_stack/providers/inline/scoring/basic/scoring.py +13 -17
  62. llama_stack/providers/inline/scoring/braintrust/braintrust.py +15 -15
  63. llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +13 -17
  64. llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +1 -1
  65. llama_stack/providers/registry/agents.py +1 -0
  66. llama_stack/providers/registry/inference.py +1 -9
  67. llama_stack/providers/registry/vector_io.py +136 -16
  68. llama_stack/providers/remote/eval/nvidia/eval.py +22 -21
  69. llama_stack/providers/remote/files/s3/config.py +5 -3
  70. llama_stack/providers/remote/files/s3/files.py +2 -2
  71. llama_stack/providers/remote/inference/gemini/gemini.py +4 -0
  72. llama_stack/providers/remote/inference/openai/openai.py +2 -0
  73. llama_stack/providers/remote/inference/together/together.py +4 -0
  74. llama_stack/providers/remote/inference/vertexai/config.py +3 -3
  75. llama_stack/providers/remote/inference/vertexai/vertexai.py +5 -2
  76. llama_stack/providers/remote/inference/vllm/config.py +37 -18
  77. llama_stack/providers/remote/inference/vllm/vllm.py +0 -3
  78. llama_stack/providers/remote/inference/watsonx/watsonx.py +4 -0
  79. llama_stack/providers/remote/post_training/nvidia/post_training.py +31 -33
  80. llama_stack/providers/remote/safety/bedrock/bedrock.py +10 -27
  81. llama_stack/providers/remote/safety/nvidia/nvidia.py +9 -25
  82. llama_stack/providers/remote/safety/sambanova/sambanova.py +13 -11
  83. llama_stack/providers/remote/vector_io/elasticsearch/__init__.py +17 -0
  84. llama_stack/providers/remote/vector_io/elasticsearch/config.py +32 -0
  85. llama_stack/providers/remote/vector_io/elasticsearch/elasticsearch.py +463 -0
  86. llama_stack/providers/remote/vector_io/oci/__init__.py +22 -0
  87. llama_stack/providers/remote/vector_io/oci/config.py +41 -0
  88. llama_stack/providers/remote/vector_io/oci/oci26ai.py +595 -0
  89. llama_stack/providers/remote/vector_io/pgvector/config.py +69 -2
  90. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +255 -6
  91. llama_stack/providers/remote/vector_io/qdrant/qdrant.py +62 -38
  92. llama_stack/providers/utils/bedrock/client.py +3 -3
  93. llama_stack/providers/utils/bedrock/config.py +7 -7
  94. llama_stack/providers/utils/inference/embedding_mixin.py +4 -0
  95. llama_stack/providers/utils/inference/http_client.py +239 -0
  96. llama_stack/providers/utils/inference/litellm_openai_mixin.py +5 -0
  97. llama_stack/providers/utils/inference/model_registry.py +148 -2
  98. llama_stack/providers/utils/inference/openai_compat.py +2 -1
  99. llama_stack/providers/utils/inference/openai_mixin.py +41 -2
  100. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +92 -5
  101. llama_stack/providers/utils/memory/vector_store.py +46 -19
  102. llama_stack/providers/utils/responses/responses_store.py +7 -7
  103. llama_stack/providers/utils/safety.py +114 -0
  104. llama_stack/providers/utils/tools/mcp.py +44 -3
  105. llama_stack/testing/api_recorder.py +9 -3
  106. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/METADATA +14 -2
  107. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/RECORD +111 -144
  108. llama_stack/distributions/meta-reference-gpu/config.yaml +0 -140
  109. llama_stack/distributions/meta-reference-gpu/doc_template.md +0 -119
  110. llama_stack/distributions/meta-reference-gpu/meta_reference.py +0 -163
  111. llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +0 -155
  112. llama_stack/models/llama/hadamard_utils.py +0 -88
  113. llama_stack/models/llama/llama3/args.py +0 -74
  114. llama_stack/models/llama/llama3/dog.jpg +0 -0
  115. llama_stack/models/llama/llama3/generation.py +0 -378
  116. llama_stack/models/llama/llama3/model.py +0 -304
  117. llama_stack/models/llama/llama3/multimodal/__init__.py +0 -12
  118. llama_stack/models/llama/llama3/multimodal/encoder_utils.py +0 -180
  119. llama_stack/models/llama/llama3/multimodal/image_transform.py +0 -409
  120. llama_stack/models/llama/llama3/multimodal/model.py +0 -1430
  121. llama_stack/models/llama/llama3/multimodal/utils.py +0 -26
  122. llama_stack/models/llama/llama3/pasta.jpeg +0 -0
  123. llama_stack/models/llama/llama3/quantization/__init__.py +0 -5
  124. llama_stack/models/llama/llama3/quantization/loader.py +0 -316
  125. llama_stack/models/llama/llama3_1/__init__.py +0 -12
  126. llama_stack/models/llama/llama3_1/prompt_format.md +0 -358
  127. llama_stack/models/llama/llama3_1/prompts.py +0 -258
  128. llama_stack/models/llama/llama3_2/__init__.py +0 -5
  129. llama_stack/models/llama/llama3_2/prompts_text.py +0 -229
  130. llama_stack/models/llama/llama3_2/prompts_vision.py +0 -126
  131. llama_stack/models/llama/llama3_2/text_prompt_format.md +0 -286
  132. llama_stack/models/llama/llama3_2/vision_prompt_format.md +0 -141
  133. llama_stack/models/llama/llama3_3/__init__.py +0 -5
  134. llama_stack/models/llama/llama3_3/prompts.py +0 -259
  135. llama_stack/models/llama/llama4/args.py +0 -107
  136. llama_stack/models/llama/llama4/ffn.py +0 -58
  137. llama_stack/models/llama/llama4/moe.py +0 -214
  138. llama_stack/models/llama/llama4/preprocess.py +0 -435
  139. llama_stack/models/llama/llama4/quantization/__init__.py +0 -5
  140. llama_stack/models/llama/llama4/quantization/loader.py +0 -226
  141. llama_stack/models/llama/llama4/vision/__init__.py +0 -5
  142. llama_stack/models/llama/llama4/vision/embedding.py +0 -210
  143. llama_stack/models/llama/llama4/vision/encoder.py +0 -412
  144. llama_stack/models/llama/quantize_impls.py +0 -316
  145. llama_stack/providers/inline/inference/meta_reference/__init__.py +0 -20
  146. llama_stack/providers/inline/inference/meta_reference/common.py +0 -24
  147. llama_stack/providers/inline/inference/meta_reference/config.py +0 -68
  148. llama_stack/providers/inline/inference/meta_reference/generators.py +0 -201
  149. llama_stack/providers/inline/inference/meta_reference/inference.py +0 -542
  150. llama_stack/providers/inline/inference/meta_reference/model_parallel.py +0 -77
  151. llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +0 -353
  152. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/WHEEL +0 -0
  153. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/entry_points.txt +0 -0
  154. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
  155. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/top_level.txt +0 -0
llama_stack/models/llama/quantize_impls.py +0 -316
@@ -1,316 +0,0 @@
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the terms described in the LICENSE file in
- # the root directory of this source tree.
-
- # type: ignore
- import collections
-
- from llama_stack.log import get_logger
-
- log = get_logger(name=__name__, category="models::llama")
-
- try:
-     import fbgemm_gpu.experimental.gen_ai  # noqa: F401
-
-     log.info("Using efficient FP8 or INT4 operators in FBGEMM.")
- except ImportError:
-     log.error("No efficient FP8 or INT4 operators. Please install FBGEMM.")
-     raise
-
- import torch
- from torch import Tensor, nn
-
-
- class Fp8ScaledWeights:
-     # TODO: Ugly trick so torch allows us to replace parameters
-     # with our custom Fp8Weights instance. Do this properly.
-     @property
-     def __class__(self) -> type[nn.parameter.Parameter]:
-         return nn.Parameter
-
-     @property
-     def grad_fn(self) -> None:
-         return None
-
-
- # pyre-fixme[4]: Attribute annotation cannot be `Any`.
- # pyre-fixme[2]: Parameter annotation cannot be `Any`.
- class Fp8RowwiseWeights(
-     Fp8ScaledWeights,
-     collections.namedtuple(
-         "Fp8RowwiseWeights",
-         ["weight", "scale", "shape", "activation_scale_ub"],
-     ),
- ):
-     pass
-
-
- class Int4ScaledWeights:
-     # TODO: Ugly trick so torch allows us to replace parameters
-     # with our custom Int4Weights instance. Do this properly.
-     @property
-     def __class__(self) -> type[nn.parameter.Parameter]:
-         return nn.Parameter
-
-     @property
-     def grad_fn(self) -> None:
-         return None
-
-
- # pyre-fixme[4]: Attribute annotation cannot be `Any`.
- # pyre-fixme[2]: Parameter annotation cannot be `Any`.
- class Int4Weights(
-     Int4ScaledWeights,
-     collections.namedtuple(
-         "Int4Weights",
-         ["weight", "scale", "zero_point", "shape"],
-     ),
- ):
-     pass
-
-
- def int4_row_quantize(
-     x: torch.Tensor,
-     group_size: int = 128,
- ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-     n_bit = 4  # Number of target bits.
-     to_quant = x.reshape(-1, group_size).to(torch.float)
-
-     max_val = to_quant.amax(dim=1, keepdim=True)
-     min_val = to_quant.amin(dim=1, keepdim=True)
-     max_int = 2**n_bit - 1
-     min_int = 0
-     scales = (max_val - min_val).clamp(min=1e-6) / max_int
-
-     zeros = min_val + scales * (2 ** (n_bit - 1))
-
-     out = to_quant.sub(min_val).div(scales).round().clamp_(min_int, max_int)
-
-     # Recenter output and move to int8.
-     out = (out - 2 ** (n_bit - 1)).to(dtype=torch.int8).reshape(x.shape)
-
-     # Cutlass expects column major layout for scale and zero point,
-     # so we transpose here and make them contiguous.
-     scales = scales.view(x.shape[0], -1).t().contiguous()
-     zeros = zeros.view(x.shape[0], -1).t().contiguous()
-
-     return out, scales, zeros
-
-
- def pack_int4(x: torch.Tensor) -> torch.Tensor:
-     # Given int8 x, pack adjacent int4 values into a single int8.
-     low_x = x[:, ::2]
-     high_x = x[:, 1::2]
-
-     # High bits need to left shift, this also masks off extra bits.
-     high_x = torch.bitwise_left_shift(high_x, 4)
-     # Low bits need to have sign bits removed.
-     low_x = torch.bitwise_and(low_x, 0xF)
-
-     # Recombine into a single value with bitwise or.
-     return torch.bitwise_or(low_x, high_x).contiguous()
-
-
- def bmm_nt(
-     x: Tensor,
-     w: Fp8RowwiseWeights | Int4Weights,
-     num_tokens: Tensor | None = None,
- ) -> Tensor:
-     if isinstance(w, Fp8ScaledWeights):
-         xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_row(x, num_tokens, w.activation_scale_ub)
-         return torch.ops.fbgemm.f8f8bf16_rowwise_batched(xq, w.weight, x_scale, w.scale)
-     elif isinstance(w, Int4ScaledWeights):
-         return torch.ops.fbgemm.bf16i4bf16_rowwise_batched(x, w.weight, w.scale, w.zero_point)
-     else:
-         raise ValueError("Unsupported quantization type")
-
-
- def ffn_swiglu(
-     x: Tensor,
-     w1: Fp8RowwiseWeights | Int4Weights,
-     w3: Fp8RowwiseWeights | Int4Weights,
-     w2: Fp8RowwiseWeights | Int4Weights,
-     num_tokens: Tensor | None = None,
-     is_memory_bounded: bool = False,
- ) -> Tensor:
-     if (isinstance(w1, Fp8ScaledWeights) and isinstance(w3, Fp8ScaledWeights) and isinstance(w2, Fp8ScaledWeights)) or (
-         isinstance(w1, Int4ScaledWeights) and isinstance(w3, Int4ScaledWeights) and isinstance(w2, Int4ScaledWeights)
-     ):
-         return ffn_swiglu_dynamic(x, w1, w3, w2, w1.activation_scale_ub, num_tokens, is_memory_bounded)
-
-     (B, T, D) = x.shape  # noqa: N806
-     (HD_L, D_) = w1.shape  # noqa: N806
-     assert D_ == D
-
-     assert isinstance(w1, Tensor)
-     assert isinstance(w3, Tensor)
-     x1 = x.view(B * T, D) @ w1.T
-     x2 = x.view(B * T, D) @ w3.T
-     z = torch.nn.functional.silu(x1) * x2
-     del x1, x2
-     assert isinstance(w2, Tensor)
-     return (z @ w2.T).view(B, T, D)
-
-
- @torch.inference_mode()
- def quantize_fp8(
-     w: Tensor,
-     fp8_activation_scale_ub: float,
-     output_device: torch.device | None = None,
- ) -> Fp8RowwiseWeights:
-     """Quantize [n, k] weight tensor.
-
-     Args:
-         w (Tensor): [n, k] input high precision tensor to quantize.
-         fp8_activation_scale_ub (float): Upper bound for activation max.
-     """
-     activation_scale_ub = torch.tensor(
-         [fp8_activation_scale_ub],
-         dtype=torch.float,
-         device=output_device,
-     )
-     wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_row(w)
-     del w
-     return Fp8RowwiseWeights(
-         weight=wq,
-         scale=w_scale,
-         shape=wq.shape,
-         activation_scale_ub=activation_scale_ub,
-     )
-
-
- @torch.inference_mode()
- def quantize_int4(
-     w: Tensor,
-     output_device: torch.device | None = None,
- ) -> Int4Weights:
-     """Quantize [n, k/2] weight tensor.
-
-     Args:
-         w (Tensor): [n, k/2] input high precision tensor to quantize.
-     """
-     if w.ndim >= 3:
-         wq, scale, zero_point = zip(*[int4_row_quantize(i) for i in w], strict=False)
-         wq = torch.stack([pack_int4(i) for i in wq], dim=0)
-         scale = torch.stack(scale, dim=0)
-         zero_point = torch.stack(zero_point, dim=0)
-     else:
-         wq, scale, zero_point = int4_row_quantize(w)
-         wq = pack_int4(wq)
-     del w
-     return Int4Weights(
-         weight=wq.to(output_device),
-         scale=scale.to(output_device),
-         zero_point=zero_point.to(output_device),
-         shape=wq.shape,
-     )
-
-
- @torch.inference_mode()
- def load_fp8(
-     w: Tensor,
-     w_scale: Tensor,
-     fp8_activation_scale_ub: float,
-     output_device: torch.device | None = None,
- ) -> Fp8RowwiseWeights:
-     """Load FP8 [n, k] weight tensor.
-
-     Args:
-         w (Tensor): [n, k] input FP8.
-         fp8_activation_scale_ub (float): Upper bound for activation max.
-     """
-     activation_scale_ub = torch.tensor(
-         [fp8_activation_scale_ub],
-         dtype=torch.float,
-         device=output_device,
-     )
-     return Fp8RowwiseWeights(
-         weight=w.to(torch.float8_e4m3fn).to(device=output_device),
-         scale=w_scale.to(device=output_device),
-         shape=w.shape,
-         activation_scale_ub=activation_scale_ub,
-     )
-
-
- @torch.inference_mode()
- def load_int4(
-     w: Tensor,
-     scale: Tensor,
-     zero_point: Tensor,
-     output_device: torch.device | None = None,
- ) -> Int4Weights:
-     """Load INT4 [n, k/2] weight tensor.
-
-     Args:
-         w (Tensor): [n, k/2] input INT4.
-     """
-     return Int4Weights(
-         weight=w.to(torch.int8).to(device=output_device),
-         scale=scale.to(device=output_device),
-         zero_point=zero_point.to(device=output_device),
-         shape=w.shape,
-     )
-
-
- def fc_dynamic(
-     x: Tensor,
-     w: Fp8RowwiseWeights | Int4Weights,
-     activation_scale_ub: Tensor | None = None,
-     num_tokens: Tensor | None = None,
-     is_memory_bounded: bool = False,
- ) -> Tensor:
-     """
-     Single w8a8 fc layer with dynamic row-wise scaling, or w4a16 fc layer with dyanmic row-wise scaling
-     """
-     if isinstance(w, Int4Weights):
-         y = torch.ops.fbgemm.bf16i4bf16_rowwise(x, w.weight, w.scale, w.zero_point)
-     else:
-         xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_row(x, num_tokens, activation_scale_ub)
-         y = torch.ops.fbgemm.f8f8bf16_rowwise(xq, w.weight, x_scale, w.scale, use_fast_accum=True)
-         del xq
-     return y
-
-
- def ffn_swiglu_dynamic(
-     x: Tensor,
-     w1: Fp8RowwiseWeights | Int4Weights,
-     w3: Fp8RowwiseWeights | Int4Weights,
-     w2: Fp8RowwiseWeights | Int4Weights,
-     activation_scale_ub: Tensor | None = None,
-     num_tokens: Tensor | None = None,
-     is_memory_bounded: bool = False,
- ) -> Tensor:
-     assert x.dim() == 3 or x.dim() == 2
-     if x.dim() == 3:
-         (B, T, D) = x.shape  # noqa: N806
-     else:
-         (T, D) = x.shape  # noqa: N806
-         B = 1  # noqa: N806
-
-     HD_L = w1.shape[0]  # noqa: N806
-     assert HD_L == w3.shape[0]
-     x1 = fc_dynamic(
-         x.view(B * T, D),
-         w1,
-         activation_scale_ub,
-         num_tokens,
-         is_memory_bounded,
-     )
-     x2 = fc_dynamic(
-         x.view(B * T, D),
-         w3,
-         activation_scale_ub,
-         num_tokens,
-         is_memory_bounded,
-     )
-     z = torch.nn.functional.silu(x1) * x2
-     del x1, x2
-
-     z_ = fc_dynamic(z, w2, activation_scale_ub, num_tokens, is_memory_bounded)
-
-     if x.dim() == 3:
-         return z_.view(B, T, D)
-     else:
-         return z_
llama_stack/providers/inline/inference/meta_reference/__init__.py +0 -20
@@ -1,20 +0,0 @@
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the terms described in the LICENSE file in
- # the root directory of this source tree.
-
- from typing import Any
-
- from .config import MetaReferenceInferenceConfig
-
-
- async def get_provider_impl(
-     config: MetaReferenceInferenceConfig,
-     _deps: dict[str, Any],
- ):
-     from .inference import MetaReferenceInferenceImpl
-
-     impl = MetaReferenceInferenceImpl(config)
-     await impl.initialize()
-     return impl
llama_stack/providers/inline/inference/meta_reference/common.py +0 -24
@@ -1,24 +0,0 @@
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the terms described in the LICENSE file in
- # the root directory of this source tree.
-
- from pathlib import Path
-
- from llama_stack.core.utils.model_utils import model_local_dir
-
-
- def model_checkpoint_dir(model_id) -> str:
-     checkpoint_dir = Path(model_local_dir(model_id))
-
-     paths = [Path(checkpoint_dir / f"consolidated.{ext}") for ext in ["pth", "00.pth"]]
-     if not any(p.exists() for p in paths):
-         checkpoint_dir = checkpoint_dir / "original"
-
-     assert checkpoint_dir.exists(), (
-         f"Could not find checkpoints in: {model_local_dir(model_id)}. "
-         f"If you try to use the native llama model, please download the model using `llama-model download --source meta --model-id {model_id}` (see https://github.com/meta-llama/llama-models). "
-         f"Otherwise, please save your model checkpoint under {model_local_dir(model_id)}"
-     )
-     return str(checkpoint_dir)
llama_stack/providers/inline/inference/meta_reference/config.py +0 -68
@@ -1,68 +0,0 @@
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the terms described in the LICENSE file in
- # the root directory of this source tree.
-
- from typing import Any
-
- from pydantic import BaseModel, field_validator
-
- from llama_stack.providers.utils.inference import supported_inference_models
- from llama_stack_api import QuantizationConfig
-
-
- class MetaReferenceInferenceConfig(BaseModel):
-     # this is a placeholder to indicate inference model id
-     # the actual inference model id is dtermined by the moddel id in the request
-     # Note: you need to register the model before using it for inference
-     # models in the resouce list in the config.yaml config will be registered automatically
-     model: str | None = None
-     torch_seed: int | None = None
-     max_seq_len: int = 4096
-     max_batch_size: int = 1
-     model_parallel_size: int | None = None
-
-     # when this is False, we assume that the distributed process group is setup by someone
-     # outside of this code (e.g., when run inside `torchrun`). that is useful for clients
-     # (including our testing code) who might be using llama-stack as a library.
-     create_distributed_process_group: bool = True
-
-     # By default, the implementation will look at ~/.llama/checkpoints/<model> but you
-     # can override by specifying the directory explicitly
-     checkpoint_dir: str | None = None
-
-     quantization: QuantizationConfig | None = None
-
-     @field_validator("model")
-     @classmethod
-     def validate_model(cls, model: str) -> str:
-         permitted_models = supported_inference_models()
-         descriptors = [m.descriptor() for m in permitted_models]
-         repos = [m.huggingface_repo for m in permitted_models if m.huggingface_repo is not None]
-         if model not in (descriptors + repos):
-             model_list = "\n\t".join(repos)
-             raise ValueError(f"Unknown model: `{model}`. Choose from [\n\t{model_list}\n]")
-         return model
-
-     @classmethod
-     def sample_run_config(
-         cls,
-         model: str = "Llama3.2-3B-Instruct",
-         checkpoint_dir: str = "${env.CHECKPOINT_DIR:=null}",
-         quantization_type: str = "${env.QUANTIZATION_TYPE:=bf16}",
-         model_parallel_size: str = "${env.MODEL_PARALLEL_SIZE:=0}",
-         max_batch_size: str = "${env.MAX_BATCH_SIZE:=1}",
-         max_seq_len: str = "${env.MAX_SEQ_LEN:=4096}",
-         **kwargs,
-     ) -> dict[str, Any]:
-         return {
-             "model": model,
-             "checkpoint_dir": checkpoint_dir,
-             "quantization": {
-                 "type": quantization_type,
-             },
-             "model_parallel_size": model_parallel_size,
-             "max_batch_size": max_batch_size,
-             "max_seq_len": max_seq_len,
-         }
llama_stack/providers/inline/inference/meta_reference/generators.py +0 -201
@@ -1,201 +0,0 @@
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the terms described in the LICENSE file in
- # the root directory of this source tree.
-
- import math
- from typing import Optional
-
- import torch
- from lmformatenforcer import JsonSchemaParser, TokenEnforcer, TokenEnforcerTokenizerData
-
- from llama_stack.models.llama.datatypes import QuantizationMode, ToolPromptFormat
- from llama_stack.models.llama.llama3.generation import Llama3
- from llama_stack.models.llama.llama3.tokenizer import Tokenizer as Llama3Tokenizer
- from llama_stack.models.llama.llama4.generation import Llama4
- from llama_stack.models.llama.llama4.tokenizer import Tokenizer as Llama4Tokenizer
- from llama_stack.models.llama.sku_types import Model, ModelFamily
- from llama_stack_api import (
-     GreedySamplingStrategy,
-     JsonSchemaResponseFormat,
-     OpenAIChatCompletionRequestWithExtraBody,
-     OpenAIResponseFormatJSONSchema,
-     ResponseFormat,
-     ResponseFormatType,
-     SamplingParams,
-     TopPSamplingStrategy,
- )
-
- from .common import model_checkpoint_dir
- from .config import MetaReferenceInferenceConfig
- from .inference import resolve_model
-
- Tokenizer = Llama4Tokenizer | Llama3Tokenizer
-
-
- class LogitsProcessor:
-     def __init__(self, token_enforcer: TokenEnforcer):
-         self.token_enforcer = token_enforcer
-         self.mask: torch.Tensor | None = None
-
-     def __call__(self, tokens: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
-         token_sequence = tokens[0, :].tolist()
-         allowed_tokens = self.token_enforcer.get_allowed_tokens(token_sequence)
-
-         if self.mask is not None:
-             self.mask.fill_(-math.inf)
-         else:
-             self.mask = torch.full_like(scores, -math.inf)
-
-         self.mask[:, :, allowed_tokens] = 0
-         scores = scores + self.mask
-         return scores
-
-
- def get_logits_processor(
-     tokenizer: Tokenizer,
-     vocab_size: int,
-     response_format: ResponseFormat | None,
- ) -> Optional["LogitsProcessor"]:
-     if response_format is None:
-         return None
-
-     if not isinstance(response_format, JsonSchemaResponseFormat):
-         raise ValueError(f"Unsupported response format type {response_format.type}")
-
-     parser = JsonSchemaParser(response_format.json_schema)
-     data = TokenEnforcerTokenizerData(
-         _build_regular_tokens_list(tokenizer, vocab_size),
-         tokenizer.decode,
-         tokenizer.stop_tokens,
-     )
-     token_enforcer = TokenEnforcer(data, parser)
-     return LogitsProcessor(token_enforcer)
-
-
- def _build_regular_tokens_list(tokenizer: Tokenizer, vocab_size: int) -> list[tuple[int, str, bool]]:
-     token_0 = tokenizer.encode("0", bos=False, eos=False)[-1]
-     regular_tokens = []
-
-     special_token_ids = set(tokenizer.special_tokens.values())
-     for token_idx in range(vocab_size):
-         if token_idx in special_token_ids:
-             continue
-
-         # We prepend token 0 and skip the first letter of the result to get a space if the token is a start word.
-         decoded_after_0 = tokenizer.decode([token_0, token_idx])[1:]
-         decoded_regular = tokenizer.decode([token_idx])
-         is_word_start_token = len(decoded_after_0) > len(decoded_regular)
-         regular_tokens.append((token_idx, decoded_after_0, is_word_start_token))
-     return regular_tokens
-
-
- def _infer_sampling_params(sampling_params: SamplingParams):
-     if isinstance(sampling_params.strategy, GreedySamplingStrategy):
-         temperature = 0.0
-         top_p = 1.0
-     elif isinstance(sampling_params.strategy, TopPSamplingStrategy):
-         temperature = sampling_params.strategy.temperature or 1.0
-         top_p = sampling_params.strategy.top_p or 1.0
-     else:
-         raise ValueError(f"Unsupported sampling strategy {sampling_params.strategy}")
-     return temperature, top_p
-
-
- class LlamaGenerator:
-     def __init__(
-         self,
-         config: MetaReferenceInferenceConfig,
-         model_id: str,
-         llama_model: Model,
-     ):
-         if config.checkpoint_dir and config.checkpoint_dir != "null":
-             ckpt_dir = config.checkpoint_dir
-         else:
-             resolved_model = resolve_model(model_id)
-             if resolved_model is None:
-                 # if the model is not a native llama model, get the default checkpoint_dir based on model id
-                 ckpt_dir = model_checkpoint_dir(model_id)
-             else:
-                 # if the model is a native llama model, get the default checkpoint_dir based on model core_model_id value
-                 ckpt_dir = model_checkpoint_dir(resolved_model.descriptor())
-
-         if config.quantization:
-             if config.quantization.type == "fp8_mixed":
-                 quantization_mode = QuantizationMode.fp8_mixed
-             elif config.quantization.type == "int4_mixed":
-                 quantization_mode = QuantizationMode.int4_mixed
-             elif config.quantization.type == "bf16":
-                 quantization_mode = None
-             else:
-                 raise ValueError(f"Unsupported quantization mode {config.quantization}")
-         else:
-             quantization_mode = None
-
-         cls = Llama4 if llama_model.model_family == ModelFamily.llama4 else Llama3
-         self.inner_generator = cls.build(
-             ckpt_dir=ckpt_dir,
-             max_seq_len=config.max_seq_len,
-             max_batch_size=config.max_batch_size,
-             world_size=config.model_parallel_size or llama_model.pth_file_count,
-             quantization_mode=quantization_mode,
-         )
-
-         self.tokenizer = self.inner_generator.tokenizer
-         self.args = self.inner_generator.args
-         self.formatter = self.inner_generator.formatter
-
-     def chat_completion(
-         self,
-         request: OpenAIChatCompletionRequestWithExtraBody,
-         raw_messages: list,
-     ):
-         """Generate chat completion using OpenAI request format.
-
-         Args:
-             request: OpenAI chat completion request
-             raw_messages: Pre-converted list of RawMessage objects
-         """
-
-         # Determine tool prompt format
-         tool_prompt_format = ToolPromptFormat.json if request.tools else ToolPromptFormat.json
-
-         # Prepare sampling params
-         sampling_params = SamplingParams()
-         if request.temperature is not None or request.top_p is not None:
-             sampling_params.strategy = TopPSamplingStrategy(
-                 temperature=request.temperature if request.temperature is not None else 1.0,
-                 top_p=request.top_p if request.top_p is not None else 1.0,
-             )
-         if request.max_tokens:
-             sampling_params.max_tokens = request.max_tokens
-
-         max_gen_len = sampling_params.max_tokens
-         if max_gen_len is None or max_gen_len == 0 or max_gen_len >= self.args.max_seq_len:
-             max_gen_len = self.args.max_seq_len - 1
-
-         temperature, top_p = _infer_sampling_params(sampling_params)
-
-         # Get logits processor for response format
-         logits_processor = None
-         if request.response_format:
-             if isinstance(request.response_format, OpenAIResponseFormatJSONSchema):
-                 # Extract the actual schema from OpenAIJSONSchema TypedDict
-                 schema_dict = request.response_format.json_schema.get("schema") or {}
-                 json_schema_format = JsonSchemaResponseFormat(
-                     type=ResponseFormatType.json_schema,
-                     json_schema=schema_dict,
-                 )
-                 logits_processor = get_logits_processor(self.tokenizer, self.args.vocab_size, json_schema_format)
-
-         # Generate
-         yield from self.inner_generator.generate(
-             llm_inputs=[self.formatter.encode_dialog_prompt(raw_messages, tool_prompt_format)],
-             max_gen_len=max_gen_len,
-             temperature=temperature,
-             top_p=top_p,
-             logprobs=False,
-             echo=False,
-             logits_processor=logits_processor,
-         )