llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.
- llama_stack/__init__.py +5 -0
- llama_stack/apis/agents/__init__.py +1 -1
- llama_stack/apis/agents/agents.py +700 -281
- llama_stack/apis/agents/openai_responses.py +1311 -0
- llama_stack/{providers/adapters/memory/sample/config.py → apis/batches/__init__.py} +2 -5
- llama_stack/apis/batches/batches.py +100 -0
- llama_stack/apis/benchmarks/__init__.py +7 -0
- llama_stack/apis/benchmarks/benchmarks.py +108 -0
- llama_stack/apis/common/content_types.py +143 -0
- llama_stack/apis/common/errors.py +103 -0
- llama_stack/apis/common/job_types.py +38 -0
- llama_stack/apis/common/responses.py +36 -0
- llama_stack/apis/common/training_types.py +36 -5
- llama_stack/apis/common/type_system.py +158 -0
- llama_stack/apis/conversations/__init__.py +31 -0
- llama_stack/apis/conversations/conversations.py +286 -0
- llama_stack/apis/datasetio/__init__.py +7 -0
- llama_stack/apis/datasetio/datasetio.py +59 -0
- llama_stack/apis/datasets/__init__.py +7 -0
- llama_stack/apis/datasets/datasets.py +251 -0
- llama_stack/apis/datatypes.py +160 -0
- llama_stack/apis/eval/__init__.py +7 -0
- llama_stack/apis/eval/eval.py +169 -0
- llama_stack/apis/files/__init__.py +7 -0
- llama_stack/apis/files/files.py +199 -0
- llama_stack/apis/inference/__init__.py +1 -1
- llama_stack/apis/inference/inference.py +1169 -113
- llama_stack/apis/inspect/__init__.py +1 -1
- llama_stack/apis/inspect/inspect.py +69 -16
- llama_stack/apis/models/__init__.py +1 -1
- llama_stack/apis/models/models.py +148 -21
- llama_stack/apis/post_training/__init__.py +1 -1
- llama_stack/apis/post_training/post_training.py +265 -120
- llama_stack/{providers/adapters/agents/sample/config.py → apis/prompts/__init__.py} +2 -5
- llama_stack/apis/prompts/prompts.py +204 -0
- llama_stack/apis/providers/__init__.py +7 -0
- llama_stack/apis/providers/providers.py +69 -0
- llama_stack/apis/resource.py +37 -0
- llama_stack/apis/safety/__init__.py +1 -1
- llama_stack/apis/safety/safety.py +95 -12
- llama_stack/apis/scoring/__init__.py +7 -0
- llama_stack/apis/scoring/scoring.py +93 -0
- llama_stack/apis/scoring_functions/__init__.py +7 -0
- llama_stack/apis/scoring_functions/scoring_functions.py +208 -0
- llama_stack/apis/shields/__init__.py +1 -1
- llama_stack/apis/shields/shields.py +76 -33
- llama_stack/apis/synthetic_data_generation/__init__.py +1 -1
- llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py +40 -17
- llama_stack/apis/telemetry/__init__.py +1 -1
- llama_stack/apis/telemetry/telemetry.py +322 -31
- llama_stack/apis/{dataset → tools}/__init__.py +2 -1
- llama_stack/apis/tools/rag_tool.py +218 -0
- llama_stack/apis/tools/tools.py +221 -0
- llama_stack/apis/vector_io/__init__.py +7 -0
- llama_stack/apis/vector_io/vector_io.py +960 -0
- llama_stack/apis/vector_stores/__init__.py +7 -0
- llama_stack/apis/vector_stores/vector_stores.py +51 -0
- llama_stack/apis/version.py +9 -0
- llama_stack/cli/llama.py +13 -5
- llama_stack/cli/stack/_list_deps.py +182 -0
- llama_stack/cli/stack/list_apis.py +1 -1
- llama_stack/cli/stack/list_deps.py +55 -0
- llama_stack/cli/stack/list_providers.py +24 -10
- llama_stack/cli/stack/list_stacks.py +56 -0
- llama_stack/cli/stack/remove.py +115 -0
- llama_stack/cli/stack/run.py +169 -56
- llama_stack/cli/stack/stack.py +18 -4
- llama_stack/cli/stack/utils.py +151 -0
- llama_stack/cli/table.py +23 -61
- llama_stack/cli/utils.py +29 -0
- llama_stack/core/access_control/access_control.py +131 -0
- llama_stack/core/access_control/conditions.py +129 -0
- llama_stack/core/access_control/datatypes.py +107 -0
- llama_stack/core/build.py +164 -0
- llama_stack/core/client.py +205 -0
- llama_stack/core/common.sh +37 -0
- llama_stack/{distribution → core}/configure.py +74 -55
- llama_stack/core/conversations/conversations.py +309 -0
- llama_stack/core/datatypes.py +625 -0
- llama_stack/core/distribution.py +276 -0
- llama_stack/core/external.py +54 -0
- llama_stack/core/id_generation.py +42 -0
- llama_stack/core/inspect.py +86 -0
- llama_stack/core/library_client.py +539 -0
- llama_stack/core/prompts/prompts.py +234 -0
- llama_stack/core/providers.py +137 -0
- llama_stack/core/request_headers.py +115 -0
- llama_stack/core/resolver.py +506 -0
- llama_stack/core/routers/__init__.py +101 -0
- llama_stack/core/routers/datasets.py +73 -0
- llama_stack/core/routers/eval_scoring.py +155 -0
- llama_stack/core/routers/inference.py +645 -0
- llama_stack/core/routers/safety.py +85 -0
- llama_stack/core/routers/tool_runtime.py +91 -0
- llama_stack/core/routers/vector_io.py +442 -0
- llama_stack/core/routing_tables/benchmarks.py +62 -0
- llama_stack/core/routing_tables/common.py +254 -0
- llama_stack/core/routing_tables/datasets.py +91 -0
- llama_stack/core/routing_tables/models.py +163 -0
- llama_stack/core/routing_tables/scoring_functions.py +66 -0
- llama_stack/core/routing_tables/shields.py +61 -0
- llama_stack/core/routing_tables/toolgroups.py +129 -0
- llama_stack/core/routing_tables/vector_stores.py +292 -0
- llama_stack/core/server/auth.py +187 -0
- llama_stack/core/server/auth_providers.py +494 -0
- llama_stack/core/server/quota.py +110 -0
- llama_stack/core/server/routes.py +141 -0
- llama_stack/core/server/server.py +542 -0
- llama_stack/core/server/tracing.py +80 -0
- llama_stack/core/stack.py +546 -0
- llama_stack/core/start_stack.sh +117 -0
- llama_stack/core/storage/datatypes.py +283 -0
- llama_stack/{cli/model → core/store}/__init__.py +1 -1
- llama_stack/core/store/registry.py +199 -0
- llama_stack/core/testing_context.py +49 -0
- llama_stack/core/ui/app.py +55 -0
- llama_stack/core/ui/modules/api.py +32 -0
- llama_stack/core/ui/modules/utils.py +42 -0
- llama_stack/core/ui/page/distribution/datasets.py +18 -0
- llama_stack/core/ui/page/distribution/eval_tasks.py +20 -0
- llama_stack/core/ui/page/distribution/models.py +18 -0
- llama_stack/core/ui/page/distribution/providers.py +27 -0
- llama_stack/core/ui/page/distribution/resources.py +48 -0
- llama_stack/core/ui/page/distribution/scoring_functions.py +18 -0
- llama_stack/core/ui/page/distribution/shields.py +19 -0
- llama_stack/core/ui/page/evaluations/app_eval.py +143 -0
- llama_stack/core/ui/page/evaluations/native_eval.py +253 -0
- llama_stack/core/ui/page/playground/chat.py +130 -0
- llama_stack/core/ui/page/playground/tools.py +352 -0
- llama_stack/core/utils/config.py +30 -0
- llama_stack/{distribution → core}/utils/config_dirs.py +3 -6
- llama_stack/core/utils/config_resolution.py +125 -0
- llama_stack/core/utils/context.py +84 -0
- llama_stack/core/utils/exec.py +96 -0
- llama_stack/{providers/impls/meta_reference/codeshield/config.py → core/utils/image_types.py} +4 -3
- llama_stack/{distribution → core}/utils/model_utils.py +2 -2
- llama_stack/{distribution → core}/utils/prompt_for_config.py +30 -63
- llama_stack/{apis/batch_inference → distributions/dell}/__init__.py +1 -1
- llama_stack/distributions/dell/build.yaml +33 -0
- llama_stack/distributions/dell/dell.py +158 -0
- llama_stack/distributions/dell/run-with-safety.yaml +141 -0
- llama_stack/distributions/dell/run.yaml +132 -0
- llama_stack/distributions/meta-reference-gpu/__init__.py +7 -0
- llama_stack/distributions/meta-reference-gpu/build.yaml +32 -0
- llama_stack/distributions/meta-reference-gpu/meta_reference.py +163 -0
- llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +154 -0
- llama_stack/distributions/meta-reference-gpu/run.yaml +139 -0
- llama_stack/{apis/evals → distributions/nvidia}/__init__.py +1 -1
- llama_stack/distributions/nvidia/build.yaml +29 -0
- llama_stack/distributions/nvidia/nvidia.py +154 -0
- llama_stack/distributions/nvidia/run-with-safety.yaml +137 -0
- llama_stack/distributions/nvidia/run.yaml +116 -0
- llama_stack/distributions/open-benchmark/__init__.py +7 -0
- llama_stack/distributions/open-benchmark/build.yaml +36 -0
- llama_stack/distributions/open-benchmark/open_benchmark.py +303 -0
- llama_stack/distributions/open-benchmark/run.yaml +252 -0
- llama_stack/distributions/postgres-demo/__init__.py +7 -0
- llama_stack/distributions/postgres-demo/build.yaml +23 -0
- llama_stack/distributions/postgres-demo/postgres_demo.py +125 -0
- llama_stack/distributions/postgres-demo/run.yaml +115 -0
- llama_stack/{apis/memory → distributions/starter}/__init__.py +1 -1
- llama_stack/distributions/starter/build.yaml +61 -0
- llama_stack/distributions/starter/run-with-postgres-store.yaml +285 -0
- llama_stack/distributions/starter/run.yaml +276 -0
- llama_stack/distributions/starter/starter.py +345 -0
- llama_stack/distributions/starter-gpu/__init__.py +7 -0
- llama_stack/distributions/starter-gpu/build.yaml +61 -0
- llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +288 -0
- llama_stack/distributions/starter-gpu/run.yaml +279 -0
- llama_stack/distributions/starter-gpu/starter_gpu.py +20 -0
- llama_stack/distributions/template.py +456 -0
- llama_stack/distributions/watsonx/__init__.py +7 -0
- llama_stack/distributions/watsonx/build.yaml +33 -0
- llama_stack/distributions/watsonx/run.yaml +133 -0
- llama_stack/distributions/watsonx/watsonx.py +95 -0
- llama_stack/env.py +24 -0
- llama_stack/log.py +314 -0
- llama_stack/models/llama/checkpoint.py +164 -0
- llama_stack/models/llama/datatypes.py +164 -0
- llama_stack/models/llama/hadamard_utils.py +86 -0
- llama_stack/models/llama/llama3/args.py +74 -0
- llama_stack/models/llama/llama3/chat_format.py +286 -0
- llama_stack/models/llama/llama3/generation.py +376 -0
- llama_stack/models/llama/llama3/interface.py +255 -0
- llama_stack/models/llama/llama3/model.py +304 -0
- llama_stack/models/llama/llama3/multimodal/__init__.py +12 -0
- llama_stack/models/llama/llama3/multimodal/encoder_utils.py +180 -0
- llama_stack/models/llama/llama3/multimodal/image_transform.py +409 -0
- llama_stack/models/llama/llama3/multimodal/model.py +1430 -0
- llama_stack/models/llama/llama3/multimodal/utils.py +26 -0
- llama_stack/models/llama/llama3/prompt_templates/__init__.py +22 -0
- llama_stack/models/llama/llama3/prompt_templates/base.py +39 -0
- llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +319 -0
- llama_stack/models/llama/llama3/prompt_templates/tool_response.py +62 -0
- llama_stack/models/llama/llama3/quantization/loader.py +316 -0
- llama_stack/models/llama/llama3/template_data.py +116 -0
- llama_stack/models/llama/llama3/tokenizer.model +128000 -0
- llama_stack/models/llama/llama3/tokenizer.py +198 -0
- llama_stack/models/llama/llama3/tool_utils.py +266 -0
- llama_stack/models/llama/llama3_1/__init__.py +12 -0
- llama_stack/models/llama/llama3_1/prompt_format.md +358 -0
- llama_stack/models/llama/llama3_1/prompts.py +258 -0
- llama_stack/models/llama/llama3_2/prompts_text.py +229 -0
- llama_stack/models/llama/llama3_2/prompts_vision.py +126 -0
- llama_stack/models/llama/llama3_2/text_prompt_format.md +286 -0
- llama_stack/models/llama/llama3_2/vision_prompt_format.md +141 -0
- llama_stack/models/llama/llama3_3/prompts.py +259 -0
- llama_stack/models/llama/llama4/args.py +107 -0
- llama_stack/models/llama/llama4/chat_format.py +317 -0
- llama_stack/models/llama/llama4/datatypes.py +56 -0
- llama_stack/models/llama/llama4/ffn.py +58 -0
- llama_stack/models/llama/llama4/generation.py +313 -0
- llama_stack/models/llama/llama4/model.py +437 -0
- llama_stack/models/llama/llama4/moe.py +214 -0
- llama_stack/models/llama/llama4/preprocess.py +435 -0
- llama_stack/models/llama/llama4/prompt_format.md +304 -0
- llama_stack/models/llama/llama4/prompt_templates/system_prompts.py +136 -0
- llama_stack/models/llama/llama4/prompts.py +279 -0
- llama_stack/models/llama/llama4/quantization/__init__.py +5 -0
- llama_stack/models/llama/llama4/quantization/loader.py +226 -0
- llama_stack/models/llama/llama4/tokenizer.model +200000 -0
- llama_stack/models/llama/llama4/tokenizer.py +263 -0
- llama_stack/models/llama/llama4/vision/__init__.py +5 -0
- llama_stack/models/llama/llama4/vision/embedding.py +210 -0
- llama_stack/models/llama/llama4/vision/encoder.py +412 -0
- llama_stack/models/llama/prompt_format.py +191 -0
- llama_stack/models/llama/quantize_impls.py +316 -0
- llama_stack/models/llama/sku_list.py +1029 -0
- llama_stack/models/llama/sku_types.py +233 -0
- llama_stack/models/llama/tokenizer_utils.py +40 -0
- llama_stack/providers/datatypes.py +136 -107
- llama_stack/providers/inline/__init__.py +5 -0
- llama_stack/providers/inline/agents/__init__.py +5 -0
- llama_stack/providers/{impls/meta_reference/agents → inline/agents/meta_reference}/__init__.py +12 -5
- llama_stack/providers/inline/agents/meta_reference/agent_instance.py +1024 -0
- llama_stack/providers/inline/agents/meta_reference/agents.py +383 -0
- llama_stack/providers/inline/agents/meta_reference/config.py +37 -0
- llama_stack/providers/inline/agents/meta_reference/persistence.py +228 -0
- llama_stack/providers/inline/agents/meta_reference/responses/__init__.py +5 -0
- llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +423 -0
- llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +1226 -0
- llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +449 -0
- llama_stack/providers/inline/agents/meta_reference/responses/types.py +194 -0
- llama_stack/providers/inline/agents/meta_reference/responses/utils.py +365 -0
- llama_stack/providers/inline/agents/meta_reference/safety.py +52 -0
- llama_stack/providers/inline/batches/__init__.py +5 -0
- llama_stack/providers/inline/batches/reference/__init__.py +36 -0
- llama_stack/providers/inline/batches/reference/batches.py +679 -0
- llama_stack/providers/inline/batches/reference/config.py +40 -0
- llama_stack/providers/inline/datasetio/__init__.py +5 -0
- llama_stack/providers/inline/datasetio/localfs/__init__.py +20 -0
- llama_stack/providers/inline/datasetio/localfs/config.py +23 -0
- llama_stack/providers/inline/datasetio/localfs/datasetio.py +113 -0
- llama_stack/providers/inline/eval/__init__.py +5 -0
- llama_stack/providers/inline/eval/meta_reference/__init__.py +28 -0
- llama_stack/providers/inline/eval/meta_reference/config.py +23 -0
- llama_stack/providers/inline/eval/meta_reference/eval.py +259 -0
- llama_stack/providers/inline/files/localfs/__init__.py +20 -0
- llama_stack/providers/inline/files/localfs/config.py +31 -0
- llama_stack/providers/inline/files/localfs/files.py +219 -0
- llama_stack/providers/inline/inference/__init__.py +5 -0
- llama_stack/providers/{impls/meta_reference/inference → inline/inference/meta_reference}/__init__.py +4 -4
- llama_stack/providers/inline/inference/meta_reference/common.py +24 -0
- llama_stack/providers/inline/inference/meta_reference/config.py +68 -0
- llama_stack/providers/inline/inference/meta_reference/generators.py +211 -0
- llama_stack/providers/inline/inference/meta_reference/inference.py +158 -0
- llama_stack/providers/inline/inference/meta_reference/model_parallel.py +96 -0
- llama_stack/providers/{impls/meta_reference/inference → inline/inference/meta_reference}/parallel_utils.py +56 -73
- llama_stack/providers/inline/inference/sentence_transformers/__init__.py +22 -0
- llama_stack/providers/{impls/meta_reference/agents → inline/inference/sentence_transformers}/config.py +6 -4
- llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +83 -0
- llama_stack/providers/inline/post_training/__init__.py +5 -0
- llama_stack/providers/inline/post_training/common/__init__.py +5 -0
- llama_stack/providers/inline/post_training/common/utils.py +35 -0
- llama_stack/providers/inline/post_training/common/validator.py +36 -0
- llama_stack/providers/inline/post_training/huggingface/__init__.py +27 -0
- llama_stack/providers/inline/post_training/huggingface/config.py +83 -0
- llama_stack/providers/inline/post_training/huggingface/post_training.py +208 -0
- llama_stack/providers/inline/post_training/huggingface/recipes/__init__.py +5 -0
- llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py +519 -0
- llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device_dpo.py +485 -0
- llama_stack/providers/inline/post_training/huggingface/utils.py +269 -0
- llama_stack/providers/inline/post_training/torchtune/__init__.py +27 -0
- llama_stack/providers/inline/post_training/torchtune/common/__init__.py +5 -0
- llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py +240 -0
- llama_stack/providers/inline/post_training/torchtune/common/utils.py +99 -0
- llama_stack/providers/inline/post_training/torchtune/config.py +20 -0
- llama_stack/providers/inline/post_training/torchtune/datasets/__init__.py +5 -0
- llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py +57 -0
- llama_stack/providers/inline/post_training/torchtune/datasets/sft.py +78 -0
- llama_stack/providers/inline/post_training/torchtune/post_training.py +178 -0
- llama_stack/providers/inline/post_training/torchtune/recipes/__init__.py +5 -0
- llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +588 -0
- llama_stack/providers/inline/safety/__init__.py +5 -0
- llama_stack/providers/{impls/meta_reference/codeshield → inline/safety/code_scanner}/__init__.py +4 -2
- llama_stack/providers/inline/safety/code_scanner/code_scanner.py +128 -0
- llama_stack/providers/{impls/meta_reference/memory → inline/safety/code_scanner}/config.py +5 -3
- llama_stack/providers/inline/safety/llama_guard/__init__.py +19 -0
- llama_stack/providers/inline/safety/llama_guard/config.py +19 -0
- llama_stack/providers/inline/safety/llama_guard/llama_guard.py +489 -0
- llama_stack/providers/{adapters/memory/sample → inline/safety/prompt_guard}/__init__.py +4 -4
- llama_stack/providers/inline/safety/prompt_guard/config.py +32 -0
- llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +131 -0
- llama_stack/providers/inline/scoring/__init__.py +5 -0
- llama_stack/providers/inline/scoring/basic/__init__.py +25 -0
- llama_stack/providers/{adapters/memory/weaviate → inline/scoring/basic}/config.py +5 -7
- llama_stack/providers/inline/scoring/basic/scoring.py +126 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/__init__.py +5 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py +240 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py +41 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/__init__.py +5 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py +21 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py +21 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py +23 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py +27 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py +71 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py +21 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py +80 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py +66 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py +58 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py +38 -0
- llama_stack/providers/inline/scoring/basic/utils/__init__.py +5 -0
- llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py +3319 -0
- llama_stack/providers/inline/scoring/basic/utils/math_utils.py +330 -0
- llama_stack/providers/inline/scoring/braintrust/__init__.py +27 -0
- llama_stack/providers/inline/scoring/braintrust/braintrust.py +230 -0
- llama_stack/providers/inline/scoring/braintrust/config.py +21 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/__init__.py +5 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/__init__.py +5 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py +23 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py +24 -0
- llama_stack/providers/inline/scoring/llm_as_judge/__init__.py +21 -0
- llama_stack/providers/inline/scoring/llm_as_judge/config.py +14 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +113 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/__init__.py +5 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/__init__.py +5 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py +96 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py +20 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py +81 -0
- llama_stack/providers/inline/telemetry/__init__.py +5 -0
- llama_stack/providers/inline/telemetry/meta_reference/__init__.py +21 -0
- llama_stack/providers/inline/telemetry/meta_reference/config.py +47 -0
- llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +252 -0
- llama_stack/providers/inline/tool_runtime/__init__.py +5 -0
- llama_stack/providers/inline/tool_runtime/rag/__init__.py +19 -0
- llama_stack/providers/{impls/meta_reference/telemetry → inline/tool_runtime/rag}/config.py +5 -3
- llama_stack/providers/inline/tool_runtime/rag/context_retriever.py +77 -0
- llama_stack/providers/inline/tool_runtime/rag/memory.py +332 -0
- llama_stack/providers/inline/vector_io/__init__.py +5 -0
- llama_stack/providers/inline/vector_io/chroma/__init__.py +19 -0
- llama_stack/providers/inline/vector_io/chroma/config.py +30 -0
- llama_stack/providers/inline/vector_io/faiss/__init__.py +21 -0
- llama_stack/providers/inline/vector_io/faiss/config.py +26 -0
- llama_stack/providers/inline/vector_io/faiss/faiss.py +293 -0
- llama_stack/providers/inline/vector_io/milvus/__init__.py +19 -0
- llama_stack/providers/inline/vector_io/milvus/config.py +29 -0
- llama_stack/providers/inline/vector_io/qdrant/__init__.py +20 -0
- llama_stack/providers/inline/vector_io/qdrant/config.py +29 -0
- llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py +20 -0
- llama_stack/providers/inline/vector_io/sqlite_vec/config.py +26 -0
- llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +483 -0
- llama_stack/providers/registry/agents.py +16 -18
- llama_stack/providers/registry/batches.py +26 -0
- llama_stack/providers/registry/datasetio.py +49 -0
- llama_stack/providers/registry/eval.py +46 -0
- llama_stack/providers/registry/files.py +31 -0
- llama_stack/providers/registry/inference.py +273 -118
- llama_stack/providers/registry/post_training.py +69 -0
- llama_stack/providers/registry/safety.py +46 -41
- llama_stack/providers/registry/scoring.py +51 -0
- llama_stack/providers/registry/tool_runtime.py +87 -0
- llama_stack/providers/registry/vector_io.py +828 -0
- llama_stack/providers/remote/__init__.py +5 -0
- llama_stack/providers/remote/agents/__init__.py +5 -0
- llama_stack/providers/remote/datasetio/__init__.py +5 -0
- llama_stack/providers/{adapters/memory/chroma → remote/datasetio/huggingface}/__init__.py +7 -4
- llama_stack/providers/remote/datasetio/huggingface/config.py +23 -0
- llama_stack/providers/remote/datasetio/huggingface/huggingface.py +99 -0
- llama_stack/providers/remote/datasetio/nvidia/__init__.py +23 -0
- llama_stack/providers/remote/datasetio/nvidia/config.py +61 -0
- llama_stack/providers/remote/datasetio/nvidia/datasetio.py +116 -0
- llama_stack/providers/remote/eval/__init__.py +5 -0
- llama_stack/providers/remote/eval/nvidia/__init__.py +31 -0
- llama_stack/providers/remote/eval/nvidia/config.py +29 -0
- llama_stack/providers/remote/eval/nvidia/eval.py +162 -0
- llama_stack/providers/remote/files/s3/__init__.py +19 -0
- llama_stack/providers/remote/files/s3/config.py +42 -0
- llama_stack/providers/remote/files/s3/files.py +313 -0
- llama_stack/providers/remote/inference/__init__.py +5 -0
- llama_stack/providers/{adapters/safety/sample → remote/inference/anthropic}/__init__.py +4 -6
- llama_stack/providers/remote/inference/anthropic/anthropic.py +36 -0
- llama_stack/providers/remote/inference/anthropic/config.py +28 -0
- llama_stack/providers/{impls/meta_reference/telemetry → remote/inference/azure}/__init__.py +4 -4
- llama_stack/providers/remote/inference/azure/azure.py +25 -0
- llama_stack/providers/remote/inference/azure/config.py +61 -0
- llama_stack/providers/{adapters → remote}/inference/bedrock/__init__.py +18 -17
- llama_stack/providers/remote/inference/bedrock/bedrock.py +142 -0
- llama_stack/providers/{adapters/inference/sample → remote/inference/bedrock}/config.py +3 -4
- llama_stack/providers/remote/inference/bedrock/models.py +29 -0
- llama_stack/providers/remote/inference/cerebras/__init__.py +19 -0
- llama_stack/providers/remote/inference/cerebras/cerebras.py +28 -0
- llama_stack/providers/remote/inference/cerebras/config.py +30 -0
- llama_stack/providers/{adapters → remote}/inference/databricks/__init__.py +4 -5
- llama_stack/providers/remote/inference/databricks/config.py +37 -0
- llama_stack/providers/remote/inference/databricks/databricks.py +44 -0
- llama_stack/providers/{adapters → remote}/inference/fireworks/__init__.py +8 -4
- llama_stack/providers/remote/inference/fireworks/config.py +27 -0
- llama_stack/providers/remote/inference/fireworks/fireworks.py +27 -0
- llama_stack/providers/{adapters/memory/pgvector → remote/inference/gemini}/__init__.py +4 -4
- llama_stack/providers/remote/inference/gemini/config.py +28 -0
- llama_stack/providers/remote/inference/gemini/gemini.py +82 -0
- llama_stack/providers/remote/inference/groq/__init__.py +15 -0
- llama_stack/providers/remote/inference/groq/config.py +34 -0
- llama_stack/providers/remote/inference/groq/groq.py +18 -0
- llama_stack/providers/remote/inference/llama_openai_compat/__init__.py +15 -0
- llama_stack/providers/remote/inference/llama_openai_compat/config.py +34 -0
- llama_stack/providers/remote/inference/llama_openai_compat/llama.py +46 -0
- llama_stack/providers/remote/inference/nvidia/__init__.py +23 -0
- llama_stack/providers/remote/inference/nvidia/config.py +64 -0
- llama_stack/providers/remote/inference/nvidia/nvidia.py +61 -0
- llama_stack/providers/{adapters/safety/sample/config.py → remote/inference/nvidia/utils.py} +3 -4
- llama_stack/providers/{impls/vllm → remote/inference/ollama}/__init__.py +4 -6
- llama_stack/providers/remote/inference/ollama/config.py +25 -0
- llama_stack/providers/remote/inference/ollama/ollama.py +102 -0
- llama_stack/providers/{adapters/telemetry/opentelemetry → remote/inference/openai}/__init__.py +4 -4
- llama_stack/providers/remote/inference/openai/config.py +39 -0
- llama_stack/providers/remote/inference/openai/openai.py +38 -0
- llama_stack/providers/remote/inference/passthrough/__init__.py +23 -0
- llama_stack/providers/remote/inference/passthrough/config.py +34 -0
- llama_stack/providers/remote/inference/passthrough/passthrough.py +122 -0
- llama_stack/providers/remote/inference/runpod/__init__.py +16 -0
- llama_stack/providers/remote/inference/runpod/config.py +32 -0
- llama_stack/providers/remote/inference/runpod/runpod.py +42 -0
- llama_stack/providers/remote/inference/sambanova/__init__.py +16 -0
- llama_stack/providers/remote/inference/sambanova/config.py +34 -0
- llama_stack/providers/remote/inference/sambanova/sambanova.py +28 -0
- llama_stack/providers/{adapters → remote}/inference/tgi/__init__.py +3 -4
- llama_stack/providers/remote/inference/tgi/config.py +76 -0
- llama_stack/providers/remote/inference/tgi/tgi.py +85 -0
- llama_stack/providers/{adapters → remote}/inference/together/__init__.py +8 -4
- llama_stack/providers/remote/inference/together/config.py +27 -0
- llama_stack/providers/remote/inference/together/together.py +102 -0
- llama_stack/providers/remote/inference/vertexai/__init__.py +15 -0
- llama_stack/providers/remote/inference/vertexai/config.py +48 -0
- llama_stack/providers/remote/inference/vertexai/vertexai.py +54 -0
- llama_stack/providers/remote/inference/vllm/__init__.py +22 -0
- llama_stack/providers/remote/inference/vllm/config.py +59 -0
- llama_stack/providers/remote/inference/vllm/vllm.py +111 -0
- llama_stack/providers/remote/inference/watsonx/__init__.py +15 -0
- llama_stack/providers/remote/inference/watsonx/config.py +45 -0
- llama_stack/providers/remote/inference/watsonx/watsonx.py +336 -0
- llama_stack/providers/remote/post_training/__init__.py +5 -0
- llama_stack/providers/remote/post_training/nvidia/__init__.py +23 -0
- llama_stack/providers/remote/post_training/nvidia/config.py +113 -0
- llama_stack/providers/remote/post_training/nvidia/models.py +27 -0
- llama_stack/providers/remote/post_training/nvidia/post_training.py +430 -0
- llama_stack/providers/remote/post_training/nvidia/utils.py +63 -0
- llama_stack/providers/remote/safety/__init__.py +5 -0
- llama_stack/providers/remote/safety/bedrock/bedrock.py +111 -0
- llama_stack/providers/remote/safety/bedrock/config.py +14 -0
- llama_stack/providers/{adapters/inference/sample → remote/safety/nvidia}/__init__.py +5 -4
- llama_stack/providers/remote/safety/nvidia/config.py +40 -0
- llama_stack/providers/remote/safety/nvidia/nvidia.py +161 -0
- llama_stack/providers/{adapters/agents/sample → remote/safety/sambanova}/__init__.py +5 -4
- llama_stack/providers/remote/safety/sambanova/config.py +37 -0
- llama_stack/providers/remote/safety/sambanova/sambanova.py +98 -0
- llama_stack/providers/remote/tool_runtime/__init__.py +5 -0
- llama_stack/providers/remote/tool_runtime/bing_search/__init__.py +21 -0
- llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py +112 -0
- llama_stack/providers/remote/tool_runtime/bing_search/config.py +22 -0
- llama_stack/providers/remote/tool_runtime/brave_search/__init__.py +20 -0
- llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py +148 -0
- llama_stack/providers/remote/tool_runtime/brave_search/config.py +27 -0
- llama_stack/providers/remote/tool_runtime/model_context_protocol/__init__.py +15 -0
- llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py +20 -0
- llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py +73 -0
- llama_stack/providers/remote/tool_runtime/tavily_search/__init__.py +20 -0
- llama_stack/providers/remote/tool_runtime/tavily_search/config.py +27 -0
- llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py +84 -0
- llama_stack/providers/remote/tool_runtime/wolfram_alpha/__init__.py +22 -0
- llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py +21 -0
- llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py +140 -0
- llama_stack/providers/remote/vector_io/__init__.py +5 -0
- llama_stack/providers/remote/vector_io/chroma/__init__.py +17 -0
- llama_stack/providers/remote/vector_io/chroma/chroma.py +215 -0
- llama_stack/providers/remote/vector_io/chroma/config.py +28 -0
- llama_stack/providers/remote/vector_io/milvus/__init__.py +18 -0
- llama_stack/providers/remote/vector_io/milvus/config.py +35 -0
- llama_stack/providers/remote/vector_io/milvus/milvus.py +375 -0
- llama_stack/providers/remote/vector_io/pgvector/__init__.py +17 -0
- llama_stack/providers/remote/vector_io/pgvector/config.py +47 -0
- llama_stack/providers/remote/vector_io/pgvector/pgvector.py +460 -0
- llama_stack/providers/remote/vector_io/qdrant/__init__.py +17 -0
- llama_stack/providers/remote/vector_io/qdrant/config.py +37 -0
- llama_stack/providers/remote/vector_io/qdrant/qdrant.py +265 -0
- llama_stack/providers/remote/vector_io/weaviate/__init__.py +17 -0
- llama_stack/providers/remote/vector_io/weaviate/config.py +32 -0
- llama_stack/providers/remote/vector_io/weaviate/weaviate.py +393 -0
- llama_stack/providers/utils/bedrock/__init__.py +5 -0
- llama_stack/providers/utils/bedrock/client.py +74 -0
- llama_stack/providers/utils/bedrock/config.py +64 -0
- llama_stack/providers/utils/bedrock/refreshable_boto_session.py +112 -0
- llama_stack/providers/utils/common/__init__.py +5 -0
- llama_stack/providers/utils/common/data_schema_validator.py +103 -0
- llama_stack/providers/utils/datasetio/__init__.py +5 -0
- llama_stack/providers/utils/datasetio/url_utils.py +47 -0
- llama_stack/providers/utils/files/__init__.py +5 -0
- llama_stack/providers/utils/files/form_data.py +69 -0
- llama_stack/providers/utils/inference/__init__.py +8 -7
- llama_stack/providers/utils/inference/embedding_mixin.py +101 -0
- llama_stack/providers/utils/inference/inference_store.py +264 -0
- llama_stack/providers/utils/inference/litellm_openai_mixin.py +336 -0
- llama_stack/providers/utils/inference/model_registry.py +173 -23
- llama_stack/providers/utils/inference/openai_compat.py +1261 -49
- llama_stack/providers/utils/inference/openai_mixin.py +506 -0
- llama_stack/providers/utils/inference/prompt_adapter.py +365 -67
- llama_stack/providers/utils/kvstore/api.py +6 -6
- llama_stack/providers/utils/kvstore/config.py +28 -48
- llama_stack/providers/utils/kvstore/kvstore.py +61 -15
- llama_stack/providers/utils/kvstore/mongodb/__init__.py +9 -0
- llama_stack/providers/utils/kvstore/mongodb/mongodb.py +82 -0
- llama_stack/providers/utils/kvstore/postgres/__init__.py +7 -0
- llama_stack/providers/utils/kvstore/postgres/postgres.py +114 -0
- llama_stack/providers/utils/kvstore/redis/redis.py +33 -9
- llama_stack/providers/utils/kvstore/sqlite/config.py +2 -1
- llama_stack/providers/utils/kvstore/sqlite/sqlite.py +123 -22
- llama_stack/providers/utils/memory/file_utils.py +1 -1
- llama_stack/providers/utils/memory/openai_vector_store_mixin.py +1304 -0
- llama_stack/providers/utils/memory/vector_store.py +220 -82
- llama_stack/providers/utils/pagination.py +43 -0
- llama_stack/providers/utils/responses/__init__.py +5 -0
- llama_stack/providers/utils/responses/responses_store.py +292 -0
- llama_stack/providers/utils/scheduler.py +270 -0
- llama_stack/providers/utils/scoring/__init__.py +5 -0
- llama_stack/providers/utils/scoring/aggregation_utils.py +75 -0
- llama_stack/providers/utils/scoring/base_scoring_fn.py +114 -0
- llama_stack/providers/utils/scoring/basic_scoring_utils.py +26 -0
- llama_stack/providers/utils/sqlstore/__init__.py +5 -0
- llama_stack/providers/utils/sqlstore/api.py +128 -0
- llama_stack/providers/utils/sqlstore/authorized_sqlstore.py +319 -0
- llama_stack/providers/utils/sqlstore/sqlalchemy_sqlstore.py +343 -0
- llama_stack/providers/utils/sqlstore/sqlstore.py +70 -0
- llama_stack/providers/utils/telemetry/trace_protocol.py +142 -0
- llama_stack/providers/utils/telemetry/tracing.py +192 -53
- llama_stack/providers/utils/tools/__init__.py +5 -0
- llama_stack/providers/utils/tools/mcp.py +148 -0
- llama_stack/providers/utils/tools/ttl_dict.py +70 -0
- llama_stack/providers/utils/vector_io/__init__.py +5 -0
- llama_stack/providers/utils/vector_io/vector_utils.py +156 -0
- llama_stack/schema_utils.py +118 -0
- llama_stack/strong_typing/__init__.py +19 -0
- llama_stack/strong_typing/auxiliary.py +228 -0
- llama_stack/strong_typing/classdef.py +440 -0
- llama_stack/strong_typing/core.py +46 -0
- llama_stack/strong_typing/deserializer.py +877 -0
- llama_stack/strong_typing/docstring.py +409 -0
- llama_stack/strong_typing/exception.py +23 -0
- llama_stack/strong_typing/inspection.py +1085 -0
- llama_stack/strong_typing/mapping.py +40 -0
- llama_stack/strong_typing/name.py +182 -0
- llama_stack/strong_typing/py.typed +0 -0
- llama_stack/strong_typing/schema.py +792 -0
- llama_stack/strong_typing/serialization.py +97 -0
- llama_stack/strong_typing/serializer.py +500 -0
- llama_stack/strong_typing/slots.py +27 -0
- llama_stack/strong_typing/topological.py +89 -0
- llama_stack/testing/__init__.py +5 -0
- llama_stack/testing/api_recorder.py +956 -0
- llama_stack/ui/node_modules/flatted/python/flatted.py +149 -0
- llama_stack-0.3.4.dist-info/METADATA +261 -0
- llama_stack-0.3.4.dist-info/RECORD +625 -0
- {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/WHEEL +1 -1
- llama_stack/apis/agents/client.py +0 -292
- llama_stack/apis/agents/event_logger.py +0 -184
- llama_stack/apis/batch_inference/batch_inference.py +0 -72
- llama_stack/apis/common/deployment_types.py +0 -31
- llama_stack/apis/dataset/dataset.py +0 -63
- llama_stack/apis/evals/evals.py +0 -122
- llama_stack/apis/inference/client.py +0 -197
- llama_stack/apis/inspect/client.py +0 -82
- llama_stack/apis/memory/client.py +0 -155
- llama_stack/apis/memory/memory.py +0 -65
- llama_stack/apis/memory_banks/__init__.py +0 -7
- llama_stack/apis/memory_banks/client.py +0 -101
- llama_stack/apis/memory_banks/memory_banks.py +0 -78
- llama_stack/apis/models/client.py +0 -83
- llama_stack/apis/reward_scoring/__init__.py +0 -7
- llama_stack/apis/reward_scoring/reward_scoring.py +0 -55
- llama_stack/apis/safety/client.py +0 -105
- llama_stack/apis/shields/client.py +0 -79
- llama_stack/cli/download.py +0 -340
- llama_stack/cli/model/describe.py +0 -82
- llama_stack/cli/model/download.py +0 -24
- llama_stack/cli/model/list.py +0 -62
- llama_stack/cli/model/model.py +0 -34
- llama_stack/cli/model/prompt_format.py +0 -112
- llama_stack/cli/model/safety_models.py +0 -52
- llama_stack/cli/stack/build.py +0 -299
- llama_stack/cli/stack/configure.py +0 -178
- llama_stack/distribution/build.py +0 -123
- llama_stack/distribution/build_conda_env.sh +0 -136
- llama_stack/distribution/build_container.sh +0 -142
- llama_stack/distribution/common.sh +0 -40
- llama_stack/distribution/configure_container.sh +0 -47
- llama_stack/distribution/datatypes.py +0 -139
- llama_stack/distribution/distribution.py +0 -58
- llama_stack/distribution/inspect.py +0 -67
- llama_stack/distribution/request_headers.py +0 -57
- llama_stack/distribution/resolver.py +0 -323
- llama_stack/distribution/routers/__init__.py +0 -48
- llama_stack/distribution/routers/routers.py +0 -158
- llama_stack/distribution/routers/routing_tables.py +0 -173
- llama_stack/distribution/server/endpoints.py +0 -48
- llama_stack/distribution/server/server.py +0 -343
- llama_stack/distribution/start_conda_env.sh +0 -42
- llama_stack/distribution/start_container.sh +0 -64
- llama_stack/distribution/templates/local-bedrock-conda-example-build.yaml +0 -10
- llama_stack/distribution/templates/local-build.yaml +0 -10
- llama_stack/distribution/templates/local-databricks-build.yaml +0 -10
- llama_stack/distribution/templates/local-fireworks-build.yaml +0 -10
- llama_stack/distribution/templates/local-hf-endpoint-build.yaml +0 -10
- llama_stack/distribution/templates/local-hf-serverless-build.yaml +0 -10
- llama_stack/distribution/templates/local-ollama-build.yaml +0 -10
- llama_stack/distribution/templates/local-tgi-build.yaml +0 -10
- llama_stack/distribution/templates/local-together-build.yaml +0 -10
- llama_stack/distribution/templates/local-vllm-build.yaml +0 -10
- llama_stack/distribution/utils/exec.py +0 -105
- llama_stack/providers/adapters/agents/sample/sample.py +0 -18
- llama_stack/providers/adapters/inference/bedrock/bedrock.py +0 -451
- llama_stack/providers/adapters/inference/bedrock/config.py +0 -55
- llama_stack/providers/adapters/inference/databricks/config.py +0 -21
- llama_stack/providers/adapters/inference/databricks/databricks.py +0 -125
- llama_stack/providers/adapters/inference/fireworks/config.py +0 -20
- llama_stack/providers/adapters/inference/fireworks/fireworks.py +0 -130
- llama_stack/providers/adapters/inference/ollama/__init__.py +0 -19
- llama_stack/providers/adapters/inference/ollama/ollama.py +0 -175
- llama_stack/providers/adapters/inference/sample/sample.py +0 -23
- llama_stack/providers/adapters/inference/tgi/config.py +0 -43
- llama_stack/providers/adapters/inference/tgi/tgi.py +0 -200
- llama_stack/providers/adapters/inference/together/config.py +0 -22
- llama_stack/providers/adapters/inference/together/together.py +0 -143
- llama_stack/providers/adapters/memory/chroma/chroma.py +0 -157
- llama_stack/providers/adapters/memory/pgvector/config.py +0 -17
- llama_stack/providers/adapters/memory/pgvector/pgvector.py +0 -211
- llama_stack/providers/adapters/memory/sample/sample.py +0 -23
- llama_stack/providers/adapters/memory/weaviate/__init__.py +0 -15
- llama_stack/providers/adapters/memory/weaviate/weaviate.py +0 -190
- llama_stack/providers/adapters/safety/bedrock/bedrock.py +0 -113
- llama_stack/providers/adapters/safety/bedrock/config.py +0 -16
- llama_stack/providers/adapters/safety/sample/sample.py +0 -23
- llama_stack/providers/adapters/safety/together/__init__.py +0 -18
- llama_stack/providers/adapters/safety/together/config.py +0 -26
- llama_stack/providers/adapters/safety/together/together.py +0 -101
- llama_stack/providers/adapters/telemetry/opentelemetry/config.py +0 -12
- llama_stack/providers/adapters/telemetry/opentelemetry/opentelemetry.py +0 -201
- llama_stack/providers/adapters/telemetry/sample/__init__.py +0 -17
- llama_stack/providers/adapters/telemetry/sample/config.py +0 -12
- llama_stack/providers/adapters/telemetry/sample/sample.py +0 -18
- llama_stack/providers/impls/meta_reference/agents/agent_instance.py +0 -844
- llama_stack/providers/impls/meta_reference/agents/agents.py +0 -161
- llama_stack/providers/impls/meta_reference/agents/persistence.py +0 -84
- llama_stack/providers/impls/meta_reference/agents/rag/context_retriever.py +0 -74
- llama_stack/providers/impls/meta_reference/agents/safety.py +0 -57
- llama_stack/providers/impls/meta_reference/agents/tests/code_execution.py +0 -93
- llama_stack/providers/impls/meta_reference/agents/tests/test_chat_agent.py +0 -305
- llama_stack/providers/impls/meta_reference/agents/tools/base.py +0 -20
- llama_stack/providers/impls/meta_reference/agents/tools/builtin.py +0 -375
- llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/code_env_prefix.py +0 -133
- llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/code_execution.py +0 -256
- llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/matplotlib_custom_backend.py +0 -87
- llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/utils.py +0 -21
- llama_stack/providers/impls/meta_reference/agents/tools/safety.py +0 -43
- llama_stack/providers/impls/meta_reference/codeshield/code_scanner.py +0 -58
- llama_stack/providers/impls/meta_reference/inference/config.py +0 -45
- llama_stack/providers/impls/meta_reference/inference/generation.py +0 -376
- llama_stack/providers/impls/meta_reference/inference/inference.py +0 -280
- llama_stack/providers/impls/meta_reference/inference/model_parallel.py +0 -99
- llama_stack/providers/impls/meta_reference/inference/quantization/fp8_impls.py +0 -184
- llama_stack/providers/impls/meta_reference/inference/quantization/fp8_txest_disabled.py +0 -76
- llama_stack/providers/impls/meta_reference/inference/quantization/loader.py +0 -97
- llama_stack/providers/impls/meta_reference/inference/quantization/scripts/quantize_checkpoint.py +0 -161
- llama_stack/providers/impls/meta_reference/memory/__init__.py +0 -19
- llama_stack/providers/impls/meta_reference/memory/faiss.py +0 -113
- llama_stack/providers/impls/meta_reference/safety/__init__.py +0 -17
- llama_stack/providers/impls/meta_reference/safety/base.py +0 -57
- llama_stack/providers/impls/meta_reference/safety/config.py +0 -48
- llama_stack/providers/impls/meta_reference/safety/llama_guard.py +0 -268
- llama_stack/providers/impls/meta_reference/safety/prompt_guard.py +0 -145
- llama_stack/providers/impls/meta_reference/safety/safety.py +0 -112
- llama_stack/providers/impls/meta_reference/telemetry/console.py +0 -89
- llama_stack/providers/impls/vllm/config.py +0 -35
- llama_stack/providers/impls/vllm/vllm.py +0 -241
- llama_stack/providers/registry/memory.py +0 -78
- llama_stack/providers/registry/telemetry.py +0 -44
- llama_stack/providers/tests/agents/test_agents.py +0 -210
- llama_stack/providers/tests/inference/test_inference.py +0 -257
- llama_stack/providers/tests/inference/test_prompt_adapter.py +0 -126
- llama_stack/providers/tests/memory/test_memory.py +0 -136
- llama_stack/providers/tests/resolver.py +0 -100
- llama_stack/providers/tests/safety/test_safety.py +0 -77
- llama_stack-0.0.42.dist-info/METADATA +0 -137
- llama_stack-0.0.42.dist-info/RECORD +0 -256
- /llama_stack/{distribution → core}/__init__.py +0 -0
- /llama_stack/{distribution/server → core/access_control}/__init__.py +0 -0
- /llama_stack/{distribution/utils → core/conversations}/__init__.py +0 -0
- /llama_stack/{providers/adapters → core/prompts}/__init__.py +0 -0
- /llama_stack/{providers/adapters/agents → core/routing_tables}/__init__.py +0 -0
- /llama_stack/{providers/adapters/inference → core/server}/__init__.py +0 -0
- /llama_stack/{providers/adapters/memory → core/storage}/__init__.py +0 -0
- /llama_stack/{providers/adapters/safety → core/ui}/__init__.py +0 -0
- /llama_stack/{providers/adapters/telemetry → core/ui/modules}/__init__.py +0 -0
- /llama_stack/{providers/impls → core/ui/page}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference → core/ui/page/distribution}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference/agents/rag → core/ui/page/evaluations}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference/agents/tests → core/ui/page/playground}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference/agents/tools → core/utils}/__init__.py +0 -0
- /llama_stack/{distribution → core}/utils/dynamic.py +0 -0
- /llama_stack/{distribution → core}/utils/serialize.py +0 -0
- /llama_stack/{providers/impls/meta_reference/agents/tools/ipython_tool → distributions}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference/inference/quantization → models}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference/inference/quantization/scripts → models/llama}/__init__.py +0 -0
- /llama_stack/{providers/tests → models/llama/llama3}/__init__.py +0 -0
- /llama_stack/{providers/tests/agents → models/llama/llama3/quantization}/__init__.py +0 -0
- /llama_stack/{providers/tests/inference → models/llama/llama3_2}/__init__.py +0 -0
- /llama_stack/{providers/tests/memory → models/llama/llama3_3}/__init__.py +0 -0
- /llama_stack/{providers/tests/safety → models/llama/llama4}/__init__.py +0 -0
- /llama_stack/{scripts → models/llama/llama4/prompt_templates}/__init__.py +0 -0
- /llama_stack/providers/{adapters → remote}/safety/bedrock/__init__.py +0 -0
- {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/entry_points.txt +0 -0
- {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info/licenses}/LICENSE +0 -0
- {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/top_level.txt +0 -0
llama_stack/providers/impls/meta_reference/inference/generation.py
@@ -1,376 +0,0 @@
|
|
|
1
|
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
-
# All rights reserved.
|
|
3
|
-
#
|
|
4
|
-
# This source code is licensed under the terms described in the LICENSE file in
|
|
5
|
-
# the root directory of this source tree.
|
|
6
|
-
|
|
7
|
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
8
|
-
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
|
|
9
|
-
|
|
10
|
-
import json
|
|
11
|
-
import os
|
|
12
|
-
import sys
|
|
13
|
-
import time
|
|
14
|
-
from pathlib import Path
|
|
15
|
-
from typing import Generator, List, Optional, Union
|
|
16
|
-
|
|
17
|
-
import torch
|
|
18
|
-
import torch.nn.functional as F
|
|
19
|
-
from fairscale.nn.model_parallel.initialize import (
|
|
20
|
-
get_model_parallel_rank,
|
|
21
|
-
initialize_model_parallel,
|
|
22
|
-
model_parallel_is_initialized,
|
|
23
|
-
)
|
|
24
|
-
from llama_models.llama3.api.args import ModelArgs
|
|
25
|
-
from llama_models.llama3.api.chat_format import ChatFormat, ModelInput
|
|
26
|
-
from llama_models.llama3.api.datatypes import (
|
|
27
|
-
InterleavedTextMedia,
|
|
28
|
-
Message,
|
|
29
|
-
ToolPromptFormat,
|
|
30
|
-
)
|
|
31
|
-
from llama_models.llama3.api.tokenizer import Tokenizer
|
|
32
|
-
from llama_models.llama3.reference_impl.model import Transformer
|
|
33
|
-
from llama_models.llama3.reference_impl.multimodal.model import (
|
|
34
|
-
CrossAttentionTransformer,
|
|
35
|
-
)
|
|
36
|
-
from llama_models.sku_list import resolve_model
|
|
37
|
-
|
|
38
|
-
from pydantic import BaseModel
|
|
39
|
-
from termcolor import cprint
|
|
40
|
-
|
|
41
|
-
from llama_stack.distribution.utils.model_utils import model_local_dir
|
|
42
|
-
|
|
43
|
-
from .config import MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
def model_checkpoint_dir(model) -> str:
|
|
47
|
-
checkpoint_dir = Path(model_local_dir(model.descriptor()))
|
|
48
|
-
|
|
49
|
-
paths = [Path(checkpoint_dir / f"consolidated.{ext}") for ext in ["pth", "00.pth"]]
|
|
50
|
-
if not any(p.exists() for p in paths):
|
|
51
|
-
checkpoint_dir = checkpoint_dir / "original"
|
|
52
|
-
|
|
53
|
-
assert checkpoint_dir.exists(), (
|
|
54
|
-
f"Could not find checkpoints in: {model_local_dir(model.descriptor())}. "
|
|
55
|
-
f"Please download model using `llama download --model-id {model.descriptor()}`"
|
|
56
|
-
)
|
|
57
|
-
return str(checkpoint_dir)
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
class TokenResult(BaseModel):
|
|
61
|
-
token: int
|
|
62
|
-
text: str
|
|
63
|
-
logprobs: Optional[List[float]] = None
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
class Llama:
|
|
67
|
-
@staticmethod
|
|
68
|
-
def build(
|
|
69
|
-
config: Union[
|
|
70
|
-
MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig
|
|
71
|
-
]
|
|
72
|
-
):
|
|
73
|
-
"""
|
|
74
|
-
Build a Llama instance by initializing and loading a model checkpoint.
|
|
75
|
-
|
|
76
|
-
Note:
|
|
77
|
-
This method initializes the distributed process group, sets the device to CUDA,
|
|
78
|
-
and loads the pre-trained model and tokenizer.
|
|
79
|
-
"""
|
|
80
|
-
model = resolve_model(config.model)
|
|
81
|
-
|
|
82
|
-
if not torch.distributed.is_initialized():
|
|
83
|
-
torch.distributed.init_process_group("nccl")
|
|
84
|
-
|
|
85
|
-
model_parallel_size = config.model_parallel_size
|
|
86
|
-
|
|
87
|
-
if not model_parallel_is_initialized():
|
|
88
|
-
initialize_model_parallel(model_parallel_size)
|
|
89
|
-
|
|
90
|
-
local_rank = int(os.environ.get("LOCAL_RANK", 0))
|
|
91
|
-
torch.cuda.set_device(local_rank)
|
|
92
|
-
|
|
93
|
-
# seed must be the same in all processes
|
|
94
|
-
if config.torch_seed is not None:
|
|
95
|
-
torch.manual_seed(config.torch_seed)
|
|
96
|
-
|
|
97
|
-
if local_rank > 0:
|
|
98
|
-
sys.stdout = open(os.devnull, "w")
|
|
99
|
-
|
|
100
|
-
start_time = time.time()
|
|
101
|
-
ckpt_dir = model_checkpoint_dir(model)
|
|
102
|
-
|
|
103
|
-
checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
|
|
104
|
-
assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}"
|
|
105
|
-
assert model_parallel_size == len(
|
|
106
|
-
checkpoints
|
|
107
|
-
), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {model_parallel_size}"
|
|
108
|
-
ckpt_path = checkpoints[get_model_parallel_rank()]
|
|
109
|
-
state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=True)
|
|
110
|
-
with open(Path(ckpt_dir) / "params.json", "r") as f:
|
|
111
|
-
params = json.loads(f.read())
|
|
112
|
-
|
|
113
|
-
if "model" in params:
|
|
114
|
-
params = params["model"]
|
|
115
|
-
|
|
116
|
-
model_args: ModelArgs = ModelArgs(
|
|
117
|
-
max_seq_len=config.max_seq_len,
|
|
118
|
-
max_batch_size=config.max_batch_size,
|
|
119
|
-
**params,
|
|
120
|
-
)
|
|
121
|
-
|
|
122
|
-
tokenizer_path = os.path.join(ckpt_dir, "tokenizer.model")
|
|
123
|
-
tokenizer = Tokenizer(model_path=tokenizer_path)
|
|
124
|
-
|
|
125
|
-
assert (
|
|
126
|
-
model_args.vocab_size == tokenizer.n_words
|
|
127
|
-
), f"model_args vocab = {model_args.vocab_size} but tokenizer vocab = {tokenizer.n_words}"
|
|
128
|
-
|
|
129
|
-
if isinstance(config, MetaReferenceQuantizedInferenceConfig):
|
|
130
|
-
from .quantization.loader import convert_to_quantized_model
|
|
131
|
-
|
|
132
|
-
# load on CPU in bf16 so that fp8 conversion does not find an
|
|
133
|
-
# unexpected (fp32, e.g.) datatype
|
|
134
|
-
torch.set_default_tensor_type(torch.BFloat16Tensor)
|
|
135
|
-
if model_args.vision_chunk_size > 0:
|
|
136
|
-
model = CrossAttentionTransformer(model_args)
|
|
137
|
-
model.setup_cache(model_args.max_batch_size, torch.bfloat16)
|
|
138
|
-
else:
|
|
139
|
-
model = Transformer(model_args)
|
|
140
|
-
model.load_state_dict(state_dict, strict=False)
|
|
141
|
-
model = convert_to_quantized_model(model, config)
|
|
142
|
-
else:
|
|
143
|
-
if torch.cuda.is_bf16_supported():
|
|
144
|
-
torch.set_default_tensor_type(torch.cuda.BFloat16Tensor)
|
|
145
|
-
else:
|
|
146
|
-
torch.set_default_tensor_type(torch.cuda.HalfTensor)
|
|
147
|
-
if model_args.vision_chunk_size > 0:
|
|
148
|
-
model = CrossAttentionTransformer(model_args)
|
|
149
|
-
model.setup_cache(model_args.max_batch_size, torch.bfloat16)
|
|
150
|
-
else:
|
|
151
|
-
model = Transformer(model_args)
|
|
152
|
-
model.load_state_dict(state_dict, strict=False)
|
|
153
|
-
|
|
154
|
-
print(f"Loaded in {time.time() - start_time:.2f} seconds")
|
|
155
|
-
return Llama(model, tokenizer, model_args)
|
|
156
|
-
|
|
157
|
-
-    def __init__(self, model: Transformer, tokenizer: Tokenizer, args: ModelArgs):
-        self.args = args
-        self.model = model
-        self.tokenizer = tokenizer
-        self.formatter = ChatFormat(tokenizer)
-
-    @torch.inference_mode()
-    def generate(
-        self,
-        model_input: ModelInput,
-        max_gen_len: int,
-        temperature: float = 0.6,
-        top_p: float = 0.9,
-        logprobs: bool = False,
-        echo: bool = False,
-        include_stop_token: bool = False,
-    ) -> Generator:
-        params = self.model.params
-
-        # input_tokens = [
-        #     self.formatter.vision_token if t == 128256 else t
-        #     for t in model_input.tokens
-        # ]
-        # cprint("Input to model -> " + self.tokenizer.decode(input_tokens), "red")
-        prompt_tokens = [model_input.tokens]
-
-        bsz = 1
-        assert bsz <= params.max_batch_size, (bsz, params.max_batch_size)
-
-        min_prompt_len = min(len(t) for t in prompt_tokens)
-        max_prompt_len = max(len(t) for t in prompt_tokens)
-
-        if max_prompt_len >= params.max_seq_len:
-            cprint(
-                f"Out of token budget {max_prompt_len} vs {params.max_seq_len}", "red"
-            )
-            return
-
-        total_len = min(max_gen_len + max_prompt_len, params.max_seq_len)
-
-        is_vision = isinstance(self.model, CrossAttentionTransformer)
-        if is_vision:
-            images = model_input.vision.images if model_input.vision is not None else []
-            mask = model_input.vision.mask if model_input.vision is not None else []
-
-            # the method works for bsz > 1 so add a batch dimension
-            xattn_caches, cross_attention_masks, full_text_row_masked_out_mask = (
-                self.model.compute_vision_tokens_masks(
-                    batch_images=[images],
-                    batch_masks=[mask],
-                    total_len=total_len,
-                )
-            )
-
-        pad_id = self.tokenizer.pad_id
-        tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device="cuda")
-        for k, t in enumerate(prompt_tokens):
-            tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device="cuda")
-        if logprobs:
-            token_logprobs = torch.zeros_like(tokens, dtype=torch.float)
-
-        prev_pos = 0
-        eos_reached = torch.tensor([False] * bsz, device="cuda")
-        input_text_mask = tokens != pad_id
-        if min_prompt_len == total_len:
-            # TODO(ashwin): unify this branch with the one below and figure out multimodal crap
-            logits = self.model.forward(tokens, prev_pos)
-            token_logprobs = -F.cross_entropy(
-                input=logits.transpose(1, 2),
-                target=tokens,
-                reduction="none",
-                ignore_index=pad_id,
-            )
-
-        stop_tokens = torch.tensor(self.tokenizer.stop_tokens)
-
-        for cur_pos in range(min_prompt_len, total_len):
-            if is_vision:
-                position_ids = torch.arange(
-                    prev_pos, cur_pos, dtype=torch.long, device="cuda"
-                )
-                logits = self.model.forward(
-                    position_ids,
-                    tokens,
-                    cross_attention_masks,
-                    full_text_row_masked_out_mask,
-                    xattn_caches,
-                )
-            else:
-                logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
-
-            if temperature > 0:
-                probs = torch.softmax(logits[:, -1] / temperature, dim=-1)
-                next_token = sample_top_p(probs, top_p)
-            else:
-                next_token = torch.argmax(logits[:, -1], dim=-1)
-
-            next_token = next_token.reshape(-1)
-            # only replace token if prompt has already been generated
-            next_token = torch.where(
-                input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token
-            )
-            tokens[:, cur_pos] = next_token
-
-            target = tokens[:, prev_pos + 1 : cur_pos + 1]
-            if is_vision:
-                # the logits space (num_classes) is designed to never contain a media_token
-                # however our input token stream does contain them. we need to nuke them here
-                # or else the CUDA kernels will crash with an illegal memory access
-                vision_tokens = [self.tokenizer.special_tokens["<|image|>"], 128256]
-                masks = [target.eq(t) for t in vision_tokens]
-                if len(masks) > 1:
-                    mask = torch.logical_or(*masks)
-                else:
-                    mask = masks[0]
-                target[mask] = 0
-
-            if logprobs:
-                token_logprobs[:, prev_pos + 1 : cur_pos + 1] = -F.cross_entropy(
-                    input=logits.transpose(1, 2),
-                    target=tokens[:, prev_pos + 1 : cur_pos + 1],
-                    reduction="none",
-                    ignore_index=pad_id,
-                )
-            eos_reached |= (~input_text_mask[:, cur_pos]) & (
-                torch.isin(next_token, stop_tokens)
-            )
-            yield TokenResult(
-                token=next_token[0].item(),
-                text=self.tokenizer.decode(next_token.tolist()),
-                logprobs=(
-                    token_logprobs[:, cur_pos : cur_pos + 1][0].tolist()
-                    if logprobs
-                    else None
-                ),
-            )
-
-            prev_pos = cur_pos
-            if all(eos_reached):
-                break
-
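For context, `generate` above is a synchronous generator yielding one `TokenResult` per decoded token. A minimal sketch of draining it; the `llama` instance and `model_input` are assumed to be built elsewhere (e.g. via `Llama.build` and `formatter.encode_content`):

```python
def collect_text(llama, model_input, max_gen_len: int = 128) -> str:
    # Sketch only: accumulate streamed token text into one string.
    pieces = []
    for result in llama.generate(
        model_input=model_input,
        max_gen_len=max_gen_len,
        temperature=0.6,
        top_p=0.9,
    ):
        pieces.append(result.text)
    return "".join(pieces)
```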
-    def text_completion(
-        self,
-        content: InterleavedTextMedia,
-        temperature: float = 0.6,
-        top_p: float = 0.9,
-        max_gen_len: Optional[int] = None,
-        logprobs: bool = False,
-        echo: bool = False,
-    ) -> Generator:
-        if (
-            max_gen_len is None
-            or max_gen_len == 0
-            or max_gen_len >= self.model.params.max_seq_len
-        ):
-            max_gen_len = self.model.params.max_seq_len - 1
-
-        model_input = self.formatter.encode_content(content)
-
-        yield from self.generate(
-            model_input=model_input,
-            max_gen_len=max_gen_len,
-            temperature=temperature,
-            top_p=top_p,
-            logprobs=logprobs,
-            echo=echo,
-        )
-
-    def chat_completion(
-        self,
-        messages: List[Message],
-        temperature: float = 0.6,
-        top_p: float = 0.9,
-        max_gen_len: Optional[int] = None,
-        logprobs: bool = False,
-        tool_prompt_format: ToolPromptFormat = ToolPromptFormat.json,
-    ) -> Generator:
-        if (
-            max_gen_len is None
-            or max_gen_len == 0
-            or max_gen_len >= self.model.params.max_seq_len
-        ):
-            max_gen_len = self.model.params.max_seq_len - 1
-
-        yield from self.generate(
-            model_input=self.formatter.encode_dialog_prompt(
-                messages,
-                tool_prompt_format,
-            ),
-            max_gen_len=max_gen_len,
-            temperature=temperature,
-            top_p=top_p,
-            logprobs=logprobs,
-            include_stop_token=True,
-        )
-
-
-def sample_top_p(probs, p):
-    """
-    Perform top-p (nucleus) sampling on a probability distribution.
-
-    Args:
-        probs (torch.Tensor): Probability distribution tensor.
-        p (float): Probability threshold for top-p sampling.
-
-    Returns:
-        torch.Tensor: Sampled token indices.
-
-    Note:
-        Top-p sampling selects the smallest set of tokens whose cumulative probability mass
-        exceeds the threshold p. The distribution is renormalized based on the selected tokens.
-    """
-    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
-    probs_sum = torch.cumsum(probs_sort, dim=-1)
-    mask = probs_sum - probs_sort > p
-    probs_sort[mask] = 0.0
-    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
-    next_token = torch.multinomial(probs_sort, num_samples=1)
-    next_token = torch.gather(probs_idx, -1, next_token)
-    return next_token
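A self-contained worked example of the nucleus-sampling rule in `sample_top_p`, run on a toy five-token distribution; with `p=0.6` only the two most likely tokens survive and are renormalized:

```python
import torch

def sample_top_p(probs: torch.Tensor, p: float) -> torch.Tensor:
    # Same logic as the removed helper above, reproduced standalone.
    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    # Zero out tokens whose preceding cumulative mass already exceeds p.
    mask = probs_sum - probs_sort > p
    probs_sort[mask] = 0.0
    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
    next_token = torch.multinomial(probs_sort, num_samples=1)
    return torch.gather(probs_idx, -1, next_token)

torch.manual_seed(0)
probs = torch.tensor([[0.40, 0.30, 0.15, 0.10, 0.05]])
# With p=0.6 the survivors are the top two tokens (0.40 and 0.30),
# renormalized to 4/7 and 3/7; indices 2-4 are never drawn.
samples = [sample_top_p(probs, p=0.6).item() for _ in range(10)]
print(samples)  # only indices 0 and 1 appear
```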
@@ -1,280 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import asyncio
-
-from typing import AsyncGenerator, List
-
-from llama_models.sku_list import resolve_model
-
-from llama_models.llama3.api.datatypes import *  # noqa: F403
-from llama_stack.apis.inference import *  # noqa: F403
-from llama_stack.providers.datatypes import ModelDef, ModelsProtocolPrivate
-from llama_stack.providers.utils.inference.prompt_adapter import (
-    chat_completion_request_to_messages,
-)
-
-from .config import MetaReferenceInferenceConfig
-from .model_parallel import LlamaModelParallelGenerator
-
-# there's a single model parallel process running serving the model. for now,
-# we don't support multiple concurrent requests to this process.
-SEMAPHORE = asyncio.Semaphore(1)
-
-
-class MetaReferenceInferenceImpl(Inference, ModelsProtocolPrivate):
-    def __init__(self, config: MetaReferenceInferenceConfig) -> None:
-        self.config = config
-        model = resolve_model(config.model)
-        if model is None:
-            raise RuntimeError(f"Unknown model: {config.model}, Run `llama model list`")
-        self.model = model
-        # verify that the checkpoint actually is for this model lol
-
-    async def initialize(self) -> None:
-        print(f"Loading model `{self.model.descriptor()}`")
-        self.generator = LlamaModelParallelGenerator(self.config)
-        self.generator.start()
-
-    async def register_model(self, model: ModelDef) -> None:
-        raise ValueError("Dynamic model registration is not supported")
-
-    async def list_models(self) -> List[ModelDef]:
-        return [
-            ModelDef(
-                identifier=self.model.descriptor(),
-                llama_model=self.model.descriptor(),
-            )
-        ]
-
-    async def shutdown(self) -> None:
-        self.generator.stop()
-
-    def completion(
-        self,
-        model: str,
-        content: InterleavedTextMedia,
-        sampling_params: Optional[SamplingParams] = SamplingParams(),
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-    ) -> Union[CompletionResponse, CompletionResponseStreamChunk]:
-        raise NotImplementedError()
-
-    def chat_completion(
-        self,
-        model: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = SamplingParams(),
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-    ) -> AsyncGenerator:
-        if logprobs:
-            assert logprobs.top_k == 1, f"Unexpected top_k={logprobs.top_k}"
-
-        # wrapper request to make it easier to pass around (internal only, not exposed to API)
-        request = ChatCompletionRequest(
-            model=model,
-            messages=messages,
-            sampling_params=sampling_params,
-            tools=tools or [],
-            tool_choice=tool_choice,
-            tool_prompt_format=tool_prompt_format,
-            stream=stream,
-            logprobs=logprobs,
-        )
-
-        model = resolve_model(request.model)
-        if model is None:
-            raise RuntimeError(
-                f"Unknown model: {request.model}, Run `llama model list`"
-            )
-        elif model.descriptor() != self.model.descriptor():
-            raise RuntimeError(
-                f"Model mismatch: {request.model} != {self.model.descriptor()}"
-            )
-
-        if SEMAPHORE.locked():
-            raise RuntimeError("Only one concurrent request is supported")
-
-        if request.stream:
-            return self._stream_chat_completion(request)
-        else:
-            return self._nonstream_chat_completion(request)
-
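Note the dispatch shape above: `chat_completion` is a plain (non-async) method that validates the request, then returns either a coroutine or an async generator without entering it, so one entry point serves both streaming and non-streaming callers. A toy illustration of the same pattern (not package code):

```python
import asyncio
from typing import AsyncGenerator

class Service:
    def run(self, stream: bool):
        # Hand back an awaitable or an async generator un-entered;
        # the caller picks the consumption protocol.
        if stream:
            return self._stream()
        return self._once()

    async def _once(self) -> str:
        return "full response"

    async def _stream(self) -> AsyncGenerator[str, None]:
        for chunk in ("par", "tial"):
            yield chunk

async def main() -> None:
    svc = Service()
    print(await svc.run(stream=False))   # full response
    async for piece in svc.run(stream=True):
        print(piece)                     # par / tial

asyncio.run(main())
```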
-    async def _nonstream_chat_completion(
-        self, request: ChatCompletionRequest
-    ) -> ChatCompletionResponse:
-        async with SEMAPHORE:
-            messages = chat_completion_request_to_messages(request)
-
-            tokens = []
-            logprobs = []
-            stop_reason = None
-
-            for token_result in self.generator.chat_completion(
-                messages=messages,
-                temperature=request.sampling_params.temperature,
-                top_p=request.sampling_params.top_p,
-                max_gen_len=request.sampling_params.max_tokens,
-                logprobs=request.logprobs,
-                tool_prompt_format=request.tool_prompt_format,
-            ):
-                tokens.append(token_result.token)
-
-                if token_result.text == "<|eot_id|>":
-                    stop_reason = StopReason.end_of_turn
-                elif token_result.text == "<|eom_id|>":
-                    stop_reason = StopReason.end_of_message
-
-                if request.logprobs:
-                    assert len(token_result.logprobs) == 1
-
-                    logprobs.append(
-                        TokenLogProbs(
-                            logprobs_by_token={
-                                token_result.text: token_result.logprobs[0]
-                            }
-                        )
-                    )
-
-            if stop_reason is None:
-                stop_reason = StopReason.out_of_tokens
-
-            message = self.generator.formatter.decode_assistant_message(
-                tokens, stop_reason
-            )
-            return ChatCompletionResponse(
-                completion_message=message,
-                logprobs=logprobs if request.logprobs else None,
-            )
-
-    async def _stream_chat_completion(
-        self, request: ChatCompletionRequest
-    ) -> AsyncGenerator:
-        async with SEMAPHORE:
-            messages = chat_completion_request_to_messages(request)
-
-            yield ChatCompletionResponseStreamChunk(
-                event=ChatCompletionResponseEvent(
-                    event_type=ChatCompletionResponseEventType.start,
-                    delta="",
-                )
-            )
-
-            tokens = []
-            logprobs = []
-            stop_reason = None
-            ipython = False
-
-            for token_result in self.generator.chat_completion(
-                messages=messages,
-                temperature=request.sampling_params.temperature,
-                top_p=request.sampling_params.top_p,
-                max_gen_len=request.sampling_params.max_tokens,
-                logprobs=request.logprobs,
-                tool_prompt_format=request.tool_prompt_format,
-            ):
-                tokens.append(token_result.token)
-
-                if not ipython and token_result.text.startswith("<|python_tag|>"):
-                    ipython = True
-                    yield ChatCompletionResponseStreamChunk(
-                        event=ChatCompletionResponseEvent(
-                            event_type=ChatCompletionResponseEventType.progress,
-                            delta=ToolCallDelta(
-                                content="",
-                                parse_status=ToolCallParseStatus.started,
-                            ),
-                        )
-                    )
-                    continue
-
-                if token_result.text == "<|eot_id|>":
-                    stop_reason = StopReason.end_of_turn
-                    text = ""
-                elif token_result.text == "<|eom_id|>":
-                    stop_reason = StopReason.end_of_message
-                    text = ""
-                else:
-                    text = token_result.text
-
-                if ipython:
-                    delta = ToolCallDelta(
-                        content=text,
-                        parse_status=ToolCallParseStatus.in_progress,
-                    )
-                else:
-                    delta = text
-
-                if stop_reason is None:
-                    if request.logprobs:
-                        assert len(token_result.logprobs) == 1
-
-                        logprobs.append(
-                            TokenLogProbs(
-                                logprobs_by_token={
-                                    token_result.text: token_result.logprobs[0]
-                                }
-                            )
-                        )
-                    yield ChatCompletionResponseStreamChunk(
-                        event=ChatCompletionResponseEvent(
-                            event_type=ChatCompletionResponseEventType.progress,
-                            delta=delta,
-                            stop_reason=stop_reason,
-                            logprobs=logprobs if request.logprobs else None,
-                        )
-                    )
-
-            if stop_reason is None:
-                stop_reason = StopReason.out_of_tokens
-
-            message = self.generator.formatter.decode_assistant_message(
-                tokens, stop_reason
-            )
-
-            parsed_tool_calls = len(message.tool_calls) > 0
-            if ipython and not parsed_tool_calls:
-                yield ChatCompletionResponseStreamChunk(
-                    event=ChatCompletionResponseEvent(
-                        event_type=ChatCompletionResponseEventType.progress,
-                        delta=ToolCallDelta(
-                            content="",
-                            parse_status=ToolCallParseStatus.failure,
-                        ),
-                        stop_reason=stop_reason,
-                    )
-                )
-
-            for tool_call in message.tool_calls:
-                yield ChatCompletionResponseStreamChunk(
-                    event=ChatCompletionResponseEvent(
-                        event_type=ChatCompletionResponseEventType.progress,
-                        delta=ToolCallDelta(
-                            content=tool_call,
-                            parse_status=ToolCallParseStatus.success,
-                        ),
-                        stop_reason=stop_reason,
-                    )
-                )
-
-            yield ChatCompletionResponseStreamChunk(
-                event=ChatCompletionResponseEvent(
-                    event_type=ChatCompletionResponseEventType.complete,
-                    delta="",
-                    stop_reason=stop_reason,
-                )
-            )
-
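A client-side sketch (hypothetical helper, not part of the package) of draining the chunk stream produced above: string deltas accumulate as text, while successfully parsed `ToolCallDelta` payloads are collected separately:

```python
# Hypothetical consumer; `chunks` is the AsyncGenerator returned by
# _stream_chat_completion, and ToolCallParseStatus is assumed imported
# as in the module above.
async def drain(chunks):
    text, tool_calls = "", []
    async for chunk in chunks:
        delta = chunk.event.delta
        if isinstance(delta, str):
            text += delta
        elif delta.parse_status == ToolCallParseStatus.success:
            tool_calls.append(delta.content)
    return text, tool_calls
```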
-    async def embeddings(
-        self,
-        model: str,
-        contents: List[InterleavedTextMedia],
-    ) -> EmbeddingsResponse:
-        raise NotImplementedError()
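The module-level `SEMAPHORE` in the removed implementation enforces single-flight inference: a second request is rejected up front when the semaphore is already held, rather than queueing behind the first. A runnable toy illustration of that guard (not package code):

```python
import asyncio

SEMAPHORE = asyncio.Semaphore(1)

async def handle(request_id: int) -> str:
    # Reject instead of waiting, mirroring the removed chat_completion check.
    if SEMAPHORE.locked():
        raise RuntimeError("Only one concurrent request is supported")
    async with SEMAPHORE:
        await asyncio.sleep(0.1)  # stand-in for model inference
        return f"done {request_id}"

async def main() -> None:
    results = await asyncio.gather(handle(1), handle(2), return_exceptions=True)
    print(results)  # one completed result and one RuntimeError

asyncio.run(main())
```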