llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_stack/__init__.py +5 -0
- llama_stack/apis/agents/__init__.py +1 -1
- llama_stack/apis/agents/agents.py +700 -281
- llama_stack/apis/agents/openai_responses.py +1311 -0
- llama_stack/{providers/adapters/memory/sample/config.py → apis/batches/__init__.py} +2 -5
- llama_stack/apis/batches/batches.py +100 -0
- llama_stack/apis/benchmarks/__init__.py +7 -0
- llama_stack/apis/benchmarks/benchmarks.py +108 -0
- llama_stack/apis/common/content_types.py +143 -0
- llama_stack/apis/common/errors.py +103 -0
- llama_stack/apis/common/job_types.py +38 -0
- llama_stack/apis/common/responses.py +36 -0
- llama_stack/apis/common/training_types.py +36 -5
- llama_stack/apis/common/type_system.py +158 -0
- llama_stack/apis/conversations/__init__.py +31 -0
- llama_stack/apis/conversations/conversations.py +286 -0
- llama_stack/apis/datasetio/__init__.py +7 -0
- llama_stack/apis/datasetio/datasetio.py +59 -0
- llama_stack/apis/datasets/__init__.py +7 -0
- llama_stack/apis/datasets/datasets.py +251 -0
- llama_stack/apis/datatypes.py +160 -0
- llama_stack/apis/eval/__init__.py +7 -0
- llama_stack/apis/eval/eval.py +169 -0
- llama_stack/apis/files/__init__.py +7 -0
- llama_stack/apis/files/files.py +199 -0
- llama_stack/apis/inference/__init__.py +1 -1
- llama_stack/apis/inference/inference.py +1169 -113
- llama_stack/apis/inspect/__init__.py +1 -1
- llama_stack/apis/inspect/inspect.py +69 -16
- llama_stack/apis/models/__init__.py +1 -1
- llama_stack/apis/models/models.py +148 -21
- llama_stack/apis/post_training/__init__.py +1 -1
- llama_stack/apis/post_training/post_training.py +265 -120
- llama_stack/{providers/adapters/agents/sample/config.py → apis/prompts/__init__.py} +2 -5
- llama_stack/apis/prompts/prompts.py +204 -0
- llama_stack/apis/providers/__init__.py +7 -0
- llama_stack/apis/providers/providers.py +69 -0
- llama_stack/apis/resource.py +37 -0
- llama_stack/apis/safety/__init__.py +1 -1
- llama_stack/apis/safety/safety.py +95 -12
- llama_stack/apis/scoring/__init__.py +7 -0
- llama_stack/apis/scoring/scoring.py +93 -0
- llama_stack/apis/scoring_functions/__init__.py +7 -0
- llama_stack/apis/scoring_functions/scoring_functions.py +208 -0
- llama_stack/apis/shields/__init__.py +1 -1
- llama_stack/apis/shields/shields.py +76 -33
- llama_stack/apis/synthetic_data_generation/__init__.py +1 -1
- llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py +40 -17
- llama_stack/apis/telemetry/__init__.py +1 -1
- llama_stack/apis/telemetry/telemetry.py +322 -31
- llama_stack/apis/{dataset → tools}/__init__.py +2 -1
- llama_stack/apis/tools/rag_tool.py +218 -0
- llama_stack/apis/tools/tools.py +221 -0
- llama_stack/apis/vector_io/__init__.py +7 -0
- llama_stack/apis/vector_io/vector_io.py +960 -0
- llama_stack/apis/vector_stores/__init__.py +7 -0
- llama_stack/apis/vector_stores/vector_stores.py +51 -0
- llama_stack/apis/version.py +9 -0
- llama_stack/cli/llama.py +13 -5
- llama_stack/cli/stack/_list_deps.py +182 -0
- llama_stack/cli/stack/list_apis.py +1 -1
- llama_stack/cli/stack/list_deps.py +55 -0
- llama_stack/cli/stack/list_providers.py +24 -10
- llama_stack/cli/stack/list_stacks.py +56 -0
- llama_stack/cli/stack/remove.py +115 -0
- llama_stack/cli/stack/run.py +169 -56
- llama_stack/cli/stack/stack.py +18 -4
- llama_stack/cli/stack/utils.py +151 -0
- llama_stack/cli/table.py +23 -61
- llama_stack/cli/utils.py +29 -0
- llama_stack/core/access_control/access_control.py +131 -0
- llama_stack/core/access_control/conditions.py +129 -0
- llama_stack/core/access_control/datatypes.py +107 -0
- llama_stack/core/build.py +164 -0
- llama_stack/core/client.py +205 -0
- llama_stack/core/common.sh +37 -0
- llama_stack/{distribution → core}/configure.py +74 -55
- llama_stack/core/conversations/conversations.py +309 -0
- llama_stack/core/datatypes.py +625 -0
- llama_stack/core/distribution.py +276 -0
- llama_stack/core/external.py +54 -0
- llama_stack/core/id_generation.py +42 -0
- llama_stack/core/inspect.py +86 -0
- llama_stack/core/library_client.py +539 -0
- llama_stack/core/prompts/prompts.py +234 -0
- llama_stack/core/providers.py +137 -0
- llama_stack/core/request_headers.py +115 -0
- llama_stack/core/resolver.py +506 -0
- llama_stack/core/routers/__init__.py +101 -0
- llama_stack/core/routers/datasets.py +73 -0
- llama_stack/core/routers/eval_scoring.py +155 -0
- llama_stack/core/routers/inference.py +645 -0
- llama_stack/core/routers/safety.py +85 -0
- llama_stack/core/routers/tool_runtime.py +91 -0
- llama_stack/core/routers/vector_io.py +442 -0
- llama_stack/core/routing_tables/benchmarks.py +62 -0
- llama_stack/core/routing_tables/common.py +254 -0
- llama_stack/core/routing_tables/datasets.py +91 -0
- llama_stack/core/routing_tables/models.py +163 -0
- llama_stack/core/routing_tables/scoring_functions.py +66 -0
- llama_stack/core/routing_tables/shields.py +61 -0
- llama_stack/core/routing_tables/toolgroups.py +129 -0
- llama_stack/core/routing_tables/vector_stores.py +292 -0
- llama_stack/core/server/auth.py +187 -0
- llama_stack/core/server/auth_providers.py +494 -0
- llama_stack/core/server/quota.py +110 -0
- llama_stack/core/server/routes.py +141 -0
- llama_stack/core/server/server.py +542 -0
- llama_stack/core/server/tracing.py +80 -0
- llama_stack/core/stack.py +546 -0
- llama_stack/core/start_stack.sh +117 -0
- llama_stack/core/storage/datatypes.py +283 -0
- llama_stack/{cli/model → core/store}/__init__.py +1 -1
- llama_stack/core/store/registry.py +199 -0
- llama_stack/core/testing_context.py +49 -0
- llama_stack/core/ui/app.py +55 -0
- llama_stack/core/ui/modules/api.py +32 -0
- llama_stack/core/ui/modules/utils.py +42 -0
- llama_stack/core/ui/page/distribution/datasets.py +18 -0
- llama_stack/core/ui/page/distribution/eval_tasks.py +20 -0
- llama_stack/core/ui/page/distribution/models.py +18 -0
- llama_stack/core/ui/page/distribution/providers.py +27 -0
- llama_stack/core/ui/page/distribution/resources.py +48 -0
- llama_stack/core/ui/page/distribution/scoring_functions.py +18 -0
- llama_stack/core/ui/page/distribution/shields.py +19 -0
- llama_stack/core/ui/page/evaluations/app_eval.py +143 -0
- llama_stack/core/ui/page/evaluations/native_eval.py +253 -0
- llama_stack/core/ui/page/playground/chat.py +130 -0
- llama_stack/core/ui/page/playground/tools.py +352 -0
- llama_stack/core/utils/config.py +30 -0
- llama_stack/{distribution → core}/utils/config_dirs.py +3 -6
- llama_stack/core/utils/config_resolution.py +125 -0
- llama_stack/core/utils/context.py +84 -0
- llama_stack/core/utils/exec.py +96 -0
- llama_stack/{providers/impls/meta_reference/codeshield/config.py → core/utils/image_types.py} +4 -3
- llama_stack/{distribution → core}/utils/model_utils.py +2 -2
- llama_stack/{distribution → core}/utils/prompt_for_config.py +30 -63
- llama_stack/{apis/batch_inference → distributions/dell}/__init__.py +1 -1
- llama_stack/distributions/dell/build.yaml +33 -0
- llama_stack/distributions/dell/dell.py +158 -0
- llama_stack/distributions/dell/run-with-safety.yaml +141 -0
- llama_stack/distributions/dell/run.yaml +132 -0
- llama_stack/distributions/meta-reference-gpu/__init__.py +7 -0
- llama_stack/distributions/meta-reference-gpu/build.yaml +32 -0
- llama_stack/distributions/meta-reference-gpu/meta_reference.py +163 -0
- llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +154 -0
- llama_stack/distributions/meta-reference-gpu/run.yaml +139 -0
- llama_stack/{apis/evals → distributions/nvidia}/__init__.py +1 -1
- llama_stack/distributions/nvidia/build.yaml +29 -0
- llama_stack/distributions/nvidia/nvidia.py +154 -0
- llama_stack/distributions/nvidia/run-with-safety.yaml +137 -0
- llama_stack/distributions/nvidia/run.yaml +116 -0
- llama_stack/distributions/open-benchmark/__init__.py +7 -0
- llama_stack/distributions/open-benchmark/build.yaml +36 -0
- llama_stack/distributions/open-benchmark/open_benchmark.py +303 -0
- llama_stack/distributions/open-benchmark/run.yaml +252 -0
- llama_stack/distributions/postgres-demo/__init__.py +7 -0
- llama_stack/distributions/postgres-demo/build.yaml +23 -0
- llama_stack/distributions/postgres-demo/postgres_demo.py +125 -0
- llama_stack/distributions/postgres-demo/run.yaml +115 -0
- llama_stack/{apis/memory → distributions/starter}/__init__.py +1 -1
- llama_stack/distributions/starter/build.yaml +61 -0
- llama_stack/distributions/starter/run-with-postgres-store.yaml +285 -0
- llama_stack/distributions/starter/run.yaml +276 -0
- llama_stack/distributions/starter/starter.py +345 -0
- llama_stack/distributions/starter-gpu/__init__.py +7 -0
- llama_stack/distributions/starter-gpu/build.yaml +61 -0
- llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +288 -0
- llama_stack/distributions/starter-gpu/run.yaml +279 -0
- llama_stack/distributions/starter-gpu/starter_gpu.py +20 -0
- llama_stack/distributions/template.py +456 -0
- llama_stack/distributions/watsonx/__init__.py +7 -0
- llama_stack/distributions/watsonx/build.yaml +33 -0
- llama_stack/distributions/watsonx/run.yaml +133 -0
- llama_stack/distributions/watsonx/watsonx.py +95 -0
- llama_stack/env.py +24 -0
- llama_stack/log.py +314 -0
- llama_stack/models/llama/checkpoint.py +164 -0
- llama_stack/models/llama/datatypes.py +164 -0
- llama_stack/models/llama/hadamard_utils.py +86 -0
- llama_stack/models/llama/llama3/args.py +74 -0
- llama_stack/models/llama/llama3/chat_format.py +286 -0
- llama_stack/models/llama/llama3/generation.py +376 -0
- llama_stack/models/llama/llama3/interface.py +255 -0
- llama_stack/models/llama/llama3/model.py +304 -0
- llama_stack/models/llama/llama3/multimodal/__init__.py +12 -0
- llama_stack/models/llama/llama3/multimodal/encoder_utils.py +180 -0
- llama_stack/models/llama/llama3/multimodal/image_transform.py +409 -0
- llama_stack/models/llama/llama3/multimodal/model.py +1430 -0
- llama_stack/models/llama/llama3/multimodal/utils.py +26 -0
- llama_stack/models/llama/llama3/prompt_templates/__init__.py +22 -0
- llama_stack/models/llama/llama3/prompt_templates/base.py +39 -0
- llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +319 -0
- llama_stack/models/llama/llama3/prompt_templates/tool_response.py +62 -0
- llama_stack/models/llama/llama3/quantization/loader.py +316 -0
- llama_stack/models/llama/llama3/template_data.py +116 -0
- llama_stack/models/llama/llama3/tokenizer.model +128000 -0
- llama_stack/models/llama/llama3/tokenizer.py +198 -0
- llama_stack/models/llama/llama3/tool_utils.py +266 -0
- llama_stack/models/llama/llama3_1/__init__.py +12 -0
- llama_stack/models/llama/llama3_1/prompt_format.md +358 -0
- llama_stack/models/llama/llama3_1/prompts.py +258 -0
- llama_stack/models/llama/llama3_2/prompts_text.py +229 -0
- llama_stack/models/llama/llama3_2/prompts_vision.py +126 -0
- llama_stack/models/llama/llama3_2/text_prompt_format.md +286 -0
- llama_stack/models/llama/llama3_2/vision_prompt_format.md +141 -0
- llama_stack/models/llama/llama3_3/prompts.py +259 -0
- llama_stack/models/llama/llama4/args.py +107 -0
- llama_stack/models/llama/llama4/chat_format.py +317 -0
- llama_stack/models/llama/llama4/datatypes.py +56 -0
- llama_stack/models/llama/llama4/ffn.py +58 -0
- llama_stack/models/llama/llama4/generation.py +313 -0
- llama_stack/models/llama/llama4/model.py +437 -0
- llama_stack/models/llama/llama4/moe.py +214 -0
- llama_stack/models/llama/llama4/preprocess.py +435 -0
- llama_stack/models/llama/llama4/prompt_format.md +304 -0
- llama_stack/models/llama/llama4/prompt_templates/system_prompts.py +136 -0
- llama_stack/models/llama/llama4/prompts.py +279 -0
- llama_stack/models/llama/llama4/quantization/__init__.py +5 -0
- llama_stack/models/llama/llama4/quantization/loader.py +226 -0
- llama_stack/models/llama/llama4/tokenizer.model +200000 -0
- llama_stack/models/llama/llama4/tokenizer.py +263 -0
- llama_stack/models/llama/llama4/vision/__init__.py +5 -0
- llama_stack/models/llama/llama4/vision/embedding.py +210 -0
- llama_stack/models/llama/llama4/vision/encoder.py +412 -0
- llama_stack/models/llama/prompt_format.py +191 -0
- llama_stack/models/llama/quantize_impls.py +316 -0
- llama_stack/models/llama/sku_list.py +1029 -0
- llama_stack/models/llama/sku_types.py +233 -0
- llama_stack/models/llama/tokenizer_utils.py +40 -0
- llama_stack/providers/datatypes.py +136 -107
- llama_stack/providers/inline/__init__.py +5 -0
- llama_stack/providers/inline/agents/__init__.py +5 -0
- llama_stack/providers/{impls/meta_reference/agents → inline/agents/meta_reference}/__init__.py +12 -5
- llama_stack/providers/inline/agents/meta_reference/agent_instance.py +1024 -0
- llama_stack/providers/inline/agents/meta_reference/agents.py +383 -0
- llama_stack/providers/inline/agents/meta_reference/config.py +37 -0
- llama_stack/providers/inline/agents/meta_reference/persistence.py +228 -0
- llama_stack/providers/inline/agents/meta_reference/responses/__init__.py +5 -0
- llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +423 -0
- llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +1226 -0
- llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +449 -0
- llama_stack/providers/inline/agents/meta_reference/responses/types.py +194 -0
- llama_stack/providers/inline/agents/meta_reference/responses/utils.py +365 -0
- llama_stack/providers/inline/agents/meta_reference/safety.py +52 -0
- llama_stack/providers/inline/batches/__init__.py +5 -0
- llama_stack/providers/inline/batches/reference/__init__.py +36 -0
- llama_stack/providers/inline/batches/reference/batches.py +679 -0
- llama_stack/providers/inline/batches/reference/config.py +40 -0
- llama_stack/providers/inline/datasetio/__init__.py +5 -0
- llama_stack/providers/inline/datasetio/localfs/__init__.py +20 -0
- llama_stack/providers/inline/datasetio/localfs/config.py +23 -0
- llama_stack/providers/inline/datasetio/localfs/datasetio.py +113 -0
- llama_stack/providers/inline/eval/__init__.py +5 -0
- llama_stack/providers/inline/eval/meta_reference/__init__.py +28 -0
- llama_stack/providers/inline/eval/meta_reference/config.py +23 -0
- llama_stack/providers/inline/eval/meta_reference/eval.py +259 -0
- llama_stack/providers/inline/files/localfs/__init__.py +20 -0
- llama_stack/providers/inline/files/localfs/config.py +31 -0
- llama_stack/providers/inline/files/localfs/files.py +219 -0
- llama_stack/providers/inline/inference/__init__.py +5 -0
- llama_stack/providers/{impls/meta_reference/inference → inline/inference/meta_reference}/__init__.py +4 -4
- llama_stack/providers/inline/inference/meta_reference/common.py +24 -0
- llama_stack/providers/inline/inference/meta_reference/config.py +68 -0
- llama_stack/providers/inline/inference/meta_reference/generators.py +211 -0
- llama_stack/providers/inline/inference/meta_reference/inference.py +158 -0
- llama_stack/providers/inline/inference/meta_reference/model_parallel.py +96 -0
- llama_stack/providers/{impls/meta_reference/inference → inline/inference/meta_reference}/parallel_utils.py +56 -73
- llama_stack/providers/inline/inference/sentence_transformers/__init__.py +22 -0
- llama_stack/providers/{impls/meta_reference/agents → inline/inference/sentence_transformers}/config.py +6 -4
- llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +83 -0
- llama_stack/providers/inline/post_training/__init__.py +5 -0
- llama_stack/providers/inline/post_training/common/__init__.py +5 -0
- llama_stack/providers/inline/post_training/common/utils.py +35 -0
- llama_stack/providers/inline/post_training/common/validator.py +36 -0
- llama_stack/providers/inline/post_training/huggingface/__init__.py +27 -0
- llama_stack/providers/inline/post_training/huggingface/config.py +83 -0
- llama_stack/providers/inline/post_training/huggingface/post_training.py +208 -0
- llama_stack/providers/inline/post_training/huggingface/recipes/__init__.py +5 -0
- llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py +519 -0
- llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device_dpo.py +485 -0
- llama_stack/providers/inline/post_training/huggingface/utils.py +269 -0
- llama_stack/providers/inline/post_training/torchtune/__init__.py +27 -0
- llama_stack/providers/inline/post_training/torchtune/common/__init__.py +5 -0
- llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py +240 -0
- llama_stack/providers/inline/post_training/torchtune/common/utils.py +99 -0
- llama_stack/providers/inline/post_training/torchtune/config.py +20 -0
- llama_stack/providers/inline/post_training/torchtune/datasets/__init__.py +5 -0
- llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py +57 -0
- llama_stack/providers/inline/post_training/torchtune/datasets/sft.py +78 -0
- llama_stack/providers/inline/post_training/torchtune/post_training.py +178 -0
- llama_stack/providers/inline/post_training/torchtune/recipes/__init__.py +5 -0
- llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +588 -0
- llama_stack/providers/inline/safety/__init__.py +5 -0
- llama_stack/providers/{impls/meta_reference/codeshield → inline/safety/code_scanner}/__init__.py +4 -2
- llama_stack/providers/inline/safety/code_scanner/code_scanner.py +128 -0
- llama_stack/providers/{impls/meta_reference/memory → inline/safety/code_scanner}/config.py +5 -3
- llama_stack/providers/inline/safety/llama_guard/__init__.py +19 -0
- llama_stack/providers/inline/safety/llama_guard/config.py +19 -0
- llama_stack/providers/inline/safety/llama_guard/llama_guard.py +489 -0
- llama_stack/providers/{adapters/memory/sample → inline/safety/prompt_guard}/__init__.py +4 -4
- llama_stack/providers/inline/safety/prompt_guard/config.py +32 -0
- llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +131 -0
- llama_stack/providers/inline/scoring/__init__.py +5 -0
- llama_stack/providers/inline/scoring/basic/__init__.py +25 -0
- llama_stack/providers/{adapters/memory/weaviate → inline/scoring/basic}/config.py +5 -7
- llama_stack/providers/inline/scoring/basic/scoring.py +126 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/__init__.py +5 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py +240 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py +41 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/__init__.py +5 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py +21 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py +21 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py +23 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py +27 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py +71 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py +21 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py +80 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py +66 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py +58 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py +38 -0
- llama_stack/providers/inline/scoring/basic/utils/__init__.py +5 -0
- llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py +3319 -0
- llama_stack/providers/inline/scoring/basic/utils/math_utils.py +330 -0
- llama_stack/providers/inline/scoring/braintrust/__init__.py +27 -0
- llama_stack/providers/inline/scoring/braintrust/braintrust.py +230 -0
- llama_stack/providers/inline/scoring/braintrust/config.py +21 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/__init__.py +5 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/__init__.py +5 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py +23 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py +24 -0
- llama_stack/providers/inline/scoring/llm_as_judge/__init__.py +21 -0
- llama_stack/providers/inline/scoring/llm_as_judge/config.py +14 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +113 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/__init__.py +5 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/__init__.py +5 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py +96 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py +20 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py +81 -0
- llama_stack/providers/inline/telemetry/__init__.py +5 -0
- llama_stack/providers/inline/telemetry/meta_reference/__init__.py +21 -0
- llama_stack/providers/inline/telemetry/meta_reference/config.py +47 -0
- llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +252 -0
- llama_stack/providers/inline/tool_runtime/__init__.py +5 -0
- llama_stack/providers/inline/tool_runtime/rag/__init__.py +19 -0
- llama_stack/providers/{impls/meta_reference/telemetry → inline/tool_runtime/rag}/config.py +5 -3
- llama_stack/providers/inline/tool_runtime/rag/context_retriever.py +77 -0
- llama_stack/providers/inline/tool_runtime/rag/memory.py +332 -0
- llama_stack/providers/inline/vector_io/__init__.py +5 -0
- llama_stack/providers/inline/vector_io/chroma/__init__.py +19 -0
- llama_stack/providers/inline/vector_io/chroma/config.py +30 -0
- llama_stack/providers/inline/vector_io/faiss/__init__.py +21 -0
- llama_stack/providers/inline/vector_io/faiss/config.py +26 -0
- llama_stack/providers/inline/vector_io/faiss/faiss.py +293 -0
- llama_stack/providers/inline/vector_io/milvus/__init__.py +19 -0
- llama_stack/providers/inline/vector_io/milvus/config.py +29 -0
- llama_stack/providers/inline/vector_io/qdrant/__init__.py +20 -0
- llama_stack/providers/inline/vector_io/qdrant/config.py +29 -0
- llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py +20 -0
- llama_stack/providers/inline/vector_io/sqlite_vec/config.py +26 -0
- llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +483 -0
- llama_stack/providers/registry/agents.py +16 -18
- llama_stack/providers/registry/batches.py +26 -0
- llama_stack/providers/registry/datasetio.py +49 -0
- llama_stack/providers/registry/eval.py +46 -0
- llama_stack/providers/registry/files.py +31 -0
- llama_stack/providers/registry/inference.py +273 -118
- llama_stack/providers/registry/post_training.py +69 -0
- llama_stack/providers/registry/safety.py +46 -41
- llama_stack/providers/registry/scoring.py +51 -0
- llama_stack/providers/registry/tool_runtime.py +87 -0
- llama_stack/providers/registry/vector_io.py +828 -0
- llama_stack/providers/remote/__init__.py +5 -0
- llama_stack/providers/remote/agents/__init__.py +5 -0
- llama_stack/providers/remote/datasetio/__init__.py +5 -0
- llama_stack/providers/{adapters/memory/chroma → remote/datasetio/huggingface}/__init__.py +7 -4
- llama_stack/providers/remote/datasetio/huggingface/config.py +23 -0
- llama_stack/providers/remote/datasetio/huggingface/huggingface.py +99 -0
- llama_stack/providers/remote/datasetio/nvidia/__init__.py +23 -0
- llama_stack/providers/remote/datasetio/nvidia/config.py +61 -0
- llama_stack/providers/remote/datasetio/nvidia/datasetio.py +116 -0
- llama_stack/providers/remote/eval/__init__.py +5 -0
- llama_stack/providers/remote/eval/nvidia/__init__.py +31 -0
- llama_stack/providers/remote/eval/nvidia/config.py +29 -0
- llama_stack/providers/remote/eval/nvidia/eval.py +162 -0
- llama_stack/providers/remote/files/s3/__init__.py +19 -0
- llama_stack/providers/remote/files/s3/config.py +42 -0
- llama_stack/providers/remote/files/s3/files.py +313 -0
- llama_stack/providers/remote/inference/__init__.py +5 -0
- llama_stack/providers/{adapters/safety/sample → remote/inference/anthropic}/__init__.py +4 -6
- llama_stack/providers/remote/inference/anthropic/anthropic.py +36 -0
- llama_stack/providers/remote/inference/anthropic/config.py +28 -0
- llama_stack/providers/{impls/meta_reference/telemetry → remote/inference/azure}/__init__.py +4 -4
- llama_stack/providers/remote/inference/azure/azure.py +25 -0
- llama_stack/providers/remote/inference/azure/config.py +61 -0
- llama_stack/providers/{adapters → remote}/inference/bedrock/__init__.py +18 -17
- llama_stack/providers/remote/inference/bedrock/bedrock.py +142 -0
- llama_stack/providers/{adapters/inference/sample → remote/inference/bedrock}/config.py +3 -4
- llama_stack/providers/remote/inference/bedrock/models.py +29 -0
- llama_stack/providers/remote/inference/cerebras/__init__.py +19 -0
- llama_stack/providers/remote/inference/cerebras/cerebras.py +28 -0
- llama_stack/providers/remote/inference/cerebras/config.py +30 -0
- llama_stack/providers/{adapters → remote}/inference/databricks/__init__.py +4 -5
- llama_stack/providers/remote/inference/databricks/config.py +37 -0
- llama_stack/providers/remote/inference/databricks/databricks.py +44 -0
- llama_stack/providers/{adapters → remote}/inference/fireworks/__init__.py +8 -4
- llama_stack/providers/remote/inference/fireworks/config.py +27 -0
- llama_stack/providers/remote/inference/fireworks/fireworks.py +27 -0
- llama_stack/providers/{adapters/memory/pgvector → remote/inference/gemini}/__init__.py +4 -4
- llama_stack/providers/remote/inference/gemini/config.py +28 -0
- llama_stack/providers/remote/inference/gemini/gemini.py +82 -0
- llama_stack/providers/remote/inference/groq/__init__.py +15 -0
- llama_stack/providers/remote/inference/groq/config.py +34 -0
- llama_stack/providers/remote/inference/groq/groq.py +18 -0
- llama_stack/providers/remote/inference/llama_openai_compat/__init__.py +15 -0
- llama_stack/providers/remote/inference/llama_openai_compat/config.py +34 -0
- llama_stack/providers/remote/inference/llama_openai_compat/llama.py +46 -0
- llama_stack/providers/remote/inference/nvidia/__init__.py +23 -0
- llama_stack/providers/remote/inference/nvidia/config.py +64 -0
- llama_stack/providers/remote/inference/nvidia/nvidia.py +61 -0
- llama_stack/providers/{adapters/safety/sample/config.py → remote/inference/nvidia/utils.py} +3 -4
- llama_stack/providers/{impls/vllm → remote/inference/ollama}/__init__.py +4 -6
- llama_stack/providers/remote/inference/ollama/config.py +25 -0
- llama_stack/providers/remote/inference/ollama/ollama.py +102 -0
- llama_stack/providers/{adapters/telemetry/opentelemetry → remote/inference/openai}/__init__.py +4 -4
- llama_stack/providers/remote/inference/openai/config.py +39 -0
- llama_stack/providers/remote/inference/openai/openai.py +38 -0
- llama_stack/providers/remote/inference/passthrough/__init__.py +23 -0
- llama_stack/providers/remote/inference/passthrough/config.py +34 -0
- llama_stack/providers/remote/inference/passthrough/passthrough.py +122 -0
- llama_stack/providers/remote/inference/runpod/__init__.py +16 -0
- llama_stack/providers/remote/inference/runpod/config.py +32 -0
- llama_stack/providers/remote/inference/runpod/runpod.py +42 -0
- llama_stack/providers/remote/inference/sambanova/__init__.py +16 -0
- llama_stack/providers/remote/inference/sambanova/config.py +34 -0
- llama_stack/providers/remote/inference/sambanova/sambanova.py +28 -0
- llama_stack/providers/{adapters → remote}/inference/tgi/__init__.py +3 -4
- llama_stack/providers/remote/inference/tgi/config.py +76 -0
- llama_stack/providers/remote/inference/tgi/tgi.py +85 -0
- llama_stack/providers/{adapters → remote}/inference/together/__init__.py +8 -4
- llama_stack/providers/remote/inference/together/config.py +27 -0
- llama_stack/providers/remote/inference/together/together.py +102 -0
- llama_stack/providers/remote/inference/vertexai/__init__.py +15 -0
- llama_stack/providers/remote/inference/vertexai/config.py +48 -0
- llama_stack/providers/remote/inference/vertexai/vertexai.py +54 -0
- llama_stack/providers/remote/inference/vllm/__init__.py +22 -0
- llama_stack/providers/remote/inference/vllm/config.py +59 -0
- llama_stack/providers/remote/inference/vllm/vllm.py +111 -0
- llama_stack/providers/remote/inference/watsonx/__init__.py +15 -0
- llama_stack/providers/remote/inference/watsonx/config.py +45 -0
- llama_stack/providers/remote/inference/watsonx/watsonx.py +336 -0
- llama_stack/providers/remote/post_training/__init__.py +5 -0
- llama_stack/providers/remote/post_training/nvidia/__init__.py +23 -0
- llama_stack/providers/remote/post_training/nvidia/config.py +113 -0
- llama_stack/providers/remote/post_training/nvidia/models.py +27 -0
- llama_stack/providers/remote/post_training/nvidia/post_training.py +430 -0
- llama_stack/providers/remote/post_training/nvidia/utils.py +63 -0
- llama_stack/providers/remote/safety/__init__.py +5 -0
- llama_stack/providers/remote/safety/bedrock/bedrock.py +111 -0
- llama_stack/providers/remote/safety/bedrock/config.py +14 -0
- llama_stack/providers/{adapters/inference/sample → remote/safety/nvidia}/__init__.py +5 -4
- llama_stack/providers/remote/safety/nvidia/config.py +40 -0
- llama_stack/providers/remote/safety/nvidia/nvidia.py +161 -0
- llama_stack/providers/{adapters/agents/sample → remote/safety/sambanova}/__init__.py +5 -4
- llama_stack/providers/remote/safety/sambanova/config.py +37 -0
- llama_stack/providers/remote/safety/sambanova/sambanova.py +98 -0
- llama_stack/providers/remote/tool_runtime/__init__.py +5 -0
- llama_stack/providers/remote/tool_runtime/bing_search/__init__.py +21 -0
- llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py +112 -0
- llama_stack/providers/remote/tool_runtime/bing_search/config.py +22 -0
- llama_stack/providers/remote/tool_runtime/brave_search/__init__.py +20 -0
- llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py +148 -0
- llama_stack/providers/remote/tool_runtime/brave_search/config.py +27 -0
- llama_stack/providers/remote/tool_runtime/model_context_protocol/__init__.py +15 -0
- llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py +20 -0
- llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py +73 -0
- llama_stack/providers/remote/tool_runtime/tavily_search/__init__.py +20 -0
- llama_stack/providers/remote/tool_runtime/tavily_search/config.py +27 -0
- llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py +84 -0
- llama_stack/providers/remote/tool_runtime/wolfram_alpha/__init__.py +22 -0
- llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py +21 -0
- llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py +140 -0
- llama_stack/providers/remote/vector_io/__init__.py +5 -0
- llama_stack/providers/remote/vector_io/chroma/__init__.py +17 -0
- llama_stack/providers/remote/vector_io/chroma/chroma.py +215 -0
- llama_stack/providers/remote/vector_io/chroma/config.py +28 -0
- llama_stack/providers/remote/vector_io/milvus/__init__.py +18 -0
- llama_stack/providers/remote/vector_io/milvus/config.py +35 -0
- llama_stack/providers/remote/vector_io/milvus/milvus.py +375 -0
- llama_stack/providers/remote/vector_io/pgvector/__init__.py +17 -0
- llama_stack/providers/remote/vector_io/pgvector/config.py +47 -0
- llama_stack/providers/remote/vector_io/pgvector/pgvector.py +460 -0
- llama_stack/providers/remote/vector_io/qdrant/__init__.py +17 -0
- llama_stack/providers/remote/vector_io/qdrant/config.py +37 -0
- llama_stack/providers/remote/vector_io/qdrant/qdrant.py +265 -0
- llama_stack/providers/remote/vector_io/weaviate/__init__.py +17 -0
- llama_stack/providers/remote/vector_io/weaviate/config.py +32 -0
- llama_stack/providers/remote/vector_io/weaviate/weaviate.py +393 -0
- llama_stack/providers/utils/bedrock/__init__.py +5 -0
- llama_stack/providers/utils/bedrock/client.py +74 -0
- llama_stack/providers/utils/bedrock/config.py +64 -0
- llama_stack/providers/utils/bedrock/refreshable_boto_session.py +112 -0
- llama_stack/providers/utils/common/__init__.py +5 -0
- llama_stack/providers/utils/common/data_schema_validator.py +103 -0
- llama_stack/providers/utils/datasetio/__init__.py +5 -0
- llama_stack/providers/utils/datasetio/url_utils.py +47 -0
- llama_stack/providers/utils/files/__init__.py +5 -0
- llama_stack/providers/utils/files/form_data.py +69 -0
- llama_stack/providers/utils/inference/__init__.py +8 -7
- llama_stack/providers/utils/inference/embedding_mixin.py +101 -0
- llama_stack/providers/utils/inference/inference_store.py +264 -0
- llama_stack/providers/utils/inference/litellm_openai_mixin.py +336 -0
- llama_stack/providers/utils/inference/model_registry.py +173 -23
- llama_stack/providers/utils/inference/openai_compat.py +1261 -49
- llama_stack/providers/utils/inference/openai_mixin.py +506 -0
- llama_stack/providers/utils/inference/prompt_adapter.py +365 -67
- llama_stack/providers/utils/kvstore/api.py +6 -6
- llama_stack/providers/utils/kvstore/config.py +28 -48
- llama_stack/providers/utils/kvstore/kvstore.py +61 -15
- llama_stack/providers/utils/kvstore/mongodb/__init__.py +9 -0
- llama_stack/providers/utils/kvstore/mongodb/mongodb.py +82 -0
- llama_stack/providers/utils/kvstore/postgres/__init__.py +7 -0
- llama_stack/providers/utils/kvstore/postgres/postgres.py +114 -0
- llama_stack/providers/utils/kvstore/redis/redis.py +33 -9
- llama_stack/providers/utils/kvstore/sqlite/config.py +2 -1
- llama_stack/providers/utils/kvstore/sqlite/sqlite.py +123 -22
- llama_stack/providers/utils/memory/file_utils.py +1 -1
- llama_stack/providers/utils/memory/openai_vector_store_mixin.py +1304 -0
- llama_stack/providers/utils/memory/vector_store.py +220 -82
- llama_stack/providers/utils/pagination.py +43 -0
- llama_stack/providers/utils/responses/__init__.py +5 -0
- llama_stack/providers/utils/responses/responses_store.py +292 -0
- llama_stack/providers/utils/scheduler.py +270 -0
- llama_stack/providers/utils/scoring/__init__.py +5 -0
- llama_stack/providers/utils/scoring/aggregation_utils.py +75 -0
- llama_stack/providers/utils/scoring/base_scoring_fn.py +114 -0
- llama_stack/providers/utils/scoring/basic_scoring_utils.py +26 -0
- llama_stack/providers/utils/sqlstore/__init__.py +5 -0
- llama_stack/providers/utils/sqlstore/api.py +128 -0
- llama_stack/providers/utils/sqlstore/authorized_sqlstore.py +319 -0
- llama_stack/providers/utils/sqlstore/sqlalchemy_sqlstore.py +343 -0
- llama_stack/providers/utils/sqlstore/sqlstore.py +70 -0
- llama_stack/providers/utils/telemetry/trace_protocol.py +142 -0
- llama_stack/providers/utils/telemetry/tracing.py +192 -53
- llama_stack/providers/utils/tools/__init__.py +5 -0
- llama_stack/providers/utils/tools/mcp.py +148 -0
- llama_stack/providers/utils/tools/ttl_dict.py +70 -0
- llama_stack/providers/utils/vector_io/__init__.py +5 -0
- llama_stack/providers/utils/vector_io/vector_utils.py +156 -0
- llama_stack/schema_utils.py +118 -0
- llama_stack/strong_typing/__init__.py +19 -0
- llama_stack/strong_typing/auxiliary.py +228 -0
- llama_stack/strong_typing/classdef.py +440 -0
- llama_stack/strong_typing/core.py +46 -0
- llama_stack/strong_typing/deserializer.py +877 -0
- llama_stack/strong_typing/docstring.py +409 -0
- llama_stack/strong_typing/exception.py +23 -0
- llama_stack/strong_typing/inspection.py +1085 -0
- llama_stack/strong_typing/mapping.py +40 -0
- llama_stack/strong_typing/name.py +182 -0
- llama_stack/strong_typing/py.typed +0 -0
- llama_stack/strong_typing/schema.py +792 -0
- llama_stack/strong_typing/serialization.py +97 -0
- llama_stack/strong_typing/serializer.py +500 -0
- llama_stack/strong_typing/slots.py +27 -0
- llama_stack/strong_typing/topological.py +89 -0
- llama_stack/testing/__init__.py +5 -0
- llama_stack/testing/api_recorder.py +956 -0
- llama_stack/ui/node_modules/flatted/python/flatted.py +149 -0
- llama_stack-0.3.4.dist-info/METADATA +261 -0
- llama_stack-0.3.4.dist-info/RECORD +625 -0
- {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/WHEEL +1 -1
- llama_stack/apis/agents/client.py +0 -292
- llama_stack/apis/agents/event_logger.py +0 -184
- llama_stack/apis/batch_inference/batch_inference.py +0 -72
- llama_stack/apis/common/deployment_types.py +0 -31
- llama_stack/apis/dataset/dataset.py +0 -63
- llama_stack/apis/evals/evals.py +0 -122
- llama_stack/apis/inference/client.py +0 -197
- llama_stack/apis/inspect/client.py +0 -82
- llama_stack/apis/memory/client.py +0 -155
- llama_stack/apis/memory/memory.py +0 -65
- llama_stack/apis/memory_banks/__init__.py +0 -7
- llama_stack/apis/memory_banks/client.py +0 -101
- llama_stack/apis/memory_banks/memory_banks.py +0 -78
- llama_stack/apis/models/client.py +0 -83
- llama_stack/apis/reward_scoring/__init__.py +0 -7
- llama_stack/apis/reward_scoring/reward_scoring.py +0 -55
- llama_stack/apis/safety/client.py +0 -105
- llama_stack/apis/shields/client.py +0 -79
- llama_stack/cli/download.py +0 -340
- llama_stack/cli/model/describe.py +0 -82
- llama_stack/cli/model/download.py +0 -24
- llama_stack/cli/model/list.py +0 -62
- llama_stack/cli/model/model.py +0 -34
- llama_stack/cli/model/prompt_format.py +0 -112
- llama_stack/cli/model/safety_models.py +0 -52
- llama_stack/cli/stack/build.py +0 -299
- llama_stack/cli/stack/configure.py +0 -178
- llama_stack/distribution/build.py +0 -123
- llama_stack/distribution/build_conda_env.sh +0 -136
- llama_stack/distribution/build_container.sh +0 -142
- llama_stack/distribution/common.sh +0 -40
- llama_stack/distribution/configure_container.sh +0 -47
- llama_stack/distribution/datatypes.py +0 -139
- llama_stack/distribution/distribution.py +0 -58
- llama_stack/distribution/inspect.py +0 -67
- llama_stack/distribution/request_headers.py +0 -57
- llama_stack/distribution/resolver.py +0 -323
- llama_stack/distribution/routers/__init__.py +0 -48
- llama_stack/distribution/routers/routers.py +0 -158
- llama_stack/distribution/routers/routing_tables.py +0 -173
- llama_stack/distribution/server/endpoints.py +0 -48
- llama_stack/distribution/server/server.py +0 -343
- llama_stack/distribution/start_conda_env.sh +0 -42
- llama_stack/distribution/start_container.sh +0 -64
- llama_stack/distribution/templates/local-bedrock-conda-example-build.yaml +0 -10
- llama_stack/distribution/templates/local-build.yaml +0 -10
- llama_stack/distribution/templates/local-databricks-build.yaml +0 -10
- llama_stack/distribution/templates/local-fireworks-build.yaml +0 -10
- llama_stack/distribution/templates/local-hf-endpoint-build.yaml +0 -10
- llama_stack/distribution/templates/local-hf-serverless-build.yaml +0 -10
- llama_stack/distribution/templates/local-ollama-build.yaml +0 -10
- llama_stack/distribution/templates/local-tgi-build.yaml +0 -10
- llama_stack/distribution/templates/local-together-build.yaml +0 -10
- llama_stack/distribution/templates/local-vllm-build.yaml +0 -10
- llama_stack/distribution/utils/exec.py +0 -105
- llama_stack/providers/adapters/agents/sample/sample.py +0 -18
- llama_stack/providers/adapters/inference/bedrock/bedrock.py +0 -451
- llama_stack/providers/adapters/inference/bedrock/config.py +0 -55
- llama_stack/providers/adapters/inference/databricks/config.py +0 -21
- llama_stack/providers/adapters/inference/databricks/databricks.py +0 -125
- llama_stack/providers/adapters/inference/fireworks/config.py +0 -20
- llama_stack/providers/adapters/inference/fireworks/fireworks.py +0 -130
- llama_stack/providers/adapters/inference/ollama/__init__.py +0 -19
- llama_stack/providers/adapters/inference/ollama/ollama.py +0 -175
- llama_stack/providers/adapters/inference/sample/sample.py +0 -23
- llama_stack/providers/adapters/inference/tgi/config.py +0 -43
- llama_stack/providers/adapters/inference/tgi/tgi.py +0 -200
- llama_stack/providers/adapters/inference/together/config.py +0 -22
- llama_stack/providers/adapters/inference/together/together.py +0 -143
- llama_stack/providers/adapters/memory/chroma/chroma.py +0 -157
- llama_stack/providers/adapters/memory/pgvector/config.py +0 -17
- llama_stack/providers/adapters/memory/pgvector/pgvector.py +0 -211
- llama_stack/providers/adapters/memory/sample/sample.py +0 -23
- llama_stack/providers/adapters/memory/weaviate/__init__.py +0 -15
- llama_stack/providers/adapters/memory/weaviate/weaviate.py +0 -190
- llama_stack/providers/adapters/safety/bedrock/bedrock.py +0 -113
- llama_stack/providers/adapters/safety/bedrock/config.py +0 -16
- llama_stack/providers/adapters/safety/sample/sample.py +0 -23
- llama_stack/providers/adapters/safety/together/__init__.py +0 -18
- llama_stack/providers/adapters/safety/together/config.py +0 -26
- llama_stack/providers/adapters/safety/together/together.py +0 -101
- llama_stack/providers/adapters/telemetry/opentelemetry/config.py +0 -12
- llama_stack/providers/adapters/telemetry/opentelemetry/opentelemetry.py +0 -201
- llama_stack/providers/adapters/telemetry/sample/__init__.py +0 -17
- llama_stack/providers/adapters/telemetry/sample/config.py +0 -12
- llama_stack/providers/adapters/telemetry/sample/sample.py +0 -18
- llama_stack/providers/impls/meta_reference/agents/agent_instance.py +0 -844
- llama_stack/providers/impls/meta_reference/agents/agents.py +0 -161
- llama_stack/providers/impls/meta_reference/agents/persistence.py +0 -84
- llama_stack/providers/impls/meta_reference/agents/rag/context_retriever.py +0 -74
- llama_stack/providers/impls/meta_reference/agents/safety.py +0 -57
- llama_stack/providers/impls/meta_reference/agents/tests/code_execution.py +0 -93
- llama_stack/providers/impls/meta_reference/agents/tests/test_chat_agent.py +0 -305
- llama_stack/providers/impls/meta_reference/agents/tools/base.py +0 -20
- llama_stack/providers/impls/meta_reference/agents/tools/builtin.py +0 -375
- llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/code_env_prefix.py +0 -133
- llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/code_execution.py +0 -256
- llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/matplotlib_custom_backend.py +0 -87
- llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/utils.py +0 -21
- llama_stack/providers/impls/meta_reference/agents/tools/safety.py +0 -43
- llama_stack/providers/impls/meta_reference/codeshield/code_scanner.py +0 -58
- llama_stack/providers/impls/meta_reference/inference/config.py +0 -45
- llama_stack/providers/impls/meta_reference/inference/generation.py +0 -376
- llama_stack/providers/impls/meta_reference/inference/inference.py +0 -280
- llama_stack/providers/impls/meta_reference/inference/model_parallel.py +0 -99
- llama_stack/providers/impls/meta_reference/inference/quantization/fp8_impls.py +0 -184
- llama_stack/providers/impls/meta_reference/inference/quantization/fp8_txest_disabled.py +0 -76
- llama_stack/providers/impls/meta_reference/inference/quantization/loader.py +0 -97
- llama_stack/providers/impls/meta_reference/inference/quantization/scripts/quantize_checkpoint.py +0 -161
- llama_stack/providers/impls/meta_reference/memory/__init__.py +0 -19
- llama_stack/providers/impls/meta_reference/memory/faiss.py +0 -113
- llama_stack/providers/impls/meta_reference/safety/__init__.py +0 -17
- llama_stack/providers/impls/meta_reference/safety/base.py +0 -57
- llama_stack/providers/impls/meta_reference/safety/config.py +0 -48
- llama_stack/providers/impls/meta_reference/safety/llama_guard.py +0 -268
- llama_stack/providers/impls/meta_reference/safety/prompt_guard.py +0 -145
- llama_stack/providers/impls/meta_reference/safety/safety.py +0 -112
- llama_stack/providers/impls/meta_reference/telemetry/console.py +0 -89
- llama_stack/providers/impls/vllm/config.py +0 -35
- llama_stack/providers/impls/vllm/vllm.py +0 -241
- llama_stack/providers/registry/memory.py +0 -78
- llama_stack/providers/registry/telemetry.py +0 -44
- llama_stack/providers/tests/agents/test_agents.py +0 -210
- llama_stack/providers/tests/inference/test_inference.py +0 -257
- llama_stack/providers/tests/inference/test_prompt_adapter.py +0 -126
- llama_stack/providers/tests/memory/test_memory.py +0 -136
- llama_stack/providers/tests/resolver.py +0 -100
- llama_stack/providers/tests/safety/test_safety.py +0 -77
- llama_stack-0.0.42.dist-info/METADATA +0 -137
- llama_stack-0.0.42.dist-info/RECORD +0 -256
- /llama_stack/{distribution → core}/__init__.py +0 -0
- /llama_stack/{distribution/server → core/access_control}/__init__.py +0 -0
- /llama_stack/{distribution/utils → core/conversations}/__init__.py +0 -0
- /llama_stack/{providers/adapters → core/prompts}/__init__.py +0 -0
- /llama_stack/{providers/adapters/agents → core/routing_tables}/__init__.py +0 -0
- /llama_stack/{providers/adapters/inference → core/server}/__init__.py +0 -0
- /llama_stack/{providers/adapters/memory → core/storage}/__init__.py +0 -0
- /llama_stack/{providers/adapters/safety → core/ui}/__init__.py +0 -0
- /llama_stack/{providers/adapters/telemetry → core/ui/modules}/__init__.py +0 -0
- /llama_stack/{providers/impls → core/ui/page}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference → core/ui/page/distribution}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference/agents/rag → core/ui/page/evaluations}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference/agents/tests → core/ui/page/playground}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference/agents/tools → core/utils}/__init__.py +0 -0
- /llama_stack/{distribution → core}/utils/dynamic.py +0 -0
- /llama_stack/{distribution → core}/utils/serialize.py +0 -0
- /llama_stack/{providers/impls/meta_reference/agents/tools/ipython_tool → distributions}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference/inference/quantization → models}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference/inference/quantization/scripts → models/llama}/__init__.py +0 -0
- /llama_stack/{providers/tests → models/llama/llama3}/__init__.py +0 -0
- /llama_stack/{providers/tests/agents → models/llama/llama3/quantization}/__init__.py +0 -0
- /llama_stack/{providers/tests/inference → models/llama/llama3_2}/__init__.py +0 -0
- /llama_stack/{providers/tests/memory → models/llama/llama3_3}/__init__.py +0 -0
- /llama_stack/{providers/tests/safety → models/llama/llama4}/__init__.py +0 -0
- /llama_stack/{scripts → models/llama/llama4/prompt_templates}/__init__.py +0 -0
- /llama_stack/providers/{adapters → remote}/safety/bedrock/__init__.py +0 -0
- {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/entry_points.txt +0 -0
- {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info/licenses}/LICENSE +0 -0
- {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/top_level.txt +0 -0
--- llama_stack/providers/impls/meta_reference/safety/safety.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any, Dict, List
-
-from llama_stack.distribution.utils.model_utils import model_local_dir
-from llama_stack.apis.inference import *  # noqa: F403
-from llama_stack.apis.safety import *  # noqa: F403
-from llama_models.llama3.api.datatypes import *  # noqa: F403
-from llama_stack.distribution.datatypes import Api
-
-from llama_stack.providers.datatypes import ShieldsProtocolPrivate
-
-from .base import OnViolationAction, ShieldBase
-from .config import SafetyConfig
-from .llama_guard import LlamaGuardShield
-from .prompt_guard import InjectionShield, JailbreakShield, PromptGuardShield
-
-
-PROMPT_GUARD_MODEL = "Prompt-Guard-86M"
-
-
-class MetaReferenceSafetyImpl(Safety, ShieldsProtocolPrivate):
-    def __init__(self, config: SafetyConfig, deps) -> None:
-        self.config = config
-        self.inference_api = deps[Api.inference]
-
-        self.available_shields = []
-        if config.llama_guard_shield:
-            self.available_shields.append(ShieldType.llama_guard.value)
-        if config.enable_prompt_guard:
-            self.available_shields.append(ShieldType.prompt_guard.value)
-
-    async def initialize(self) -> None:
-        if self.config.enable_prompt_guard:
-            model_dir = model_local_dir(PROMPT_GUARD_MODEL)
-            _ = PromptGuardShield.instance(model_dir)
-
-    async def shutdown(self) -> None:
-        pass
-
-    async def register_shield(self, shield: ShieldDef) -> None:
-        raise ValueError("Registering dynamic shields is not supported")
-
-    async def list_shields(self) -> List[ShieldDef]:
-        return [
-            ShieldDef(
-                identifier=shield_type,
-                type=shield_type,
-                params={},
-            )
-            for shield_type in self.available_shields
-        ]
-
-    async def run_shield(
-        self,
-        shield_type: str,
-        messages: List[Message],
-        params: Dict[str, Any] = None,
-    ) -> RunShieldResponse:
-        shield_def = await self.shield_store.get_shield(shield_type)
-        if not shield_def:
-            raise ValueError(f"Unknown shield {shield_type}")
-
-        shield = self.get_shield_impl(shield_def)
-
-        messages = messages.copy()
-        # some shields like llama-guard require the first message to be a user message
-        # since this might be a tool call, first role might not be user
-        if len(messages) > 0 and messages[0].role != Role.user.value:
-            messages[0] = UserMessage(content=messages[0].content)
-
-        # TODO: we can refactor ShieldBase, etc. to be inline with the API types
-        res = await shield.run(messages)
-        violation = None
-        if res.is_violation and shield.on_violation_action != OnViolationAction.IGNORE:
-            violation = SafetyViolation(
-                violation_level=(
-                    ViolationLevel.ERROR
-                    if shield.on_violation_action == OnViolationAction.RAISE
-                    else ViolationLevel.WARN
-                ),
-                user_message=res.violation_return_message,
-                metadata={
-                    "violation_type": res.violation_type,
-                },
-            )
-
-        return RunShieldResponse(violation=violation)
-
-    def get_shield_impl(self, shield: ShieldDef) -> ShieldBase:
-        if shield.type == ShieldType.llama_guard.value:
-            cfg = self.config.llama_guard_shield
-            return LlamaGuardShield(
-                model=cfg.model,
-                inference_api=self.inference_api,
-                excluded_categories=cfg.excluded_categories,
-            )
-        elif shield.type == ShieldType.prompt_guard.value:
-            model_dir = model_local_dir(PROMPT_GUARD_MODEL)
-            subtype = shield.params.get("prompt_guard_type", "injection")
-            if subtype == "injection":
-                return InjectionShield.instance(model_dir)
-            elif subtype == "jailbreak":
-                return JailbreakShield.instance(model_dir)
-            else:
-                raise ValueError(f"Unknown prompt guard type: {subtype}")
-        else:
-            raise ValueError(f"Unknown shield type: {shield.type}")
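The removed `run_shield()` folds a shield verdict into a `SafetyViolation`: IGNORE suppresses the violation, RAISE maps to ERROR, and anything else maps to WARN. A minimal, self-contained sketch of that mapping; `ShieldResult` and the plain-dict violation below are stand-ins, not the real llama_stack types:

```python
# Sketch of the violation-mapping logic in the removed run_shield().
from dataclasses import dataclass
from enum import Enum


class OnViolationAction(Enum):
    IGNORE = 0
    WARN = 1
    RAISE = 2


@dataclass
class ShieldResult:  # stand-in for the result of a shield's run()
    is_violation: bool
    violation_type: str = ""
    violation_return_message: str = ""


def map_violation(res: ShieldResult, action: OnViolationAction) -> dict | None:
    """IGNORE suppresses the violation; RAISE maps to ERROR, anything else to WARN."""
    if not res.is_violation or action == OnViolationAction.IGNORE:
        return None
    return {
        "violation_level": "ERROR" if action == OnViolationAction.RAISE else "WARN",
        "user_message": res.violation_return_message,
        "metadata": {"violation_type": res.violation_type},
    }


print(map_violation(ShieldResult(True, "jailbreak", "I can't help with that."),
                    OnViolationAction.RAISE))
# {'violation_level': 'ERROR', 'user_message': "I can't help with that.",
#  'metadata': {'violation_type': 'jailbreak'}}
```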
--- llama_stack/providers/impls/meta_reference/telemetry/console.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Optional
-
-from llama_stack.apis.telemetry import *  # noqa: F403
-from .config import ConsoleConfig
-
-
-class ConsoleTelemetryImpl(Telemetry):
-    def __init__(self, config: ConsoleConfig) -> None:
-        self.config = config
-        self.spans = {}
-
-    async def initialize(self) -> None: ...
-
-    async def shutdown(self) -> None: ...
-
-    async def log_event(self, event: Event):
-        if (
-            isinstance(event, StructuredLogEvent)
-            and event.payload.type == StructuredLogType.SPAN_START.value
-        ):
-            self.spans[event.span_id] = event.payload
-
-        names = []
-        span_id = event.span_id
-        while True:
-            span_payload = self.spans.get(span_id)
-            if not span_payload:
-                break
-
-            names = [span_payload.name] + names
-            span_id = span_payload.parent_span_id
-
-        span_name = ".".join(names) if names else None
-
-        formatted = format_event(event, span_name)
-        if formatted:
-            print(formatted)
-
-    async def get_trace(self, trace_id: str) -> Trace:
-        raise NotImplementedError()
-
-
-COLORS = {
-    "reset": "\033[0m",
-    "bold": "\033[1m",
-    "dim": "\033[2m",
-    "red": "\033[31m",
-    "green": "\033[32m",
-    "yellow": "\033[33m",
-    "blue": "\033[34m",
-    "magenta": "\033[35m",
-    "cyan": "\033[36m",
-    "white": "\033[37m",
-}
-
-SEVERITY_COLORS = {
-    LogSeverity.VERBOSE: COLORS["dim"] + COLORS["white"],
-    LogSeverity.DEBUG: COLORS["cyan"],
-    LogSeverity.INFO: COLORS["green"],
-    LogSeverity.WARN: COLORS["yellow"],
-    LogSeverity.ERROR: COLORS["red"],
-    LogSeverity.CRITICAL: COLORS["bold"] + COLORS["red"],
-}
-
-
-def format_event(event: Event, span_name: str) -> Optional[str]:
-    timestamp = event.timestamp.strftime("%H:%M:%S.%f")[:-3]
-    span = ""
-    if span_name:
-        span = f"{COLORS['magenta']}[{span_name}]{COLORS['reset']} "
-    if isinstance(event, UnstructuredLogEvent):
-        severity_color = SEVERITY_COLORS.get(event.severity, COLORS["reset"])
-        return (
-            f"{COLORS['dim']}{timestamp}{COLORS['reset']} "
-            f"{severity_color}[{event.severity.name}]{COLORS['reset']} "
-            f"{span}"
-            f"{event.message}"
-        )
-
-    elif isinstance(event, StructuredLogEvent):
-        return None
-
-    return f"Unknown event type: {event}"
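The `log_event()` loop above reconstructs a dotted span path by walking `parent_span_id` links back to the root. A self-contained rehearsal of that walk, using a hypothetical three-span trace in place of real telemetry payloads:

```python
# Rehearsal of the parent-walk in the removed log_event(): starting from a
# span id, prepend each ancestor's name to build the dotted span path.
spans = {
    "a": {"name": "request", "parent_span_id": None},
    "b": {"name": "inference", "parent_span_id": "a"},
    "c": {"name": "tokenize", "parent_span_id": "b"},
}

names, span_id = [], "c"
while span_id in spans:  # the original breaks when the lookup misses
    payload = spans[span_id]
    names = [payload["name"]] + names
    span_id = payload["parent_span_id"]

print(".".join(names))  # request.inference.tokenize
```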
@@ -1,35 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_models.schema_utils import json_schema_type
-from pydantic import BaseModel, Field, field_validator
-
-from llama_stack.providers.utils.inference import supported_inference_models
-
-
-@json_schema_type
-class VLLMConfig(BaseModel):
-    """Configuration for the vLLM inference provider."""
-
-    model: str = Field(
-        default="Llama3.1-8B-Instruct",
-        description="Model descriptor from `llama model list`",
-    )
-    tensor_parallel_size: int = Field(
-        default=1,
-        description="Number of tensor parallel replicas (number of GPUs to use).",
-    )
-
-    @field_validator("model")
-    @classmethod
-    def validate_model(cls, model: str) -> str:
-        permitted_models = supported_inference_models()
-        if model not in permitted_models:
-            model_list = "\n\t".join(permitted_models)
-            raise ValueError(
-                f"Unknown model: `{model}`. Choose from [\n\t{model_list}\n]"
-            )
-        return model
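
`VLLMConfig`, deleted above, validated the model descriptor at construction time through pydantic's `@field_validator`. Illustrative usage under the 0.0.42 layout (the import path below is an assumption about the old package structure; the default model was `Llama3.1-8B-Instruct`):

    # Hypothetical usage; the module path is an assumption about the old
    # package layout, not a documented import.
    from llama_stack.providers.impls.vllm.config import VLLMConfig

    config = VLLMConfig(model="Llama3.1-70B-Instruct", tensor_parallel_size=8)

    try:
        VLLMConfig(model="not-a-llama")  # rejected by validate_model
    except ValueError as err:
        print(err)  # Unknown model: `not-a-llama`. Choose from [...]
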
@@ -1,241 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import logging
-import os
-import uuid
-from typing import Any
-
-from llama_models.llama3.api.chat_format import ChatFormat
-from llama_models.llama3.api.datatypes import *  # noqa: F403
-from llama_models.llama3.api.tokenizer import Tokenizer
-
-from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.async_llm_engine import AsyncLLMEngine
-from vllm.sampling_params import SamplingParams
-
-from llama_stack.apis.inference import *  # noqa: F403
-
-from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
-from llama_stack.providers.utils.inference.openai_compat import (
-    OpenAICompatCompletionChoice,
-    OpenAICompatCompletionResponse,
-    process_chat_completion_response,
-    process_chat_completion_stream_response,
-)
-from llama_stack.providers.utils.inference.prompt_adapter import (
-    chat_completion_request_to_prompt,
-)
-
-from .config import VLLMConfig
-
-
-log = logging.getLogger(__name__)
-
-
-def _random_uuid() -> str:
-    return str(uuid.uuid4().hex)
-
-
-def _vllm_sampling_params(sampling_params: Any) -> SamplingParams:
-    """Convert sampling params to vLLM sampling params."""
-    if sampling_params is None:
-        return SamplingParams()
-
-    # TODO convert what I saw in my first test ... but surely there's more to do here
-    kwargs = {
-        "temperature": sampling_params.temperature,
-    }
-    if sampling_params.top_k >= 1:
-        kwargs["top_k"] = sampling_params.top_k
-    if sampling_params.top_p:
-        kwargs["top_p"] = sampling_params.top_p
-    if sampling_params.max_tokens >= 1:
-        kwargs["max_tokens"] = sampling_params.max_tokens
-    if sampling_params.repetition_penalty > 0:
-        kwargs["repetition_penalty"] = sampling_params.repetition_penalty
-
-    return SamplingParams(**kwargs)
-
-
-class VLLMInferenceImpl(ModelRegistryHelper, Inference):
-    """Inference implementation for vLLM."""
-
-    HF_MODEL_MAPPINGS = {
-        # TODO: seems like we should be able to build this table dynamically ...
-        "Llama3.1-8B": "meta-llama/Llama-3.1-8B",
-        "Llama3.1-70B": "meta-llama/Llama-3.1-70B",
-        "Llama3.1-405B:bf16-mp8": "meta-llama/Llama-3.1-405B",
-        "Llama3.1-405B": "meta-llama/Llama-3.1-405B-FP8",
-        "Llama3.1-405B:bf16-mp16": "meta-llama/Llama-3.1-405B",
-        "Llama3.1-8B-Instruct": "meta-llama/Llama-3.1-8B-Instruct",
-        "Llama3.1-70B-Instruct": "meta-llama/Llama-3.1-70B-Instruct",
-        "Llama3.1-405B-Instruct:bf16-mp8": "meta-llama/Llama-3.1-405B-Instruct",
-        "Llama3.1-405B-Instruct": "meta-llama/Llama-3.1-405B-Instruct-FP8",
-        "Llama3.1-405B-Instruct:bf16-mp16": "meta-llama/Llama-3.1-405B-Instruct",
-        "Llama3.2-1B": "meta-llama/Llama-3.2-1B",
-        "Llama3.2-3B": "meta-llama/Llama-3.2-3B",
-        "Llama3.2-11B-Vision": "meta-llama/Llama-3.2-11B-Vision",
-        "Llama3.2-90B-Vision": "meta-llama/Llama-3.2-90B-Vision",
-        "Llama3.2-1B-Instruct": "meta-llama/Llama-3.2-1B-Instruct",
-        "Llama3.2-3B-Instruct": "meta-llama/Llama-3.2-3B-Instruct",
-        "Llama3.2-11B-Vision-Instruct": "meta-llama/Llama-3.2-11B-Vision-Instruct",
-        "Llama3.2-90B-Vision-Instruct": "meta-llama/Llama-3.2-90B-Vision-Instruct",
-        "Llama-Guard-3-11B-Vision": "meta-llama/Llama-Guard-3-11B-Vision",
-        "Llama-Guard-3-1B:int4-mp1": "meta-llama/Llama-Guard-3-1B-INT4",
-        "Llama-Guard-3-1B": "meta-llama/Llama-Guard-3-1B",
-        "Llama-Guard-3-8B": "meta-llama/Llama-Guard-3-8B",
-        "Llama-Guard-3-8B:int8-mp1": "meta-llama/Llama-Guard-3-8B-INT8",
-        "Prompt-Guard-86M": "meta-llama/Prompt-Guard-86M",
-        "Llama-Guard-2-8B": "meta-llama/Llama-Guard-2-8B",
-    }
-
-    def __init__(self, config: VLLMConfig):
-        Inference.__init__(self)
-        ModelRegistryHelper.__init__(
-            self,
-            stack_to_provider_models_map=self.HF_MODEL_MAPPINGS,
-        )
-        self.config = config
-        self.engine = None
-
-        tokenizer = Tokenizer.get_instance()
-        self.formatter = ChatFormat(tokenizer)
-
-    async def initialize(self):
-        """Initialize the vLLM inference adapter."""
-
-        log.info("Initializing vLLM inference adapter")
-
-        # Disable usage stats reporting. This would be a surprising thing for most
-        # people to find out was on by default.
-        # https://docs.vllm.ai/en/latest/serving/usage_stats.html
-        if "VLLM_NO_USAGE_STATS" not in os.environ:
-            os.environ["VLLM_NO_USAGE_STATS"] = "1"
-
-        hf_model = self.HF_MODEL_MAPPINGS.get(self.config.model)
-
-        # TODO -- there are a ton of options supported here ...
-        engine_args = AsyncEngineArgs()
-        engine_args.model = hf_model
-        # We will need a new config item for this in the future if model support is more broad
-        # than it is today (llama only)
-        engine_args.tokenizer = hf_model
-        engine_args.tensor_parallel_size = self.config.tensor_parallel_size
-
-        self.engine = AsyncLLMEngine.from_engine_args(engine_args)
-
-    async def shutdown(self):
-        """Shutdown the vLLM inference adapter."""
-        log.info("Shutting down vLLM inference adapter")
-        if self.engine:
-            self.engine.shutdown_background_loop()
-
-    def completion(
-        self,
-        model: str,
-        content: InterleavedTextMedia,
-        sampling_params: Any | None = ...,
-        stream: bool | None = False,
-        logprobs: LogProbConfig | None = None,
-    ) -> CompletionResponse | CompletionResponseStreamChunk:
-        log.info("vLLM completion")
-        messages = [UserMessage(content=content)]
-        return self.chat_completion(
-            model=model,
-            messages=messages,
-            sampling_params=sampling_params,
-            stream=stream,
-            logprobs=logprobs,
-        )
-
-    def chat_completion(
-        self,
-        model: str,
-        messages: list[Message],
-        sampling_params: Any | None = ...,
-        tools: list[ToolDefinition] | None = ...,
-        tool_choice: ToolChoice | None = ...,
-        tool_prompt_format: ToolPromptFormat | None = ...,
-        stream: bool | None = False,
-        logprobs: LogProbConfig | None = None,
-    ) -> ChatCompletionResponse | ChatCompletionResponseStreamChunk:
-        log.info("vLLM chat completion")
-
-        assert self.engine is not None
-
-        request = ChatCompletionRequest(
-            model=model,
-            messages=messages,
-            sampling_params=sampling_params,
-            tools=tools or [],
-            tool_choice=tool_choice,
-            tool_prompt_format=tool_prompt_format,
-            stream=stream,
-            logprobs=logprobs,
-        )
-
-        log.info("Sampling params: %s", sampling_params)
-        request_id = _random_uuid()
-
-        prompt = chat_completion_request_to_prompt(request, self.formatter)
-        vllm_sampling_params = _vllm_sampling_params(request.sampling_params)
-        results_generator = self.engine.generate(
-            prompt, vllm_sampling_params, request_id
-        )
-        if stream:
-            return self._stream_chat_completion(request, results_generator)
-        else:
-            return self._nonstream_chat_completion(request, results_generator)
-
-    async def _nonstream_chat_completion(
-        self, request: ChatCompletionRequest, results_generator: AsyncGenerator
-    ) -> ChatCompletionResponse:
-        outputs = [o async for o in results_generator]
-        final_output = outputs[-1]
-
-        assert final_output is not None
-        outputs = final_output.outputs
-        finish_reason = outputs[-1].stop_reason
-        choice = OpenAICompatCompletionChoice(
-            finish_reason=finish_reason,
-            text="".join([output.text for output in outputs]),
-        )
-        response = OpenAICompatCompletionResponse(
-            choices=[choice],
-        )
-        return process_chat_completion_response(request, response, self.formatter)
-
-    async def _stream_chat_completion(
-        self, request: ChatCompletionRequest, results_generator: AsyncGenerator
-    ) -> AsyncGenerator:
-        async def _generate_and_convert_to_openai_compat():
-            async for chunk in results_generator:
-                if not chunk.outputs:
-                    log.warning("Empty chunk received")
-                    continue
-
-                text = "".join([output.text for output in chunk.outputs])
-                choice = OpenAICompatCompletionChoice(
-                    finish_reason=chunk.outputs[-1].stop_reason,
-                    text=text,
-                )
-                yield OpenAICompatCompletionResponse(
-                    choices=[choice],
-                )
-
-        stream = _generate_and_convert_to_openai_compat()
-        async for chunk in process_chat_completion_stream_response(
-            request, stream, self.formatter
-        ):
-            yield chunk
-
-    async def embeddings(
-        self, model: str, contents: list[InterleavedTextMedia]
-    ) -> EmbeddingsResponse:
-        log.info("vLLM embeddings")
-        # TODO
-        raise NotImplementedError()
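
The heart of the deleted adapter is the translation from stack-level sampling parameters into vLLM's `SamplingParams`, plus the OpenAI-compat repackaging of engine output. A self-contained sketch of just the parameter mapping, with a stand-in dataclass instead of the stack's own sampling type (an assumption for illustration):

    # Stand-in for the stack's sampling params type; field names mirror the
    # deleted _vllm_sampling_params above, but the type itself is hypothetical.
    from dataclasses import dataclass

    @dataclass
    class StackSamplingParams:
        temperature: float = 1.0
        top_k: int = 0
        top_p: float = 0.0
        max_tokens: int = 0
        repetition_penalty: float = 0.0

    def to_vllm_kwargs(p: StackSamplingParams | None) -> dict:
        if p is None:
            return {}
        kwargs = {"temperature": p.temperature}
        if p.top_k >= 1:
            kwargs["top_k"] = p.top_k
        if p.top_p:
            kwargs["top_p"] = p.top_p
        if p.max_tokens >= 1:
            kwargs["max_tokens"] = p.max_tokens
        if p.repetition_penalty > 0:
            kwargs["repetition_penalty"] = p.repetition_penalty
        return kwargs  # would be passed to vllm's SamplingParams(**kwargs)

    print(to_vllm_kwargs(StackSamplingParams(temperature=0.7, top_p=0.9, max_tokens=256)))
    # {'temperature': 0.7, 'top_p': 0.9, 'max_tokens': 256}
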
@@ -1,78 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import List
-
-from llama_stack.distribution.datatypes import *  # noqa: F403
-
-
-EMBEDDING_DEPS = [
-    "blobfile",
-    "chardet",
-    "pypdf",
-    "tqdm",
-    "numpy",
-    "scikit-learn",
-    "scipy",
-    "nltk",
-    "sentencepiece",
-    "transformers",
-    # this happens to work because special dependencies are always installed last
-    # so if there was a regular torch installed first, this would be ignored
-    # we need a better way to do this to identify potential conflicts, etc.
-    # for now, this lets us significantly reduce the size of the container which
-    # does not have any "local" inference code (and hence does not need GPU-enabled torch)
-    "torch --index-url https://download.pytorch.org/whl/cpu",
-    "sentence-transformers --no-deps",
-]
-
-
-def available_providers() -> List[ProviderSpec]:
-    return [
-        InlineProviderSpec(
-            api=Api.memory,
-            provider_type="meta-reference",
-            pip_packages=EMBEDDING_DEPS + ["faiss-cpu"],
-            module="llama_stack.providers.impls.meta_reference.memory",
-            config_class="llama_stack.providers.impls.meta_reference.memory.FaissImplConfig",
-        ),
-        remote_provider_spec(
-            Api.memory,
-            AdapterSpec(
-                adapter_type="chromadb",
-                pip_packages=EMBEDDING_DEPS + ["chromadb-client"],
-                module="llama_stack.providers.adapters.memory.chroma",
-            ),
-        ),
-        remote_provider_spec(
-            Api.memory,
-            AdapterSpec(
-                adapter_type="pgvector",
-                pip_packages=EMBEDDING_DEPS + ["psycopg2-binary"],
-                module="llama_stack.providers.adapters.memory.pgvector",
-                config_class="llama_stack.providers.adapters.memory.pgvector.PGVectorConfig",
-            ),
-        ),
-        remote_provider_spec(
-            Api.memory,
-            AdapterSpec(
-                adapter_type="weaviate",
-                pip_packages=EMBEDDING_DEPS + ["weaviate-client"],
-                module="llama_stack.providers.adapters.memory.weaviate",
-                config_class="llama_stack.providers.adapters.memory.weaviate.WeaviateConfig",
-                provider_data_validator="llama_stack.providers.adapters.memory.weaviate.WeaviateRequestProviderData",
-            ),
-        ),
-        remote_provider_spec(
-            api=Api.memory,
-            adapter=AdapterSpec(
-                adapter_type="sample",
-                pip_packages=[],
-                module="llama_stack.providers.adapters.memory.sample",
-                config_class="llama_stack.providers.adapters.memory.sample.SampleConfig",
-            ),
-        ),
-    ]
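
Each spec in the removed memory registry pairs a provider module with the pip packages a distribution must install, all built on the shared `EMBEDDING_DEPS` list. A hedged sketch of that composition (provider names and extras copied from the file above; the helper function itself is hypothetical):

    # Illustrative only: flattening per-provider pip requirements the way the
    # deleted registry composed EMBEDDING_DEPS with provider-specific extras.
    EMBEDDING_DEPS = ["blobfile", "chardet", "pypdf", "tqdm", "numpy",
                      "scikit-learn", "scipy", "nltk", "sentencepiece", "transformers"]

    PROVIDER_EXTRAS = {
        "meta-reference": ["faiss-cpu"],
        "chromadb": ["chromadb-client"],
        "pgvector": ["psycopg2-binary"],
        "weaviate": ["weaviate-client"],
    }

    def pip_packages(provider: str) -> list[str]:
        return EMBEDDING_DEPS + PROVIDER_EXTRAS.get(provider, [])

    print(pip_packages("pgvector"))  # EMBEDDING_DEPS plus psycopg2-binary
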
@@ -1,44 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import List
-
-from llama_stack.distribution.datatypes import *  # noqa: F403
-
-
-def available_providers() -> List[ProviderSpec]:
-    return [
-        InlineProviderSpec(
-            api=Api.telemetry,
-            provider_type="meta-reference",
-            pip_packages=[],
-            module="llama_stack.providers.impls.meta_reference.telemetry",
-            config_class="llama_stack.providers.impls.meta_reference.telemetry.ConsoleConfig",
-        ),
-        remote_provider_spec(
-            api=Api.telemetry,
-            adapter=AdapterSpec(
-                adapter_type="sample",
-                pip_packages=[],
-                module="llama_stack.providers.adapters.telemetry.sample",
-                config_class="llama_stack.providers.adapters.telemetry.sample.SampleConfig",
-            ),
-        ),
-        remote_provider_spec(
-            api=Api.telemetry,
-            adapter=AdapterSpec(
-                adapter_type="opentelemetry-jaeger",
-                pip_packages=[
-                    "opentelemetry-api",
-                    "opentelemetry-sdk",
-                    "opentelemetry-exporter-jaeger",
-                    "opentelemetry-semantic-conventions",
-                ],
-                module="llama_stack.providers.adapters.telemetry.opentelemetry",
-                config_class="llama_stack.providers.adapters.telemetry.opentelemetry.OpenTelemetryConfig",
-            ),
-        ),
-    ]
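
As in the memory registry, every telemetry entry binds an `adapter_type` to a module and config class, which the distribution layer resolves at startup. A small sketch of that lookup using stand-in types (the dataclass below is an assumption, not the stack's real `AdapterSpec`):

    # Stand-in AdapterSpec for illustration; the real resolution logic lives
    # in the distribution layer, not in this registry file.
    from dataclasses import dataclass, field

    @dataclass
    class AdapterSpec:
        adapter_type: str
        module: str
        config_class: str = ""
        pip_packages: list[str] = field(default_factory=list)

    ADAPTERS = [
        AdapterSpec("sample", "llama_stack.providers.adapters.telemetry.sample"),
        AdapterSpec("opentelemetry-jaeger",
                    "llama_stack.providers.adapters.telemetry.opentelemetry"),
    ]

    def find_adapter(adapter_type: str) -> AdapterSpec:
        for spec in ADAPTERS:
            if spec.adapter_type == adapter_type:
                return spec
        raise KeyError(f"No telemetry adapter registered for `{adapter_type}`")

    print(find_adapter("opentelemetry-jaeger").module)
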