PyPI - llama-stack - Versions diffs - 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl - Mend

llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (738) hide show

llama_stack/providers/adapters/inference/databricks/databricks.py DELETED Viewed

@@ -1,125 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import AsyncGenerator
-from llama_models.llama3.api.chat_format import ChatFormat
-from llama_models.llama3.api.datatypes import Message
-from llama_models.llama3.api.tokenizer import Tokenizer
-from openai import OpenAI
-from llama_stack.apis.inference import *  # noqa: F403
-from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
-from llama_stack.providers.utils.inference.openai_compat import (
-    get_sampling_options,
-    process_chat_completion_response,
-    process_chat_completion_stream_response,
-)
-from llama_stack.providers.utils.inference.prompt_adapter import (
-    chat_completion_request_to_prompt,
-)
-from .config import DatabricksImplConfig
-DATABRICKS_SUPPORTED_MODELS = {
-    "Llama3.1-70B-Instruct": "databricks-meta-llama-3-1-70b-instruct",
-    "Llama3.1-405B-Instruct": "databricks-meta-llama-3-1-405b-instruct",
-}
-class DatabricksInferenceAdapter(ModelRegistryHelper, Inference):
-    def __init__(self, config: DatabricksImplConfig) -> None:
-        ModelRegistryHelper.__init__(
-            self, stack_to_provider_models_map=DATABRICKS_SUPPORTED_MODELS
-        )
-        self.config = config
-        self.formatter = ChatFormat(Tokenizer.get_instance())
-    async def initialize(self) -> None:
-        return
-    async def shutdown(self) -> None:
-        pass
-    def completion(
-        self,
-        model: str,
-        content: InterleavedTextMedia,
-        sampling_params: Optional[SamplingParams] = SamplingParams(),
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-    ) -> AsyncGenerator:
-        raise NotImplementedError()
-    def chat_completion(
-        self,
-        model: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = SamplingParams(),
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-    ) -> AsyncGenerator:
-        request = ChatCompletionRequest(
-            model=model,
-            messages=messages,
-            sampling_params=sampling_params,
-            tools=tools or [],
-            tool_choice=tool_choice,
-            tool_prompt_format=tool_prompt_format,
-            stream=stream,
-            logprobs=logprobs,
-        )
-        client = OpenAI(base_url=self.config.url, api_key=self.config.api_token)
-        if stream:
-            return self._stream_chat_completion(request, client)
-        else:
-            return self._nonstream_chat_completion(request, client)
-    async def _nonstream_chat_completion(
-        self, request: ChatCompletionRequest, client: OpenAI
-    ) -> ChatCompletionResponse:
-        params = self._get_params(request)
-        r = client.completions.create(**params)
-        return process_chat_completion_response(request, r, self.formatter)
-    async def _stream_chat_completion(
-        self, request: ChatCompletionRequest, client: OpenAI
-    ) -> AsyncGenerator:
-        params = self._get_params(request)
-        async def _to_async_generator():
-            s = client.completions.create(**params)
-            for chunk in s:
-                yield chunk
-        stream = _to_async_generator()
-        async for chunk in process_chat_completion_stream_response(
-            request, stream, self.formatter
-        ):
-            yield chunk
-    def _get_params(self, request: ChatCompletionRequest) -> dict:
-        return {
-            "model": self.map_to_provider_model(request.model),
-            "prompt": chat_completion_request_to_prompt(request, self.formatter),
-            "stream": request.stream,
-            **get_sampling_options(request),
-        }
-    async def embeddings(
-        self,
-        model: str,
-        contents: List[InterleavedTextMedia],
-    ) -> EmbeddingsResponse:
-        raise NotImplementedError()

llama_stack/providers/adapters/inference/fireworks/config.py DELETED Viewed

@@ -1,20 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from llama_models.schema_utils import json_schema_type
-from pydantic import BaseModel, Field
-@json_schema_type
-class FireworksImplConfig(BaseModel):
-    url: str = Field(
-        default="https://api.fireworks.ai/inference",
-        description="The URL for the Fireworks server",
-    )
-    api_key: str = Field(
-        default="",
-        description="The Fireworks.ai API Key",
-    )

llama_stack/providers/adapters/inference/fireworks/fireworks.py DELETED Viewed

@@ -1,130 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import AsyncGenerator
-from fireworks.client import Fireworks
-from llama_models.llama3.api.chat_format import ChatFormat
-from llama_models.llama3.api.datatypes import Message
-from llama_models.llama3.api.tokenizer import Tokenizer
-from llama_stack.apis.inference import *  # noqa: F403
-from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
-from llama_stack.providers.utils.inference.openai_compat import (
-    get_sampling_options,
-    process_chat_completion_response,
-    process_chat_completion_stream_response,
-)
-from llama_stack.providers.utils.inference.prompt_adapter import (
-    chat_completion_request_to_prompt,
-)
-from .config import FireworksImplConfig
-FIREWORKS_SUPPORTED_MODELS = {
-    "Llama3.1-8B-Instruct": "fireworks/llama-v3p1-8b-instruct",
-    "Llama3.1-70B-Instruct": "fireworks/llama-v3p1-70b-instruct",
-    "Llama3.1-405B-Instruct": "fireworks/llama-v3p1-405b-instruct",
-    "Llama3.2-1B-Instruct": "fireworks/llama-v3p2-1b-instruct",
-    "Llama3.2-3B-Instruct": "fireworks/llama-v3p2-3b-instruct",
-}
-class FireworksInferenceAdapter(ModelRegistryHelper, Inference):
-    def __init__(self, config: FireworksImplConfig) -> None:
-        ModelRegistryHelper.__init__(
-            self, stack_to_provider_models_map=FIREWORKS_SUPPORTED_MODELS
-        )
-        self.config = config
-        self.formatter = ChatFormat(Tokenizer.get_instance())
-    async def initialize(self) -> None:
-        return
-    async def shutdown(self) -> None:
-        pass
-    def completion(
-        self,
-        model: str,
-        content: InterleavedTextMedia,
-        sampling_params: Optional[SamplingParams] = SamplingParams(),
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-    ) -> AsyncGenerator:
-        raise NotImplementedError()
-    def chat_completion(
-        self,
-        model: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = SamplingParams(),
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-    ) -> AsyncGenerator:
-        request = ChatCompletionRequest(
-            model=model,
-            messages=messages,
-            sampling_params=sampling_params,
-            tools=tools or [],
-            tool_choice=tool_choice,
-            tool_prompt_format=tool_prompt_format,
-            stream=stream,
-            logprobs=logprobs,
-        )
-        client = Fireworks(api_key=self.config.api_key)
-        if stream:
-            return self._stream_chat_completion(request, client)
-        else:
-            return self._nonstream_chat_completion(request, client)
-    async def _nonstream_chat_completion(
-        self, request: ChatCompletionRequest, client: Fireworks
-    ) -> ChatCompletionResponse:
-        params = self._get_params(request)
-        r = await client.completion.acreate(**params)
-        return process_chat_completion_response(request, r, self.formatter)
-    async def _stream_chat_completion(
-        self, request: ChatCompletionRequest, client: Fireworks
-    ) -> AsyncGenerator:
-        params = self._get_params(request)
-        stream = client.completion.acreate(**params)
-        async for chunk in process_chat_completion_stream_response(
-            request, stream, self.formatter
-        ):
-            yield chunk
-    def _get_params(self, request: ChatCompletionRequest) -> dict:
-        prompt = chat_completion_request_to_prompt(request, self.formatter)
-        # Fireworks always prepends with BOS
-        if prompt.startswith("<|begin_of_text|>"):
-            prompt = prompt[len("<|begin_of_text|>") :]
-        options = get_sampling_options(request)
-        options.setdefault("max_tokens", 512)
-        return {
-            "model": self.map_to_provider_model(request.model),
-            "prompt": prompt,
-            "stream": request.stream,
-            **options,
-        }
-    async def embeddings(
-        self,
-        model: str,
-        contents: List[InterleavedTextMedia],
-    ) -> EmbeddingsResponse:
-        raise NotImplementedError()

llama_stack/providers/adapters/inference/ollama/__init__.py DELETED Viewed

@@ -1,19 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from llama_stack.distribution.datatypes import RemoteProviderConfig
-class OllamaImplConfig(RemoteProviderConfig):
-    port: int = 11434
-async def get_adapter_impl(config: RemoteProviderConfig, _deps):
-    from .ollama import OllamaInferenceAdapter
-    impl = OllamaInferenceAdapter(config.url)
-    await impl.initialize()
-    return impl

llama_stack/providers/adapters/inference/ollama/ollama.py DELETED Viewed

@@ -1,175 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import AsyncGenerator
-import httpx
-from llama_models.llama3.api.chat_format import ChatFormat
-from llama_models.llama3.api.datatypes import Message
-from llama_models.llama3.api.tokenizer import Tokenizer
-from ollama import AsyncClient
-from llama_stack.apis.inference import *  # noqa: F403
-from llama_stack.providers.datatypes import ModelsProtocolPrivate
-from llama_stack.providers.utils.inference.openai_compat import (
-    get_sampling_options,
-    OpenAICompatCompletionChoice,
-    OpenAICompatCompletionResponse,
-    process_chat_completion_response,
-    process_chat_completion_stream_response,
-)
-from llama_stack.providers.utils.inference.prompt_adapter import (
-    chat_completion_request_to_prompt,
-)
-OLLAMA_SUPPORTED_MODELS = {
-    "Llama3.1-8B-Instruct": "llama3.1:8b-instruct-fp16",
-    "Llama3.1-70B-Instruct": "llama3.1:70b-instruct-fp16",
-    "Llama3.2-1B-Instruct": "llama3.2:1b-instruct-fp16",
-    "Llama3.2-3B-Instruct": "llama3.2:3b-instruct-fp16",
-    "Llama-Guard-3-8B": "xe/llamaguard3:latest",
-}
-class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
-    def __init__(self, url: str) -> None:
-        self.url = url
-        self.formatter = ChatFormat(Tokenizer.get_instance())
-    @property
-    def client(self) -> AsyncClient:
-        return AsyncClient(host=self.url)
-    async def initialize(self) -> None:
-        print("Initializing Ollama, checking connectivity to server...")
-        try:
-            await self.client.ps()
-        except httpx.ConnectError as e:
-            raise RuntimeError(
-                "Ollama Server is not running, start it using `ollama serve` in a separate terminal"
-            ) from e
-    async def shutdown(self) -> None:
-        pass
-    async def register_model(self, model: ModelDef) -> None:
-        raise ValueError("Dynamic model registration is not supported")
-    async def list_models(self) -> List[ModelDef]:
-        ollama_to_llama = {v: k for k, v in OLLAMA_SUPPORTED_MODELS.items()}
-        ret = []
-        res = await self.client.ps()
-        for r in res["models"]:
-            if r["model"] not in ollama_to_llama:
-                print(f"Ollama is running a model unknown to Llama Stack: {r['model']}")
-                continue
-            llama_model = ollama_to_llama[r["model"]]
-            ret.append(
-                ModelDef(
-                    identifier=llama_model,
-                    llama_model=llama_model,
-                    metadata={
-                        "ollama_model": r["model"],
-                    },
-                )
-            )
-        return ret
-    def completion(
-        self,
-        model: str,
-        content: InterleavedTextMedia,
-        sampling_params: Optional[SamplingParams] = SamplingParams(),
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-    ) -> AsyncGenerator:
-        raise NotImplementedError()
-    def chat_completion(
-        self,
-        model: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = SamplingParams(),
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-    ) -> AsyncGenerator:
-        request = ChatCompletionRequest(
-            model=model,
-            messages=messages,
-            sampling_params=sampling_params,
-            tools=tools or [],
-            tool_choice=tool_choice,
-            tool_prompt_format=tool_prompt_format,
-            stream=stream,
-            logprobs=logprobs,
-        )
-        if stream:
-            return self._stream_chat_completion(request)
-        else:
-            return self._nonstream_chat_completion(request)
-    def _get_params(self, request: ChatCompletionRequest) -> dict:
-        return {
-            "model": OLLAMA_SUPPORTED_MODELS[request.model],
-            "prompt": chat_completion_request_to_prompt(request, self.formatter),
-            "options": get_sampling_options(request),
-            "raw": True,
-            "stream": request.stream,
-        }
-    async def _nonstream_chat_completion(
-        self, request: ChatCompletionRequest
-    ) -> ChatCompletionResponse:
-        params = self._get_params(request)
-        r = await self.client.generate(**params)
-        assert isinstance(r, dict)
-        choice = OpenAICompatCompletionChoice(
-            finish_reason=r["done_reason"] if r["done"] else None,
-            text=r["response"],
-        )
-        response = OpenAICompatCompletionResponse(
-            choices=[choice],
-        )
-        return process_chat_completion_response(request, response, self.formatter)
-    async def _stream_chat_completion(
-        self, request: ChatCompletionRequest
-    ) -> AsyncGenerator:
-        params = self._get_params(request)
-        async def _generate_and_convert_to_openai_compat():
-            s = await self.client.generate(**params)
-            async for chunk in s:
-                choice = OpenAICompatCompletionChoice(
-                    finish_reason=chunk["done_reason"] if chunk["done"] else None,
-                    text=chunk["response"],
-                )
-                yield OpenAICompatCompletionResponse(
-                    choices=[choice],
-                )
-        stream = _generate_and_convert_to_openai_compat()
-        async for chunk in process_chat_completion_stream_response(
-            request, stream, self.formatter
-        ):
-            yield chunk
-    async def embeddings(
-        self,
-        model: str,
-        contents: List[InterleavedTextMedia],
-    ) -> EmbeddingsResponse:
-        raise NotImplementedError()

llama_stack/providers/adapters/inference/sample/sample.py DELETED Viewed

@@ -1,23 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from .config import SampleConfig
-from llama_stack.apis.inference import *  # noqa: F403
-class SampleInferenceImpl(Inference):
-    def __init__(self, config: SampleConfig):
-        self.config = config
-    async def register_model(self, model: ModelDef) -> None:
-        # these are the model names the Llama Stack will use to route requests to this provider
-        # perform validation here if necessary
-        pass
-    async def initialize(self):
-        pass

llama_stack/providers/adapters/inference/tgi/config.py DELETED Viewed

@@ -1,43 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Optional
-from llama_models.schema_utils import json_schema_type
-from pydantic import BaseModel, Field
-@json_schema_type
-class TGIImplConfig(BaseModel):
-    url: str = Field(
-        description="The URL for the TGI endpoint (e.g. 'http://localhost:8080')",
-    )
-    api_token: Optional[str] = Field(
-        default=None,
-        description="A bearer token if your TGI endpoint is protected.",
-    )
-@json_schema_type
-class InferenceEndpointImplConfig(BaseModel):
-    endpoint_name: str = Field(
-        description="The name of the Hugging Face Inference Endpoint in the format of '{namespace}/{endpoint_name}' (e.g. 'my-cool-org/meta-llama-3-1-8b-instruct-rce'). Namespace is optional and will default to the user account if not provided.",
-    )
-    api_token: Optional[str] = Field(
-        default=None,
-        description="Your Hugging Face user access token (will default to locally saved token if not provided)",
-    )
-@json_schema_type
-class InferenceAPIImplConfig(BaseModel):
-    huggingface_repo: str = Field(
-        description="The model ID of the model on the Hugging Face Hub (e.g. 'meta-llama/Meta-Llama-3.1-70B-Instruct')",
-    )
-    api_token: Optional[str] = Field(
-        default=None,
-        description="Your Hugging Face user access token (will default to locally saved token if not provided)",
-    )

llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl

llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl