PyPI - llama-stack - Versions diffs - 0.3.3__tar.gz → 0.3.5__tar.gz - Mend

llama-stack 0.3.3 (tar.gz) → 0.3.5 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (633)

{llama_stack-0.3.3/llama_stack.egg-info → llama_stack-0.3.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llama_stack
-Version: 0.3.3
+Version: 0.3.5
 Summary: Llama Stack
 Author-email: Meta Llama <llama-oss@meta.com>
 License: MIT
@@ -22,7 +22,7 @@ Requires-Dist: fire
 Requires-Dist: httpx
 Requires-Dist: jinja2>=3.1.6
 Requires-Dist: jsonschema
-Requires-Dist: llama-stack-client>=0.3.3
+Requires-Dist: llama-stack-client==0.3.5
 Requires-Dist: openai>=1.107
 Requires-Dist: prompt-toolkit
 Requires-Dist: python-dotenv
@@ -41,10 +41,11 @@ Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.30.0
 Requires-Dist: aiosqlite>=0.21.0
 Requires-Dist: asyncpg
 Requires-Dist: sqlalchemy[asyncio]>=2.0.41
+Requires-Dist: starlette>=0.49.1
 Provides-Extra: ui
 Requires-Dist: streamlit; extra == "ui"
 Requires-Dist: pandas; extra == "ui"
-Requires-Dist: llama-stack-client>=0.3.3; extra == "ui"
+Requires-Dist: llama-stack-client==0.3.5; extra == "ui"
 Requires-Dist: streamlit-option-menu; extra == "ui"
 Dynamic: license-file
@@ -60,83 +61,6 @@ Dynamic: license-file
 [**Quick Start**](https://llamastack.github.io/docs/getting_started/quickstart) | [**Documentation**](https://llamastack.github.io/docs) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
-### ✨🎉 Llama 4 Support  🎉✨
-We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.
-<details>
-<summary>👋 Click here to see how to run Llama 4 models on Llama Stack </summary>
-\
-*Note you need 8xH100 GPU-host to run these models*
-```bash
-pip install -U llama_stack
-MODEL="Llama-4-Scout-17B-16E-Instruct"
-# get meta url from llama.com
-huggingface-cli download meta-llama/$MODEL --local-dir ~/.llama/$MODEL
-# install dependencies for the distribution
-llama stack list-deps meta-reference-gpu | xargs -L1 uv pip install
-# start a llama stack server
-INFERENCE_MODEL=meta-llama/$MODEL llama stack run meta-reference-gpu
-# install client to interact with the server
-pip install llama-stack-client
-```
-### CLI
-```bash
-# Run a chat completion
-MODEL="Llama-4-Scout-17B-16E-Instruct"
-llama-stack-client --endpoint http://localhost:8321 \
-inference chat-completion \
---model-id meta-llama/$MODEL \
---message "write a haiku for meta's llama 4 models"
-OpenAIChatCompletion(
-    ...
-    choices=[
-        OpenAIChatCompletionChoice(
-            finish_reason='stop',
-            index=0,
-            message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
-                role='assistant',
-                content='...**Silent minds awaken,**  \n**Whispers of billions of words,**  \n**Reasoning breaks the night.**  \n\n—  \n*This haiku blends the essence of LLaMA 4\'s capabilities with nature-inspired metaphor, evoking its vast training data and transformative potential.*',
-                ...
-            ),
-            ...
-        )
-    ],
-    ...
-)
-```
-### Python SDK
-```python
-from llama_stack_client import LlamaStackClient
-client = LlamaStackClient(base_url=f"http://localhost:8321")
-model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
-prompt = "Write a haiku about coding"
-print(f"User> {prompt}")
-response = client.chat.completions.create(
-    model=model_id,
-    messages=[
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": prompt},
-    ],
-)
-print(f"Assistant> {response.choices[0].message.content}")
-```
-As more providers start supporting Llama 4, you can use them in Llama Stack as well. We are adding to the list. Stay tuned!
-</details>
 ### 🚀 One-Line Installer 🚀
 To try Llama Stack locally, run:

{llama_stack-0.3.3 → llama_stack-0.3.5}/README.md RENAMED Viewed

@@ -10,83 +10,6 @@
 [**Quick Start**](https://llamastack.github.io/docs/getting_started/quickstart) | [**Documentation**](https://llamastack.github.io/docs) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
-### ✨🎉 Llama 4 Support  🎉✨
-We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.
-<details>
-<summary>👋 Click here to see how to run Llama 4 models on Llama Stack </summary>
-\
-*Note you need 8xH100 GPU-host to run these models*
-```bash
-pip install -U llama_stack
-MODEL="Llama-4-Scout-17B-16E-Instruct"
-# get meta url from llama.com
-huggingface-cli download meta-llama/$MODEL --local-dir ~/.llama/$MODEL
-# install dependencies for the distribution
-llama stack list-deps meta-reference-gpu | xargs -L1 uv pip install
-# start a llama stack server
-INFERENCE_MODEL=meta-llama/$MODEL llama stack run meta-reference-gpu
-# install client to interact with the server
-pip install llama-stack-client
-```
-### CLI
-```bash
-# Run a chat completion
-MODEL="Llama-4-Scout-17B-16E-Instruct"
-llama-stack-client --endpoint http://localhost:8321 \
-inference chat-completion \
---model-id meta-llama/$MODEL \
---message "write a haiku for meta's llama 4 models"
-OpenAIChatCompletion(
-    ...
-    choices=[
-        OpenAIChatCompletionChoice(
-            finish_reason='stop',
-            index=0,
-            message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
-                role='assistant',
-                content='...**Silent minds awaken,**  \n**Whispers of billions of words,**  \n**Reasoning breaks the night.**  \n\n—  \n*This haiku blends the essence of LLaMA 4\'s capabilities with nature-inspired metaphor, evoking its vast training data and transformative potential.*',
-                ...
-            ),
-            ...
-        )
-    ],
-    ...
-)
-```
-### Python SDK
-```python
-from llama_stack_client import LlamaStackClient
-client = LlamaStackClient(base_url=f"http://localhost:8321")
-model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
-prompt = "Write a haiku about coding"
-print(f"User> {prompt}")
-response = client.chat.completions.create(
-    model=model_id,
-    messages=[
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": prompt},
-    ],
-)
-print(f"Assistant> {response.choices[0].message.content}")
-```
-As more providers start supporting Llama 4, you can use them in Llama Stack as well. We are adding to the list. Stay tuned!
-</details>
 ### 🚀 One-Line Installer 🚀
 To try Llama Stack locally, run:

{llama_stack-0.3.3 → llama_stack-0.3.5}/llama_stack/core/resolver.py RENAMED Viewed

@@ -391,6 +391,9 @@ async def instantiate_provider(
         method = "get_adapter_impl"
         args = [config, deps]
+        if "policy" in inspect.signature(getattr(module, method)).parameters:
+            args.append(policy)
     elif isinstance(provider_spec, AutoRoutedProviderSpec):
         method = "get_auto_router_impl"

{llama_stack-0.3.3 → llama_stack-0.3.5}/llama_stack/core/routers/inference.py RENAMED Viewed

@@ -49,10 +49,17 @@ from llama_stack.apis.inference import (
 )
 from llama_stack.apis.models import Model, ModelType
 from llama_stack.apis.telemetry import MetricEvent, MetricInResponse, Telemetry
+from llama_stack.core.access_control.access_control import is_action_allowed
+from llama_stack.core.datatypes import ModelWithOwner
+from llama_stack.core.request_headers import get_authenticated_user
 from llama_stack.log import get_logger
 from llama_stack.models.llama.llama3.chat_format import ChatFormat
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer
-from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable
+from llama_stack.providers.datatypes import (
+    HealthResponse,
+    HealthStatus,
+    RoutingTable,
+)
 from llama_stack.providers.utils.inference.inference_store import InferenceStore
 from llama_stack.providers.utils.telemetry.tracing import enqueue_event, get_current_span
@@ -186,15 +193,41 @@ class InferenceRouter(Inference):
             provider = await self.routing_table.get_provider_impl(model.identifier)
             return provider, model.provider_resource_id
+        # Handles cases where clients use the provider format directly
+        return await self._get_provider_by_fallback(model_id, expected_model_type)
+    async def _get_provider_by_fallback(self, model_id: str, expected_model_type: str) -> tuple[Inference, str]:
+        """
+        Handle fallback case where model_id is in provider_id/provider_resource_id format.
+        """
         splits = model_id.split("/", maxsplit=1)
         if len(splits) != 2:
             raise ModelNotFoundError(model_id)
         provider_id, provider_resource_id = splits
+        # Check if provider exists
         if provider_id not in self.routing_table.impls_by_provider_id:
             logger.warning(f"Provider {provider_id} not found for model {model_id}")
             raise ModelNotFoundError(model_id)
+        # Create a temporary model object for RBAC check
+        temp_model = ModelWithOwner(
+            identifier=model_id,
+            provider_id=provider_id,
+            provider_resource_id=provider_resource_id,
+            model_type=expected_model_type,
+            metadata={},  # Empty metadata for temporary object
+        )
+        # Perform RBAC check
+        user = get_authenticated_user()
+        if not is_action_allowed(self.routing_table.policy, "read", temp_model, user):
+            logger.debug(
+                f"Access denied to model '{model_id}' via fallback path for user {user.principal if user else 'anonymous'}"
+            )
+            raise ModelNotFoundError(model_id)
         return self.routing_table.impls_by_provider_id[provider_id], provider_resource_id
     async def openai_completion(

{llama_stack-0.3.3 → llama_stack-0.3.5}/llama_stack/providers/registry/agents.py RENAMED Viewed

@@ -23,7 +23,7 @@ def available_providers() -> list[ProviderSpec]:
                 "pillow",
                 "pandas",
                 "scikit-learn",
-                "mcp>=1.8.1",
+                "mcp>=1.23.0",
             ]
             + kvstore_dependencies(),  # TODO make this dynamic based on the kvstore config
             module="llama_stack.providers.inline.agents.meta_reference",

{llama_stack-0.3.3 → llama_stack-0.3.5}/llama_stack/providers/registry/tool_runtime.py RENAMED Viewed

@@ -80,7 +80,7 @@ def available_providers() -> list[ProviderSpec]:
             provider_type="remote::model-context-protocol",
             module="llama_stack.providers.remote.tool_runtime.model_context_protocol",
             config_class="llama_stack.providers.remote.tool_runtime.model_context_protocol.config.MCPProviderConfig",
-            pip_packages=["mcp>=1.8.1"],
+            pip_packages=["mcp>=1.23.0"],
             provider_data_validator="llama_stack.providers.remote.tool_runtime.model_context_protocol.config.MCPProviderDataValidator",
             description="Model Context Protocol (MCP) tool for standardized tool calling and context management.",
         ),

{llama_stack-0.3.3 → llama_stack-0.3.5}/llama_stack/providers/remote/inference/watsonx/watsonx.py RENAMED Viewed

@@ -283,8 +283,8 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
             # ...
             provider_resource_id = f"{self.__provider_id__}/{model_spec['model_id']}"
             if "embedding" in functions:
-                embedding_dimension = model_spec["model_limits"]["embedding_dimension"]
-                context_length = model_spec["model_limits"]["max_sequence_length"]
+                embedding_dimension = model_spec.get("model_limits", {}).get("embedding_dimension", 0)
+                context_length = model_spec.get("model_limits", {}).get("max_sequence_length", 0)
                 embedding_metadata = {
                     "embedding_dimension": embedding_dimension,
                     "context_length": context_length,
@@ -306,10 +306,6 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
                     metadata={},
                     model_type=ModelType.llm,
                 )
-                # In theory, I guess it is possible that a model could be both an embedding model and a text chat model.
-                # In that case, the cache will record the generator Model object, and the list which we return will have
-                # both the generator Model object and the text chat Model object.  That's fine because the cache is
-                # only used for check_model_availability() anyway.
                 self._model_cache[provider_resource_id] = model
                 models.append(model)
         return models

{llama_stack-0.3.3 → llama_stack-0.3.5}/llama_stack/providers/utils/inference/inference_store.py RENAMED Viewed

@@ -56,7 +56,7 @@ class InferenceStore:
             logger.debug("Write queue disabled for SQLite (WAL mode handles concurrency)")
         await self.sql_store.create_table(
-            "chat_completions",
+            self.reference.table_name,
             {
                 "id": ColumnDefinition(type=ColumnType.STRING, primary_key=True),
                 "created": ColumnType.INTEGER,
@@ -66,14 +66,6 @@ class InferenceStore:
             },
         )
-        if self.enable_write_queue:
-            self._queue = asyncio.Queue(maxsize=self._max_write_queue_size)
-            for _ in range(self._num_writers):
-                self._worker_tasks.append(asyncio.create_task(self._worker_loop()))
-            logger.debug(
-                f"Inference store write queue enabled with {self._num_writers} writers, max queue size {self._max_write_queue_size}"
-            )
     async def shutdown(self) -> None:
         if not self._worker_tasks:
             return
@@ -161,7 +153,7 @@ class InferenceStore:
         try:
             await self.sql_store.insert(
-                table="chat_completions",
+                table=self.reference.table_name,
                 data=record_data,
             )
         except IntegrityError as e:
@@ -173,7 +165,7 @@ class InferenceStore:
             error_message = str(e.orig) if e.orig else str(e)
             if self._is_unique_constraint_error(error_message):
                 # Update the existing record instead
-                await self.sql_store.update(table="chat_completions", data=record_data, where={"id": data["id"]})
+                await self.sql_store.update(table=self.reference.table_name, data=record_data, where={"id": data["id"]})
             else:
                 # Re-raise if it's not a unique constraint error
                 raise
@@ -217,7 +209,7 @@ class InferenceStore:
             where_conditions["model"] = model
         paginated_result = await self.sql_store.fetch_all(
-            table="chat_completions",
+            table=self.reference.table_name,
             where=where_conditions if where_conditions else None,
             order_by=[("created", order.value)],
             cursor=("id", after) if after else None,
@@ -246,7 +238,7 @@ class InferenceStore:
             raise ValueError("Inference store is not initialized")
         row = await self.sql_store.fetch_one(
-            table="chat_completions",
+            table=self.reference.table_name,
             where={"id": completion_id},
         )

{llama_stack-0.3.3 → llama_stack-0.3.5}/llama_stack/providers/utils/responses/responses_store.py RENAMED Viewed

@@ -3,7 +3,6 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-import asyncio
 from llama_stack.apis.agents import (
     Order,
@@ -18,12 +17,12 @@ from llama_stack.apis.agents.openai_responses import (
 )
 from llama_stack.apis.inference import OpenAIMessageParam
 from llama_stack.core.datatypes import AccessRule
-from llama_stack.core.storage.datatypes import ResponsesStoreReference, SqlStoreReference, StorageBackendType
+from llama_stack.core.storage.datatypes import ResponsesStoreReference, SqlStoreReference
 from llama_stack.log import get_logger
 from ..sqlstore.api import ColumnDefinition, ColumnType
 from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore
-from ..sqlstore.sqlstore import _SQLSTORE_BACKENDS, sqlstore_impl
+from ..sqlstore.sqlstore import sqlstore_impl
 logger = get_logger(name=__name__, category="openai_responses")
@@ -60,13 +59,6 @@ class ResponsesStore:
         base_store = sqlstore_impl(self.reference)
         self.sql_store = AuthorizedSqlStore(base_store, self.policy)
-        # Disable write queue for SQLite since WAL mode handles concurrency
-        # Keep it enabled for other backends (like Postgres) for performance
-        backend_config = _SQLSTORE_BACKENDS.get(self.reference.backend)
-        if backend_config and backend_config.type == StorageBackendType.SQL_SQLITE:
-            self.enable_write_queue = False
-            logger.debug("Write queue disabled for SQLite (WAL mode handles concurrency)")
         await self.sql_store.create_table(
             "openai_responses",
             {
@@ -85,14 +77,6 @@ class ResponsesStore:
             },
         )
-        if self.enable_write_queue:
-            self._queue = asyncio.Queue(maxsize=self._max_write_queue_size)
-            for _ in range(self._num_writers):
-                self._worker_tasks.append(asyncio.create_task(self._worker_loop()))
-            logger.debug(
-                f"Responses store write queue enabled with {self._num_writers} writers, max queue size {self._max_write_queue_size}"
-            )
     async def shutdown(self) -> None:
         return

{llama_stack-0.3.3 → llama_stack-0.3.5/llama_stack.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llama_stack
-Version: 0.3.3
+Version: 0.3.5
 Summary: Llama Stack
 Author-email: Meta Llama <llama-oss@meta.com>
 License: MIT
@@ -22,7 +22,7 @@ Requires-Dist: fire
 Requires-Dist: httpx
 Requires-Dist: jinja2>=3.1.6
 Requires-Dist: jsonschema
-Requires-Dist: llama-stack-client>=0.3.3
+Requires-Dist: llama-stack-client==0.3.5
 Requires-Dist: openai>=1.107
 Requires-Dist: prompt-toolkit
 Requires-Dist: python-dotenv
@@ -41,10 +41,11 @@ Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.30.0
 Requires-Dist: aiosqlite>=0.21.0
 Requires-Dist: asyncpg
 Requires-Dist: sqlalchemy[asyncio]>=2.0.41
+Requires-Dist: starlette>=0.49.1
 Provides-Extra: ui
 Requires-Dist: streamlit; extra == "ui"
 Requires-Dist: pandas; extra == "ui"
-Requires-Dist: llama-stack-client>=0.3.3; extra == "ui"
+Requires-Dist: llama-stack-client==0.3.5; extra == "ui"
 Requires-Dist: streamlit-option-menu; extra == "ui"
 Dynamic: license-file
@@ -60,83 +61,6 @@ Dynamic: license-file
 [**Quick Start**](https://llamastack.github.io/docs/getting_started/quickstart) | [**Documentation**](https://llamastack.github.io/docs) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
-### ✨🎉 Llama 4 Support  🎉✨
-We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.
-<details>
-<summary>👋 Click here to see how to run Llama 4 models on Llama Stack </summary>
-\
-*Note you need 8xH100 GPU-host to run these models*
-```bash
-pip install -U llama_stack
-MODEL="Llama-4-Scout-17B-16E-Instruct"
-# get meta url from llama.com
-huggingface-cli download meta-llama/$MODEL --local-dir ~/.llama/$MODEL
-# install dependencies for the distribution
-llama stack list-deps meta-reference-gpu | xargs -L1 uv pip install
-# start a llama stack server
-INFERENCE_MODEL=meta-llama/$MODEL llama stack run meta-reference-gpu
-# install client to interact with the server
-pip install llama-stack-client
-```
-### CLI
-```bash
-# Run a chat completion
-MODEL="Llama-4-Scout-17B-16E-Instruct"
-llama-stack-client --endpoint http://localhost:8321 \
-inference chat-completion \
---model-id meta-llama/$MODEL \
---message "write a haiku for meta's llama 4 models"
-OpenAIChatCompletion(
-    ...
-    choices=[
-        OpenAIChatCompletionChoice(
-            finish_reason='stop',
-            index=0,
-            message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
-                role='assistant',
-                content='...**Silent minds awaken,**  \n**Whispers of billions of words,**  \n**Reasoning breaks the night.**  \n\n—  \n*This haiku blends the essence of LLaMA 4\'s capabilities with nature-inspired metaphor, evoking its vast training data and transformative potential.*',
-                ...
-            ),
-            ...
-        )
-    ],
-    ...
-)
-```
-### Python SDK
-```python
-from llama_stack_client import LlamaStackClient
-client = LlamaStackClient(base_url=f"http://localhost:8321")
-model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
-prompt = "Write a haiku about coding"
-print(f"User> {prompt}")
-response = client.chat.completions.create(
-    model=model_id,
-    messages=[
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": prompt},
-    ],
-)
-print(f"Assistant> {response.choices[0].message.content}")
-```
-As more providers start supporting Llama 4, you can use them in Llama Stack as well. We are adding to the list. Stay tuned!
-</details>
 ### 🚀 One-Line Installer 🚀
 To try Llama Stack locally, run:

{llama_stack-0.3.3 → llama_stack-0.3.5}/llama_stack.egg-info/requires.txt RENAMED Viewed

@@ -4,7 +4,7 @@ fire
 httpx
 jinja2>=3.1.6
 jsonschema
-llama-stack-client>=0.3.3
+llama-stack-client==0.3.5
 openai>=1.107
 prompt-toolkit
 python-dotenv
@@ -23,9 +23,10 @@ opentelemetry-exporter-otlp-proto-http>=1.30.0
 aiosqlite>=0.21.0
 asyncpg
 sqlalchemy[asyncio]>=2.0.41
+starlette>=0.49.1
 [ui]
 streamlit
 pandas
-llama-stack-client>=0.3.3
+llama-stack-client==0.3.5
 streamlit-option-menu

{llama_stack-0.3.3 → llama_stack-0.3.5}/pyproject.toml RENAMED Viewed

@@ -7,7 +7,7 @@ required-version = ">=0.7.0"
 [project]
 name = "llama_stack"
-version = "0.3.3"
+version = "0.3.5"
 authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }]
 description = "Llama Stack"
 readme = "README.md"
@@ -30,7 +30,7 @@ dependencies = [
     "httpx",
     "jinja2>=3.1.6",
     "jsonschema",
-    "llama-stack-client>=0.3.3",
+    "llama-stack-client==0.3.5",
     "openai>=1.107",                                  # for expires_after support
     "prompt-toolkit",
     "python-dotenv",
@@ -49,13 +49,14 @@ dependencies = [
     "aiosqlite>=0.21.0",                              # server - for metadata store
     "asyncpg",                                        # for metadata store
     "sqlalchemy[asyncio]>=2.0.41",                    # server - for conversations
+    "starlette>=0.49.1",
 ]
 [project.optional-dependencies]
 ui = [
     "streamlit",
     "pandas",
-    "llama-stack-client>=0.3.3",
+    "llama-stack-client==0.3.5",
     "streamlit-option-menu",
 ]