llama-stack 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. llama_stack/cli/stack/run.py +3 -0
  2. llama_stack/core/library_client.py +80 -3
  3. llama_stack/core/routing_tables/common.py +11 -0
  4. llama_stack/core/routing_tables/vector_stores.py +4 -0
  5. llama_stack/core/stack.py +38 -11
  6. llama_stack/core/storage/kvstore/kvstore.py +11 -0
  7. llama_stack/core/storage/kvstore/mongodb/mongodb.py +5 -0
  8. llama_stack/core/storage/kvstore/postgres/postgres.py +8 -0
  9. llama_stack/core/storage/kvstore/redis/redis.py +5 -0
  10. llama_stack/core/storage/sqlstore/sqlalchemy_sqlstore.py +8 -0
  11. llama_stack/core/storage/sqlstore/sqlstore.py +8 -0
  12. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +60 -34
  13. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +4 -0
  14. llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +9 -1
  15. llama_stack/providers/inline/tool_runtime/rag/memory.py +8 -3
  16. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +13 -1
  17. llama_stack/providers/utils/inference/embedding_mixin.py +20 -16
  18. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +33 -0
  19. llama_stack/providers/utils/memory/vector_store.py +9 -4
  20. llama_stack/providers/utils/tools/mcp.py +258 -16
  21. {llama_stack-0.4.1.dist-info → llama_stack-0.4.3.dist-info}/METADATA +2 -2
  22. {llama_stack-0.4.1.dist-info → llama_stack-0.4.3.dist-info}/RECORD +96 -29
  23. {llama_stack-0.4.1.dist-info → llama_stack-0.4.3.dist-info}/WHEEL +1 -1
  24. llama_stack_api/internal/kvstore.py +2 -0
  25. llama_stack_api/internal/sqlstore.py +2 -0
  26. llama_stack_api/llama_stack_api/__init__.py +945 -0
  27. llama_stack_api/llama_stack_api/admin/__init__.py +45 -0
  28. llama_stack_api/llama_stack_api/admin/api.py +72 -0
  29. llama_stack_api/llama_stack_api/admin/fastapi_routes.py +117 -0
  30. llama_stack_api/llama_stack_api/admin/models.py +113 -0
  31. llama_stack_api/llama_stack_api/agents.py +173 -0
  32. llama_stack_api/llama_stack_api/batches/__init__.py +40 -0
  33. llama_stack_api/llama_stack_api/batches/api.py +53 -0
  34. llama_stack_api/llama_stack_api/batches/fastapi_routes.py +113 -0
  35. llama_stack_api/llama_stack_api/batches/models.py +78 -0
  36. llama_stack_api/llama_stack_api/benchmarks/__init__.py +43 -0
  37. llama_stack_api/llama_stack_api/benchmarks/api.py +39 -0
  38. llama_stack_api/llama_stack_api/benchmarks/fastapi_routes.py +109 -0
  39. llama_stack_api/llama_stack_api/benchmarks/models.py +109 -0
  40. llama_stack_api/llama_stack_api/common/__init__.py +5 -0
  41. llama_stack_api/llama_stack_api/common/content_types.py +101 -0
  42. llama_stack_api/llama_stack_api/common/errors.py +95 -0
  43. llama_stack_api/llama_stack_api/common/job_types.py +38 -0
  44. llama_stack_api/llama_stack_api/common/responses.py +77 -0
  45. llama_stack_api/llama_stack_api/common/training_types.py +47 -0
  46. llama_stack_api/llama_stack_api/common/type_system.py +146 -0
  47. llama_stack_api/llama_stack_api/connectors.py +146 -0
  48. llama_stack_api/llama_stack_api/conversations.py +270 -0
  49. llama_stack_api/llama_stack_api/datasetio.py +55 -0
  50. llama_stack_api/llama_stack_api/datasets/__init__.py +61 -0
  51. llama_stack_api/llama_stack_api/datasets/api.py +35 -0
  52. llama_stack_api/llama_stack_api/datasets/fastapi_routes.py +104 -0
  53. llama_stack_api/llama_stack_api/datasets/models.py +152 -0
  54. llama_stack_api/llama_stack_api/datatypes.py +373 -0
  55. llama_stack_api/llama_stack_api/eval.py +137 -0
  56. llama_stack_api/llama_stack_api/file_processors/__init__.py +27 -0
  57. llama_stack_api/llama_stack_api/file_processors/api.py +64 -0
  58. llama_stack_api/llama_stack_api/file_processors/fastapi_routes.py +78 -0
  59. llama_stack_api/llama_stack_api/file_processors/models.py +42 -0
  60. llama_stack_api/llama_stack_api/files/__init__.py +35 -0
  61. llama_stack_api/llama_stack_api/files/api.py +51 -0
  62. llama_stack_api/llama_stack_api/files/fastapi_routes.py +124 -0
  63. llama_stack_api/llama_stack_api/files/models.py +107 -0
  64. llama_stack_api/llama_stack_api/inference.py +1169 -0
  65. llama_stack_api/llama_stack_api/inspect_api/__init__.py +37 -0
  66. llama_stack_api/llama_stack_api/inspect_api/api.py +25 -0
  67. llama_stack_api/llama_stack_api/inspect_api/fastapi_routes.py +76 -0
  68. llama_stack_api/llama_stack_api/inspect_api/models.py +28 -0
  69. llama_stack_api/llama_stack_api/internal/__init__.py +9 -0
  70. llama_stack_api/llama_stack_api/internal/kvstore.py +28 -0
  71. llama_stack_api/llama_stack_api/internal/sqlstore.py +81 -0
  72. llama_stack_api/llama_stack_api/models.py +171 -0
  73. llama_stack_api/llama_stack_api/openai_responses.py +1468 -0
  74. llama_stack_api/llama_stack_api/post_training.py +370 -0
  75. llama_stack_api/llama_stack_api/prompts.py +203 -0
  76. llama_stack_api/llama_stack_api/providers/__init__.py +33 -0
  77. llama_stack_api/llama_stack_api/providers/api.py +16 -0
  78. llama_stack_api/llama_stack_api/providers/fastapi_routes.py +57 -0
  79. llama_stack_api/llama_stack_api/providers/models.py +24 -0
  80. llama_stack_api/llama_stack_api/py.typed +0 -0
  81. llama_stack_api/llama_stack_api/rag_tool.py +168 -0
  82. llama_stack_api/llama_stack_api/resource.py +37 -0
  83. llama_stack_api/llama_stack_api/router_utils.py +160 -0
  84. llama_stack_api/llama_stack_api/safety.py +132 -0
  85. llama_stack_api/llama_stack_api/schema_utils.py +208 -0
  86. llama_stack_api/llama_stack_api/scoring.py +93 -0
  87. llama_stack_api/llama_stack_api/scoring_functions.py +211 -0
  88. llama_stack_api/llama_stack_api/shields.py +93 -0
  89. llama_stack_api/llama_stack_api/tools.py +226 -0
  90. llama_stack_api/llama_stack_api/vector_io.py +941 -0
  91. llama_stack_api/llama_stack_api/vector_stores.py +53 -0
  92. llama_stack_api/llama_stack_api/version.py +9 -0
  93. llama_stack_api/vector_stores.py +2 -0
  94. {llama_stack-0.4.1.dist-info → llama_stack-0.4.3.dist-info}/entry_points.txt +0 -0
  95. {llama_stack-0.4.1.dist-info → llama_stack-0.4.3.dist-info}/licenses/LICENSE +0 -0
  96. {llama_stack-0.4.1.dist-info → llama_stack-0.4.3.dist-info}/top_level.txt +0 -0
llama_stack/cli/stack/run.py CHANGED
@@ -202,6 +202,9 @@ class StackRun(Subcommand):
  # Set the config file in environment so create_app can find it
  os.environ["LLAMA_STACK_CONFIG"] = str(config_file)

+ # disable together banner that spams llama stack run every time
+ os.environ["TOGETHER_NO_BANNER"] = "1"
+
  uvicorn_config = {
  "factory": True,
  "host": host,
llama_stack/core/library_client.py CHANGED
@@ -161,6 +161,45 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
  """
  pass

+ def shutdown(self) -> None:
+ """Shutdown the client and release all resources.
+
+ This method should be called when you're done using the client to properly
+ close database connections and release other resources. Failure to call this
+ method may result in the program hanging on exit while waiting for background
+ threads to complete.
+
+ This method is idempotent and can be called multiple times safely.
+
+ Example:
+ client = LlamaStackAsLibraryClient("starter")
+ # ... use the client ...
+ client.shutdown()
+ """
+ loop = self.loop
+ asyncio.set_event_loop(loop)
+ try:
+ loop.run_until_complete(self.async_client.shutdown())
+ finally:
+ loop.close()
+ asyncio.set_event_loop(None)
+
+ def __enter__(self) -> "LlamaStackAsLibraryClient":
+ """Enter the context manager.
+
+ The client is already initialized in __init__, so this just returns self.
+
+ Example:
+ with LlamaStackAsLibraryClient("starter") as client:
+ response = client.models.list()
+ # Client is automatically shut down here
+ """
+ return self
+
+ def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+ """Exit the context manager and shut down the client."""
+ self.shutdown()
+
  def request(self, *args, **kwargs):
  loop = self.loop
  asyncio.set_event_loop(loop)
@@ -224,6 +263,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
  self.custom_provider_registry = custom_provider_registry
  self.provider_data = provider_data
  self.route_impls: RouteImpls | None = None  # Initialize to None to prevent AttributeError
+ self.stack: Stack | None = None

  def _remove_root_logger_handlers(self):
  """
@@ -246,9 +286,9 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
  try:
  self.route_impls = None

- stack = Stack(self.config, self.custom_provider_registry)
- await stack.initialize()
- self.impls = stack.impls
+ self.stack = Stack(self.config, self.custom_provider_registry)
+ await self.stack.initialize()
+ self.impls = self.stack.impls
  except ModuleNotFoundError as _e:
  cprint(_e.msg, color="red", file=sys.stderr)
  cprint(
@@ -283,6 +323,43 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
  self.route_impls = initialize_route_impls(self.impls)
  return True

+ async def shutdown(self) -> None:
+ """Shutdown the client and release all resources.
+
+ This method should be called when you're done using the client to properly
+ close database connections and release other resources. Failure to call this
+ method may result in the program hanging on exit while waiting for background
+ threads to complete.
+
+ This method is idempotent and can be called multiple times safely.
+
+ Example:
+ client = AsyncLlamaStackAsLibraryClient("starter")
+ await client.initialize()
+ # ... use the client ...
+ await client.shutdown()
+ """
+ if self.stack:
+ await self.stack.shutdown()
+ self.stack = None
+
+ async def __aenter__(self) -> "AsyncLlamaStackAsLibraryClient":
+ """Enter the async context manager.
+
+ Initializes the client and returns it.
+
+ Example:
+ async with AsyncLlamaStackAsLibraryClient("starter") as client:
+ response = await client.models.list()
+ # Client is automatically shut down here
+ """
+ await self.initialize()
+ return self
+
+ async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+ """Exit the async context manager and shut down the client."""
+ await self.shutdown()
+
  async def request(
  self,
  cast_to: Any,
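
Note on usage: with the context-manager support added above, callers no longer need to call shutdown() by hand. A minimal sketch based on the docstrings in this hunk (the "starter" distribution name and the models.list() call are taken from those docstrings; everything else is illustrative):

    import asyncio

    from llama_stack.core.library_client import (
        AsyncLlamaStackAsLibraryClient,
        LlamaStackAsLibraryClient,
    )

    # Synchronous client: __enter__ returns the already-initialized client,
    # __exit__ calls shutdown() so DB connections and background threads are released.
    with LlamaStackAsLibraryClient("starter") as client:
        print(client.models.list())

    # Async client: __aenter__ runs initialize(), __aexit__ awaits shutdown().
    async def main() -> None:
        async with AsyncLlamaStackAsLibraryClient("starter") as client:
            print(await client.models.list())

    asyncio.run(main())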
llama_stack/core/routing_tables/common.py CHANGED
@@ -209,6 +209,17 @@ class CommonRoutingTableImpl(RoutingTable):
  logger.info(f"Setting owner for {obj.type} '{obj.identifier}' to {obj.owner.principal}")

  registered_obj = await register_object_with_provider(obj, p)
+
+ # Ensure OpenAI metadata exists for vector stores
+ if obj.type == ResourceType.vector_store.value:
+ if hasattr(p, "_ensure_openai_metadata_exists"):
+ await p._ensure_openai_metadata_exists(obj)
+ else:
+ logger.warning(
+ f"Provider {obj.provider_id} does not support OpenAI metadata creation. "
+ f"Vector store {obj.identifier} may not work with OpenAI-compatible APIs."
+ )
+
  # TODO: This needs to be fixed for all APIs once they return the registered object
  if obj.type == ResourceType.model.value:
  await self.dist_registry.register(registered_obj)
llama_stack/core/routing_tables/vector_stores.py CHANGED
@@ -55,6 +55,10 @@ class VectorStoresRoutingTable(CommonRoutingTableImpl):

  # Internal methods only - no public API exposure

+ async def list_vector_stores(self) -> list[VectorStoreWithOwner]:
+ """List all registered vector stores."""
+ return await self.get_all_with_type(ResourceType.vector_store.value)
+
  async def register_vector_store(
  self,
  vector_store_id: str,
llama_stack/core/stack.py CHANGED
@@ -53,6 +53,7 @@ from llama_stack_api import (
  PostTraining,
  Prompts,
  Providers,
+ RegisterBenchmarkRequest,
  Safety,
  Scoring,
  ScoringFunctions,
@@ -61,6 +62,7 @@ from llama_stack_api import (
  ToolRuntime,
  VectorIO,
  )
+ from llama_stack_api.datasets import RegisterDatasetRequest

  logger = get_logger(name=__name__, category="core")

@@ -91,18 +93,22 @@ class LlamaStack(
  pass


+ # Resources to register based on configuration.
+ # If a request class is specified, the configuration object will be converted to this class before invoking the registration method.
  RESOURCES = [
- ("models", Api.models, "register_model", "list_models"),
- ("shields", Api.shields, "register_shield", "list_shields"),
- ("datasets", Api.datasets, "register_dataset", "list_datasets"),
+ ("models", Api.models, "register_model", "list_models", None),
+ ("shields", Api.shields, "register_shield", "list_shields", None),
+ ("datasets", Api.datasets, "register_dataset", "list_datasets", RegisterDatasetRequest),
  (
  "scoring_fns",
  Api.scoring_functions,
  "register_scoring_function",
  "list_scoring_functions",
+ None,
  ),
- ("benchmarks", Api.benchmarks, "register_benchmark", "list_benchmarks"),
- ("tool_groups", Api.tool_groups, "register_tool_group", "list_tool_groups"),
+ ("benchmarks", Api.benchmarks, "register_benchmark", "list_benchmarks", RegisterBenchmarkRequest),
+ ("tool_groups", Api.tool_groups, "register_tool_group", "list_tool_groups", None),
+ ("vector_stores", Api.vector_stores, "register_vector_store", "list_vector_stores", None),
  ]


@@ -199,7 +205,7 @@ async def invoke_with_optional_request(method: Any) -> Any:


  async def register_resources(run_config: StackConfig, impls: dict[Api, Any]):
- for rsrc, api, register_method, list_method in RESOURCES:
+ for rsrc, api, register_method, list_method, request_class in RESOURCES:
  objects = getattr(run_config.registered_resources, rsrc)
  if api not in impls:
  continue
@@ -213,10 +219,17 @@ async def register_resources(run_config: StackConfig, impls: dict[Api, Any]):
  continue
  logger.debug(f"registering {rsrc.capitalize()} {obj} for provider {obj.provider_id}")

- # we want to maintain the type information in arguments to method.
- # instead of method(**obj.model_dump()), which may convert a typed attr to a dict,
- # we use model_dump() to find all the attrs and then getattr to get the still typed value.
- await method(**{k: getattr(obj, k) for k in obj.model_dump().keys()})
+ # TODO: Once all register methods are migrated to accept request objects,
+ # remove this conditional and always use the request_class pattern.
+ if request_class is not None:
+ request = request_class(**obj.model_dump())
+ await method(request)
+ else:
+ # we want to maintain the type information in arguments to method.
+ # instead of method(**obj.model_dump()), which may convert a typed attr to a dict,
+ # we use model_dump() to find all the attrs and then getattr to get the still typed
+ # value.
+ await method(**{k: getattr(obj, k) for k in obj.model_dump().keys()})

  method = getattr(impls[api], list_method)
  response = await invoke_with_optional_request(method)
@@ -608,7 +621,7 @@ class Stack:
  async def shutdown(self):
  for impl in self.impls.values():
  impl_name = impl.__class__.__name__
- logger.info(f"Shutting down {impl_name}")
+ logger.debug(f"Shutting down {impl_name}")
  try:
  if hasattr(impl, "shutdown"):
  await asyncio.wait_for(impl.shutdown(), timeout=5)
@@ -630,6 +643,20 @@
  if REGISTRY_REFRESH_TASK:
  REGISTRY_REFRESH_TASK.cancel()

+ # Shutdown storage backends
+ from llama_stack.core.storage.kvstore.kvstore import shutdown_kvstore_backends
+ from llama_stack.core.storage.sqlstore.sqlstore import shutdown_sqlstore_backends
+
+ try:
+ await shutdown_kvstore_backends()
+ except Exception as e:
+ logger.exception(f"Failed to shutdown KV store backends: {e}")
+
+ try:
+ await shutdown_sqlstore_backends()
+ except Exception as e:
+ logger.exception(f"Failed to shutdown SQL store backends: {e}")
+

  async def refresh_registry_once(impls: dict[Api, Any]):
  logger.debug("refreshing registry")
llama_stack/core/storage/kvstore/kvstore.py CHANGED
@@ -62,6 +62,9 @@ class InmemoryKVStoreImpl(KVStore):
  async def delete(self, key: str) -> None:
  del self._store[key]

+ async def shutdown(self) -> None:
+ self._store.clear()
+

  _KVSTORE_BACKENDS: dict[str, KVStoreConfig] = {}
  _KVSTORE_INSTANCES: dict[tuple[str, str], KVStore] = {}
@@ -126,3 +129,11 @@ async def kvstore_impl(reference: KVStoreReference) -> KVStore:
  await impl.initialize()
  _KVSTORE_INSTANCES[cache_key] = impl
  return impl
+
+
+ async def shutdown_kvstore_backends() -> None:
+ """Shutdown all cached KV store instances."""
+ global _KVSTORE_INSTANCES
+ for instance in _KVSTORE_INSTANCES.values():
+ await instance.shutdown()
+ _KVSTORE_INSTANCES.clear()
llama_stack/core/storage/kvstore/mongodb/mongodb.py CHANGED
@@ -83,3 +83,8 @@ class MongoDBKVStoreImpl(KVStore):
  async for doc in cursor:
  result.append(doc["key"])
  return result
+
+ async def shutdown(self) -> None:
+ if self.conn:
+ await self.conn.close()
+ self.conn = None
llama_stack/core/storage/kvstore/postgres/postgres.py CHANGED
@@ -123,3 +123,11 @@ class PostgresKVStoreImpl(KVStore):
  (start_key, end_key),
  )
  return [row[0] for row in cursor.fetchall()]
+
+ async def shutdown(self) -> None:
+ if self._cursor:
+ self._cursor.close()
+ self._cursor = None
+ if self._conn:
+ self._conn.close()
+ self._conn = None
llama_stack/core/storage/kvstore/redis/redis.py CHANGED
@@ -99,3 +99,8 @@ class RedisKVStoreImpl(KVStore):
  if cursor == 0:
  break
  return result
+
+ async def shutdown(self) -> None:
+ if self._redis:
+ await self._redis.close()
+ self._redis = None
llama_stack/core/storage/sqlstore/sqlalchemy_sqlstore.py CHANGED
@@ -107,6 +107,14 @@ class SqlAlchemySqlStoreImpl(SqlStore):

  return engine

+ async def shutdown(self) -> None:
+ """Dispose the session maker's engine and close all connections."""
+ # The async_session holds a reference to the engine created in __init__
+ if self.async_session:
+ engine = self.async_session.kw.get("bind")
+ if engine:
+ await engine.dispose()
+
  async def create_table(
  self,
  table: str,
llama_stack/core/storage/sqlstore/sqlstore.py CHANGED
@@ -85,3 +85,11 @@ def register_sqlstore_backends(backends: dict[str, StorageBackendConfig]) -> None:
  _SQLSTORE_LOCKS.clear()
  for name, cfg in backends.items():
  _SQLSTORE_BACKENDS[name] = cfg
+
+
+ async def shutdown_sqlstore_backends() -> None:
+ """Shutdown all cached SQL store instances."""
+ global _SQLSTORE_INSTANCES
+ for instance in _SQLSTORE_INSTANCES.values():
+ await instance.shutdown()
+ _SQLSTORE_INSTANCES.clear()
llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py CHANGED
@@ -4,6 +4,7 @@
  # This source code is licensed under the terms described in the LICENSE file in
  # the root directory of this source tree.

+ import asyncio
  import re
  import time
  import uuid
@@ -16,6 +17,7 @@ from llama_stack.providers.utils.responses.responses_store import (
  ResponsesStore,
  _OpenAIResponseObjectWithInputAndMessages,
  )
+ from llama_stack.providers.utils.tools.mcp import MCPSessionManager
  from llama_stack_api import (
  ConversationItem,
  Conversations,
@@ -489,6 +491,19 @@ class OpenAIResponsesImpl:
  response_id = f"resp_{uuid.uuid4()}"
  created_at = int(time.time())

+ # Create a per-request MCP session manager for session reuse (fix for #4452)
+ # This avoids redundant tools/list calls when making multiple MCP tool invocations
+ mcp_session_manager = MCPSessionManager()
+
+ # Create a per-request ToolExecutor with the session manager
+ request_tool_executor = ToolExecutor(
+ tool_groups_api=self.tool_groups_api,
+ tool_runtime_api=self.tool_runtime_api,
+ vector_io_api=self.vector_io_api,
+ vector_stores_config=self.tool_executor.vector_stores_config,
+ mcp_session_manager=mcp_session_manager,
+ )
+
  orchestrator = StreamingResponseOrchestrator(
  inference_api=self.inference_api,
  ctx=ctx,
@@ -498,7 +513,7 @@ class OpenAIResponsesImpl:
  text=text,
  max_infer_iters=max_infer_iters,
  parallel_tool_calls=parallel_tool_calls,
- tool_executor=self.tool_executor,
+ tool_executor=request_tool_executor,
  safety_api=self.safety_api,
  guardrail_ids=guardrail_ids,
  instructions=instructions,
@@ -513,41 +528,52 @@ class OpenAIResponsesImpl:

  # Type as ConversationItem to avoid list invariance issues
  output_items: list[ConversationItem] = []
- async for stream_chunk in orchestrator.create_response():
- match stream_chunk.type:
- case "response.completed" | "response.incomplete":
- final_response = stream_chunk.response
- case "response.failed":
- failed_response = stream_chunk.response
- case "response.output_item.done":
- item = stream_chunk.item
- output_items.append(item)
- case _:
- pass # Other event types
-
- # Store and sync before yielding terminal events
- # This ensures the storage/syncing happens even if the consumer breaks after receiving the event
- if (
- stream_chunk.type in {"response.completed", "response.incomplete"}
- and final_response
- and failed_response is None
- ):
- messages_to_store = list(
- filter(lambda x: not isinstance(x, OpenAISystemMessageParam), orchestrator.final_messages)
- )
- if store:
- # TODO: we really should work off of output_items instead of "final_messages"
- await self._store_response(
- response=final_response,
- input=all_input,
- messages=messages_to_store,
+ try:
+ async for stream_chunk in orchestrator.create_response():
+ match stream_chunk.type:
+ case "response.completed" | "response.incomplete":
+ final_response = stream_chunk.response
+ case "response.failed":
+ failed_response = stream_chunk.response
+ case "response.output_item.done":
+ item = stream_chunk.item
+ output_items.append(item)
+ case _:
+ pass # Other event types
+
+ # Store and sync before yielding terminal events
+ # This ensures the storage/syncing happens even if the consumer breaks after receiving the event
+ if (
+ stream_chunk.type in {"response.completed", "response.incomplete"}
+ and final_response
+ and failed_response is None
+ ):
+ messages_to_store = list(
+ filter(lambda x: not isinstance(x, OpenAISystemMessageParam), orchestrator.final_messages)
  )
+ if store:
+ # TODO: we really should work off of output_items instead of "final_messages"
+ await self._store_response(
+ response=final_response,
+ input=all_input,
+ messages=messages_to_store,
+ )

- if conversation:
- await self._sync_response_to_conversation(conversation, input, output_items)
- await self.responses_store.store_conversation_messages(conversation, messages_to_store)
-
- yield stream_chunk
+ if conversation:
+ await self._sync_response_to_conversation(conversation, input, output_items)
+ await self.responses_store.store_conversation_messages(conversation, messages_to_store)
+
+ yield stream_chunk
+ finally:
+ # Clean up MCP sessions at the end of the request (fix for #4452)
+ # Use shield() to prevent cancellation from interrupting cleanup and leaking resources
+ # Wrap in try/except as cleanup errors should not mask the original response
+ try:
+ await asyncio.shield(mcp_session_manager.close_all())
+ except BaseException as e:
+ # Debug level - cleanup errors are expected in streaming scenarios where
+ # anyio cancel scopes may be in a different task context
+ logger.debug(f"Error during MCP session cleanup: {e}")

  async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
  return await self.responses_store.delete_response_object(response_id)
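
Note on the session lifecycle: the pattern introduced here is one MCPSessionManager per response, threaded through a per-request ToolExecutor, with cleanup always running in a shielded finally block. A stripped-down sketch of that lifecycle (MCPSessionManager, close_all(), and the mcp_session_manager keyword come from this diff; the surrounding helper names are illustrative, not part of the package):

    import asyncio

    from llama_stack.providers.utils.tools.mcp import MCPSessionManager

    async def run_with_mcp_sessions(make_tool_executor, run_tools) -> None:
        # One session manager per request so repeated MCP calls reuse the same
        # session instead of re-issuing tools/list on every invocation.
        mcp_session_manager = MCPSessionManager()
        tool_executor = make_tool_executor(mcp_session_manager=mcp_session_manager)
        try:
            await run_tools(tool_executor)
        finally:
            # shield() keeps cleanup running even if the surrounding task is
            # cancelled mid-stream; cleanup errors are swallowed, not re-raised.
            try:
                await asyncio.shield(mcp_session_manager.close_all())
            except BaseException:
                pass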
llama_stack/providers/inline/agents/meta_reference/responses/streaming.py CHANGED
@@ -1200,6 +1200,9 @@ class StreamingResponseOrchestrator:
  "mcp_list_tools_id": list_id,
  }

+ # Get session manager from tool_executor if available (fix for #4452)
+ session_manager = getattr(self.tool_executor, "mcp_session_manager", None)
+
  # TODO: follow semantic conventions for Open Telemetry tool spans
  # https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span
  with tracer.start_as_current_span("list_mcp_tools", attributes=attributes):
@@ -1207,6 +1210,7 @@
  endpoint=mcp_tool.server_url,
  headers=mcp_tool.headers,
  authorization=mcp_tool.authorization,
+ session_manager=session_manager,
  )

  # Create the MCP list tools message
llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py CHANGED
@@ -54,11 +54,14 @@ class ToolExecutor:
  tool_runtime_api: ToolRuntime,
  vector_io_api: VectorIO,
  vector_stores_config=None,
+ mcp_session_manager=None,
  ):
  self.tool_groups_api = tool_groups_api
  self.tool_runtime_api = tool_runtime_api
  self.vector_io_api = vector_io_api
  self.vector_stores_config = vector_stores_config
+ # Optional MCPSessionManager for session reuse within a request (fix for #4452)
+ self.mcp_session_manager = mcp_session_manager

  async def execute_tool_call(
  self,
@@ -233,6 +236,7 @@ class ToolExecutor:
  "document_ids": [r.file_id for r in search_results],
  "chunks": [r.content[0].text if r.content else "" for r in search_results],
  "scores": [r.score for r in search_results],
+ "attributes": [r.attributes or {} for r in search_results],
  "citation_files": citation_files,
  },
  )
@@ -327,12 +331,14 @@ class ToolExecutor:
  # TODO: follow semantic conventions for Open Telemetry tool spans
  # https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span
  with tracer.start_as_current_span("invoke_mcp_tool", attributes=attributes):
+ # Pass session_manager for session reuse within request (fix for #4452)
  result = await invoke_mcp_tool(
  endpoint=mcp_tool.server_url,
  tool_name=function_name,
  kwargs=tool_kwargs,
  headers=mcp_tool.headers,
  authorization=mcp_tool.authorization,
+ session_manager=self.mcp_session_manager,
  )
  elif function_name == "knowledge_search":
  response_file_search_tool = (
@@ -464,16 +470,18 @@ class ToolExecutor:
  )
  if result and (metadata := getattr(result, "metadata", None)) and "document_ids" in metadata:
  message.results = []
+ attributes_list = metadata.get("attributes", [])
  for i, doc_id in enumerate(metadata["document_ids"]):
  text = metadata["chunks"][i] if "chunks" in metadata else None
  score = metadata["scores"][i] if "scores" in metadata else None
+ attrs = attributes_list[i] if i < len(attributes_list) else {}
  message.results.append(
  OpenAIResponseOutputMessageFileSearchToolCallResults(
  file_id=doc_id,
  filename=doc_id,
  text=text if text is not None else "",
  score=score if score is not None else 0.0,
- attributes={},
+ attributes=attrs,
  )
  )
  if has_error:
llama_stack/providers/inline/tool_runtime/rag/memory.py CHANGED
@@ -50,8 +50,11 @@ log = get_logger(name=__name__, category="tool_runtime")
  async def raw_data_from_doc(doc: RAGDocument) -> tuple[bytes, str]:
  """Get raw binary data and mime type from a RAGDocument for file upload."""
  if isinstance(doc.content, URL):
- if doc.content.uri.startswith("data:"):
- parts = parse_data_url(doc.content.uri)
+ uri = doc.content.uri
+ if uri.startswith("file://"):
+ raise ValueError("file:// URIs are not supported. Please use the Files API (/v1/files) to upload files.")
+ if uri.startswith("data:"):
+ parts = parse_data_url(uri)
  mime_type = parts["mimetype"]
  data = parts["data"]

@@ -63,7 +66,7 @@ async def raw_data_from_doc(doc: RAGDocument) -> tuple[bytes, str]:
  return file_data, mime_type
  else:
  async with httpx.AsyncClient() as client:
- r = await client.get(doc.content.uri)
+ r = await client.get(uri)
  r.raise_for_status()
  mime_type = r.headers.get("content-type", "application/octet-stream")
  return r.content, mime_type
@@ -73,6 +76,8 @@ async def raw_data_from_doc(doc: RAGDocument) -> tuple[bytes, str]:
  else:
  content_str = interleaved_content_as_str(doc.content)

+ if content_str.startswith("file://"):
+ raise ValueError("file:// URIs are not supported. Please use the Files API (/v1/files) to upload files.")
  if content_str.startswith("data:"):
  parts = parse_data_url(content_str)
  mime_type = parts["mimetype"]
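
Note for callers: file:// URIs are now rejected, so local content should either go through the Files API (/v1/files) or be inlined as a data: URL, which raw_data_from_doc still accepts. A small helper sketch for the latter (standard RFC 2397 data URLs are assumed; this helper is illustrative and not part of the package):

    import base64
    from pathlib import Path

    def local_file_as_data_url(path: str, mime_type: str = "application/pdf") -> str:
        # Inline the file bytes as a base64 data: URL instead of a file:// URI,
        # which now raises ValueError in raw_data_from_doc.
        encoded = base64.b64encode(Path(path).read_bytes()).decode("ascii")
        return f"data:{mime_type};base64,{encoded}"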
llama_stack/providers/remote/vector_io/pgvector/pgvector.py CHANGED
@@ -10,6 +10,7 @@ from typing import Any
  import psycopg2
  from numpy.typing import NDArray
  from psycopg2 import sql
+ from psycopg2.extensions import cursor
  from psycopg2.extras import Json, execute_values
  from pydantic import BaseModel, TypeAdapter

@@ -54,6 +55,17 @@ def check_extension_version(cur):
  return result[0] if result else None


+ def create_vector_extension(cur: cursor) -> None:
+ try:
+ log.info("Vector extension not found, creating...")
+ cur.execute("CREATE EXTENSION vector;")
+ log.info("Vector extension created successfully")
+ log.info(f"Vector extension version: {check_extension_version(cur)}")
+
+ except psycopg2.Error as e:
+ raise RuntimeError(f"Failed to create vector extension for PGVector: {e}") from e
+
+
  def upsert_models(conn, keys_models: list[tuple[str, BaseModel]]):
  with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
  query = sql.SQL(
@@ -364,7 +376,7 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProt
  if version:
  log.info(f"Vector extension version: {version}")
  else:
- raise RuntimeError("Vector extension is not installed.")
+ create_vector_extension(cur)

  cur.execute(
  """