llama-stack 0.4.3__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (307)
  1. llama_stack/cli/stack/_list_deps.py +11 -7
  2. llama_stack/cli/stack/run.py +3 -25
  3. llama_stack/core/access_control/datatypes.py +78 -0
  4. llama_stack/core/configure.py +2 -2
  5. {llama_stack_api/internal → llama_stack/core/connectors}/__init__.py +2 -2
  6. llama_stack/core/connectors/connectors.py +162 -0
  7. llama_stack/core/conversations/conversations.py +61 -58
  8. llama_stack/core/datatypes.py +54 -8
  9. llama_stack/core/library_client.py +60 -13
  10. llama_stack/core/prompts/prompts.py +43 -42
  11. llama_stack/core/routers/datasets.py +20 -17
  12. llama_stack/core/routers/eval_scoring.py +143 -53
  13. llama_stack/core/routers/inference.py +20 -9
  14. llama_stack/core/routers/safety.py +30 -42
  15. llama_stack/core/routers/vector_io.py +15 -7
  16. llama_stack/core/routing_tables/models.py +42 -3
  17. llama_stack/core/routing_tables/scoring_functions.py +19 -19
  18. llama_stack/core/routing_tables/shields.py +20 -17
  19. llama_stack/core/routing_tables/vector_stores.py +8 -5
  20. llama_stack/core/server/auth.py +192 -17
  21. llama_stack/core/server/fastapi_router_registry.py +40 -5
  22. llama_stack/core/server/server.py +24 -5
  23. llama_stack/core/stack.py +54 -10
  24. llama_stack/core/storage/datatypes.py +9 -0
  25. llama_stack/core/store/registry.py +1 -1
  26. llama_stack/core/utils/exec.py +2 -2
  27. llama_stack/core/utils/type_inspection.py +16 -2
  28. llama_stack/distributions/dell/config.yaml +4 -1
  29. llama_stack/distributions/dell/doc_template.md +209 -0
  30. llama_stack/distributions/dell/run-with-safety.yaml +4 -1
  31. llama_stack/distributions/nvidia/config.yaml +4 -1
  32. llama_stack/distributions/nvidia/doc_template.md +170 -0
  33. llama_stack/distributions/nvidia/run-with-safety.yaml +4 -1
  34. llama_stack/distributions/oci/config.yaml +4 -1
  35. llama_stack/distributions/oci/doc_template.md +140 -0
  36. llama_stack/distributions/open-benchmark/config.yaml +9 -1
  37. llama_stack/distributions/postgres-demo/config.yaml +1 -1
  38. llama_stack/distributions/starter/build.yaml +62 -0
  39. llama_stack/distributions/starter/config.yaml +22 -3
  40. llama_stack/distributions/starter/run-with-postgres-store.yaml +22 -3
  41. llama_stack/distributions/starter/starter.py +13 -1
  42. llama_stack/distributions/starter-gpu/build.yaml +62 -0
  43. llama_stack/distributions/starter-gpu/config.yaml +22 -3
  44. llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +22 -3
  45. llama_stack/distributions/template.py +10 -2
  46. llama_stack/distributions/watsonx/config.yaml +4 -1
  47. llama_stack/log.py +1 -0
  48. llama_stack/models/llama/resources/dog.jpg +0 -0
  49. llama_stack/models/llama/resources/pasta.jpeg +0 -0
  50. llama_stack/models/llama/resources/small_dog.jpg +0 -0
  51. llama_stack/providers/inline/agents/meta_reference/__init__.py +1 -0
  52. llama_stack/providers/inline/agents/meta_reference/agents.py +57 -61
  53. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +183 -60
  54. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +94 -22
  55. llama_stack/providers/inline/agents/meta_reference/responses/types.py +2 -1
  56. llama_stack/providers/inline/agents/meta_reference/responses/utils.py +4 -1
  57. llama_stack/providers/inline/agents/meta_reference/safety.py +2 -2
  58. llama_stack/providers/inline/batches/reference/batches.py +2 -1
  59. llama_stack/providers/inline/eval/meta_reference/eval.py +40 -32
  60. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.h +9 -0
  61. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.swift +189 -0
  62. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/Parsing.swift +238 -0
  63. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/PromptTemplate.swift +12 -0
  64. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/SystemPrompts.swift +89 -0
  65. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.pbxproj +550 -0
  66. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
  67. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
  68. llama_stack/providers/inline/post_training/huggingface/post_training.py +33 -38
  69. llama_stack/providers/inline/post_training/huggingface/utils.py +2 -5
  70. llama_stack/providers/inline/post_training/torchtune/post_training.py +28 -33
  71. llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +2 -4
  72. llama_stack/providers/inline/safety/code_scanner/code_scanner.py +12 -15
  73. llama_stack/providers/inline/safety/llama_guard/llama_guard.py +15 -18
  74. llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +11 -17
  75. llama_stack/providers/inline/scoring/basic/scoring.py +13 -17
  76. llama_stack/providers/inline/scoring/braintrust/braintrust.py +15 -15
  77. llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +13 -17
  78. llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +1 -1
  79. llama_stack/providers/registry/agents.py +1 -0
  80. llama_stack/providers/registry/inference.py +1 -9
  81. llama_stack/providers/registry/vector_io.py +136 -16
  82. llama_stack/providers/remote/datasetio/nvidia/README.md +74 -0
  83. llama_stack/providers/remote/eval/nvidia/README.md +134 -0
  84. llama_stack/providers/remote/eval/nvidia/eval.py +22 -21
  85. llama_stack/providers/remote/files/s3/README.md +266 -0
  86. llama_stack/providers/remote/files/s3/config.py +5 -3
  87. llama_stack/providers/remote/files/s3/files.py +2 -2
  88. llama_stack/providers/remote/inference/gemini/gemini.py +4 -0
  89. llama_stack/providers/remote/inference/nvidia/NVIDIA.md +203 -0
  90. llama_stack/providers/remote/inference/openai/openai.py +2 -0
  91. llama_stack/providers/remote/inference/together/together.py +4 -0
  92. llama_stack/providers/remote/inference/vertexai/config.py +3 -3
  93. llama_stack/providers/remote/inference/vertexai/vertexai.py +5 -2
  94. llama_stack/providers/remote/inference/vllm/config.py +37 -18
  95. llama_stack/providers/remote/inference/vllm/vllm.py +0 -3
  96. llama_stack/providers/remote/inference/watsonx/watsonx.py +4 -0
  97. llama_stack/providers/remote/post_training/nvidia/README.md +151 -0
  98. llama_stack/providers/remote/post_training/nvidia/post_training.py +31 -33
  99. llama_stack/providers/remote/safety/bedrock/bedrock.py +10 -27
  100. llama_stack/providers/remote/safety/nvidia/README.md +78 -0
  101. llama_stack/providers/remote/safety/nvidia/nvidia.py +9 -25
  102. llama_stack/providers/remote/safety/sambanova/sambanova.py +13 -11
  103. llama_stack/providers/remote/vector_io/elasticsearch/__init__.py +17 -0
  104. llama_stack/providers/remote/vector_io/elasticsearch/config.py +32 -0
  105. llama_stack/providers/remote/vector_io/elasticsearch/elasticsearch.py +463 -0
  106. llama_stack/providers/remote/vector_io/oci/__init__.py +22 -0
  107. llama_stack/providers/remote/vector_io/oci/config.py +41 -0
  108. llama_stack/providers/remote/vector_io/oci/oci26ai.py +595 -0
  109. llama_stack/providers/remote/vector_io/pgvector/config.py +69 -2
  110. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +255 -6
  111. llama_stack/providers/remote/vector_io/qdrant/qdrant.py +62 -38
  112. llama_stack/providers/utils/bedrock/client.py +3 -3
  113. llama_stack/providers/utils/bedrock/config.py +7 -7
  114. llama_stack/providers/utils/inference/embedding_mixin.py +4 -0
  115. llama_stack/providers/utils/inference/http_client.py +239 -0
  116. llama_stack/providers/utils/inference/litellm_openai_mixin.py +5 -0
  117. llama_stack/providers/utils/inference/model_registry.py +148 -2
  118. llama_stack/providers/utils/inference/openai_compat.py +2 -1
  119. llama_stack/providers/utils/inference/openai_mixin.py +41 -2
  120. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +92 -5
  121. llama_stack/providers/utils/memory/vector_store.py +46 -19
  122. llama_stack/providers/utils/responses/responses_store.py +40 -6
  123. llama_stack/providers/utils/safety.py +114 -0
  124. llama_stack/providers/utils/tools/mcp.py +44 -3
  125. llama_stack/testing/api_recorder.py +9 -3
  126. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/METADATA +14 -2
  127. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/RECORD +131 -275
  128. llama_stack-0.5.0rc1.dist-info/top_level.txt +1 -0
  129. llama_stack/distributions/meta-reference-gpu/__init__.py +0 -7
  130. llama_stack/distributions/meta-reference-gpu/config.yaml +0 -140
  131. llama_stack/distributions/meta-reference-gpu/meta_reference.py +0 -163
  132. llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +0 -155
  133. llama_stack/models/llama/hadamard_utils.py +0 -88
  134. llama_stack/models/llama/llama3/args.py +0 -74
  135. llama_stack/models/llama/llama3/generation.py +0 -378
  136. llama_stack/models/llama/llama3/model.py +0 -304
  137. llama_stack/models/llama/llama3/multimodal/__init__.py +0 -12
  138. llama_stack/models/llama/llama3/multimodal/encoder_utils.py +0 -180
  139. llama_stack/models/llama/llama3/multimodal/image_transform.py +0 -409
  140. llama_stack/models/llama/llama3/multimodal/model.py +0 -1430
  141. llama_stack/models/llama/llama3/multimodal/utils.py +0 -26
  142. llama_stack/models/llama/llama3/quantization/__init__.py +0 -5
  143. llama_stack/models/llama/llama3/quantization/loader.py +0 -316
  144. llama_stack/models/llama/llama3_1/__init__.py +0 -12
  145. llama_stack/models/llama/llama3_1/prompt_format.md +0 -358
  146. llama_stack/models/llama/llama3_1/prompts.py +0 -258
  147. llama_stack/models/llama/llama3_2/__init__.py +0 -5
  148. llama_stack/models/llama/llama3_2/prompts_text.py +0 -229
  149. llama_stack/models/llama/llama3_2/prompts_vision.py +0 -126
  150. llama_stack/models/llama/llama3_2/text_prompt_format.md +0 -286
  151. llama_stack/models/llama/llama3_2/vision_prompt_format.md +0 -141
  152. llama_stack/models/llama/llama3_3/__init__.py +0 -5
  153. llama_stack/models/llama/llama3_3/prompts.py +0 -259
  154. llama_stack/models/llama/llama4/args.py +0 -107
  155. llama_stack/models/llama/llama4/ffn.py +0 -58
  156. llama_stack/models/llama/llama4/moe.py +0 -214
  157. llama_stack/models/llama/llama4/preprocess.py +0 -435
  158. llama_stack/models/llama/llama4/quantization/__init__.py +0 -5
  159. llama_stack/models/llama/llama4/quantization/loader.py +0 -226
  160. llama_stack/models/llama/llama4/vision/__init__.py +0 -5
  161. llama_stack/models/llama/llama4/vision/embedding.py +0 -210
  162. llama_stack/models/llama/llama4/vision/encoder.py +0 -412
  163. llama_stack/models/llama/quantize_impls.py +0 -316
  164. llama_stack/providers/inline/inference/meta_reference/__init__.py +0 -20
  165. llama_stack/providers/inline/inference/meta_reference/common.py +0 -24
  166. llama_stack/providers/inline/inference/meta_reference/config.py +0 -68
  167. llama_stack/providers/inline/inference/meta_reference/generators.py +0 -201
  168. llama_stack/providers/inline/inference/meta_reference/inference.py +0 -542
  169. llama_stack/providers/inline/inference/meta_reference/model_parallel.py +0 -77
  170. llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +0 -353
  171. llama_stack-0.4.3.dist-info/top_level.txt +0 -2
  172. llama_stack_api/__init__.py +0 -945
  173. llama_stack_api/admin/__init__.py +0 -45
  174. llama_stack_api/admin/api.py +0 -72
  175. llama_stack_api/admin/fastapi_routes.py +0 -117
  176. llama_stack_api/admin/models.py +0 -113
  177. llama_stack_api/agents.py +0 -173
  178. llama_stack_api/batches/__init__.py +0 -40
  179. llama_stack_api/batches/api.py +0 -53
  180. llama_stack_api/batches/fastapi_routes.py +0 -113
  181. llama_stack_api/batches/models.py +0 -78
  182. llama_stack_api/benchmarks/__init__.py +0 -43
  183. llama_stack_api/benchmarks/api.py +0 -39
  184. llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  185. llama_stack_api/benchmarks/models.py +0 -109
  186. llama_stack_api/common/__init__.py +0 -5
  187. llama_stack_api/common/content_types.py +0 -101
  188. llama_stack_api/common/errors.py +0 -95
  189. llama_stack_api/common/job_types.py +0 -38
  190. llama_stack_api/common/responses.py +0 -77
  191. llama_stack_api/common/training_types.py +0 -47
  192. llama_stack_api/common/type_system.py +0 -146
  193. llama_stack_api/connectors.py +0 -146
  194. llama_stack_api/conversations.py +0 -270
  195. llama_stack_api/datasetio.py +0 -55
  196. llama_stack_api/datasets/__init__.py +0 -61
  197. llama_stack_api/datasets/api.py +0 -35
  198. llama_stack_api/datasets/fastapi_routes.py +0 -104
  199. llama_stack_api/datasets/models.py +0 -152
  200. llama_stack_api/datatypes.py +0 -373
  201. llama_stack_api/eval.py +0 -137
  202. llama_stack_api/file_processors/__init__.py +0 -27
  203. llama_stack_api/file_processors/api.py +0 -64
  204. llama_stack_api/file_processors/fastapi_routes.py +0 -78
  205. llama_stack_api/file_processors/models.py +0 -42
  206. llama_stack_api/files/__init__.py +0 -35
  207. llama_stack_api/files/api.py +0 -51
  208. llama_stack_api/files/fastapi_routes.py +0 -124
  209. llama_stack_api/files/models.py +0 -107
  210. llama_stack_api/inference.py +0 -1169
  211. llama_stack_api/inspect_api/__init__.py +0 -37
  212. llama_stack_api/inspect_api/api.py +0 -25
  213. llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  214. llama_stack_api/inspect_api/models.py +0 -28
  215. llama_stack_api/internal/kvstore.py +0 -28
  216. llama_stack_api/internal/sqlstore.py +0 -81
  217. llama_stack_api/llama_stack_api/__init__.py +0 -945
  218. llama_stack_api/llama_stack_api/admin/__init__.py +0 -45
  219. llama_stack_api/llama_stack_api/admin/api.py +0 -72
  220. llama_stack_api/llama_stack_api/admin/fastapi_routes.py +0 -117
  221. llama_stack_api/llama_stack_api/admin/models.py +0 -113
  222. llama_stack_api/llama_stack_api/agents.py +0 -173
  223. llama_stack_api/llama_stack_api/batches/__init__.py +0 -40
  224. llama_stack_api/llama_stack_api/batches/api.py +0 -53
  225. llama_stack_api/llama_stack_api/batches/fastapi_routes.py +0 -113
  226. llama_stack_api/llama_stack_api/batches/models.py +0 -78
  227. llama_stack_api/llama_stack_api/benchmarks/__init__.py +0 -43
  228. llama_stack_api/llama_stack_api/benchmarks/api.py +0 -39
  229. llama_stack_api/llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  230. llama_stack_api/llama_stack_api/benchmarks/models.py +0 -109
  231. llama_stack_api/llama_stack_api/common/__init__.py +0 -5
  232. llama_stack_api/llama_stack_api/common/content_types.py +0 -101
  233. llama_stack_api/llama_stack_api/common/errors.py +0 -95
  234. llama_stack_api/llama_stack_api/common/job_types.py +0 -38
  235. llama_stack_api/llama_stack_api/common/responses.py +0 -77
  236. llama_stack_api/llama_stack_api/common/training_types.py +0 -47
  237. llama_stack_api/llama_stack_api/common/type_system.py +0 -146
  238. llama_stack_api/llama_stack_api/connectors.py +0 -146
  239. llama_stack_api/llama_stack_api/conversations.py +0 -270
  240. llama_stack_api/llama_stack_api/datasetio.py +0 -55
  241. llama_stack_api/llama_stack_api/datasets/__init__.py +0 -61
  242. llama_stack_api/llama_stack_api/datasets/api.py +0 -35
  243. llama_stack_api/llama_stack_api/datasets/fastapi_routes.py +0 -104
  244. llama_stack_api/llama_stack_api/datasets/models.py +0 -152
  245. llama_stack_api/llama_stack_api/datatypes.py +0 -373
  246. llama_stack_api/llama_stack_api/eval.py +0 -137
  247. llama_stack_api/llama_stack_api/file_processors/__init__.py +0 -27
  248. llama_stack_api/llama_stack_api/file_processors/api.py +0 -64
  249. llama_stack_api/llama_stack_api/file_processors/fastapi_routes.py +0 -78
  250. llama_stack_api/llama_stack_api/file_processors/models.py +0 -42
  251. llama_stack_api/llama_stack_api/files/__init__.py +0 -35
  252. llama_stack_api/llama_stack_api/files/api.py +0 -51
  253. llama_stack_api/llama_stack_api/files/fastapi_routes.py +0 -124
  254. llama_stack_api/llama_stack_api/files/models.py +0 -107
  255. llama_stack_api/llama_stack_api/inference.py +0 -1169
  256. llama_stack_api/llama_stack_api/inspect_api/__init__.py +0 -37
  257. llama_stack_api/llama_stack_api/inspect_api/api.py +0 -25
  258. llama_stack_api/llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  259. llama_stack_api/llama_stack_api/inspect_api/models.py +0 -28
  260. llama_stack_api/llama_stack_api/internal/__init__.py +0 -9
  261. llama_stack_api/llama_stack_api/internal/kvstore.py +0 -28
  262. llama_stack_api/llama_stack_api/internal/sqlstore.py +0 -81
  263. llama_stack_api/llama_stack_api/models.py +0 -171
  264. llama_stack_api/llama_stack_api/openai_responses.py +0 -1468
  265. llama_stack_api/llama_stack_api/post_training.py +0 -370
  266. llama_stack_api/llama_stack_api/prompts.py +0 -203
  267. llama_stack_api/llama_stack_api/providers/__init__.py +0 -33
  268. llama_stack_api/llama_stack_api/providers/api.py +0 -16
  269. llama_stack_api/llama_stack_api/providers/fastapi_routes.py +0 -57
  270. llama_stack_api/llama_stack_api/providers/models.py +0 -24
  271. llama_stack_api/llama_stack_api/py.typed +0 -0
  272. llama_stack_api/llama_stack_api/rag_tool.py +0 -168
  273. llama_stack_api/llama_stack_api/resource.py +0 -37
  274. llama_stack_api/llama_stack_api/router_utils.py +0 -160
  275. llama_stack_api/llama_stack_api/safety.py +0 -132
  276. llama_stack_api/llama_stack_api/schema_utils.py +0 -208
  277. llama_stack_api/llama_stack_api/scoring.py +0 -93
  278. llama_stack_api/llama_stack_api/scoring_functions.py +0 -211
  279. llama_stack_api/llama_stack_api/shields.py +0 -93
  280. llama_stack_api/llama_stack_api/tools.py +0 -226
  281. llama_stack_api/llama_stack_api/vector_io.py +0 -941
  282. llama_stack_api/llama_stack_api/vector_stores.py +0 -53
  283. llama_stack_api/llama_stack_api/version.py +0 -9
  284. llama_stack_api/models.py +0 -171
  285. llama_stack_api/openai_responses.py +0 -1468
  286. llama_stack_api/post_training.py +0 -370
  287. llama_stack_api/prompts.py +0 -203
  288. llama_stack_api/providers/__init__.py +0 -33
  289. llama_stack_api/providers/api.py +0 -16
  290. llama_stack_api/providers/fastapi_routes.py +0 -57
  291. llama_stack_api/providers/models.py +0 -24
  292. llama_stack_api/py.typed +0 -0
  293. llama_stack_api/rag_tool.py +0 -168
  294. llama_stack_api/resource.py +0 -37
  295. llama_stack_api/router_utils.py +0 -160
  296. llama_stack_api/safety.py +0 -132
  297. llama_stack_api/schema_utils.py +0 -208
  298. llama_stack_api/scoring.py +0 -93
  299. llama_stack_api/scoring_functions.py +0 -211
  300. llama_stack_api/shields.py +0 -93
  301. llama_stack_api/tools.py +0 -226
  302. llama_stack_api/vector_io.py +0 -941
  303. llama_stack_api/vector_stores.py +0 -53
  304. llama_stack_api/version.py +0 -9
  305. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/WHEEL +0 -0
  306. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/entry_points.txt +0 -0
  307. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
llama_stack/providers/inline/agents/meta_reference/agents.py

@@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+from collections.abc import AsyncIterator
 
 from llama_stack.core.datatypes import AccessRule
 from llama_stack.core.storage.kvstore import InmemoryKVStoreImpl, kvstore_impl
@@ -11,21 +12,21 @@ from llama_stack.log import get_logger
 from llama_stack.providers.utils.responses.responses_store import ResponsesStore
 from llama_stack_api import (
     Agents,
+    Connectors,
     Conversations,
+    CreateResponseRequest,
+    DeleteResponseRequest,
     Files,
     Inference,
     ListOpenAIResponseInputItem,
     ListOpenAIResponseObject,
+    ListResponseInputItemsRequest,
+    ListResponsesRequest,
     OpenAIDeleteResponseObject,
-    OpenAIResponseInput,
-    OpenAIResponseInputTool,
-    OpenAIResponseInputToolChoice,
     OpenAIResponseObject,
-    OpenAIResponsePrompt,
-    OpenAIResponseText,
-    Order,
+    OpenAIResponseObjectStream,
     Prompts,
-    ResponseGuardrail,
+    RetrieveResponseRequest,
     Safety,
     ToolGroups,
     ToolRuntime,
@@ -50,6 +51,7 @@ class MetaReferenceAgentsImpl(Agents):
         conversations_api: Conversations,
         prompts_api: Prompts,
         files_api: Files,
+        connectors_api: Connectors,
         policy: list[AccessRule],
     ):
         self.config = config
@@ -64,6 +66,7 @@ class MetaReferenceAgentsImpl(Agents):
         self.in_memory_store = InmemoryKVStoreImpl()
         self.openai_responses_impl: OpenAIResponsesImpl | None = None
         self.policy = policy
+        self.connectors_api = connectors_api
 
     async def initialize(self) -> None:
         self.persistence_store = await kvstore_impl(self.config.persistence.agent_state)
@@ -80,6 +83,7 @@ class MetaReferenceAgentsImpl(Agents):
             prompts_api=self.prompts_api,
             files_api=self.files_api,
             vector_stores_config=self.config.vector_stores_config,
+            connectors_api=self.connectors_api,
         )
 
     async def shutdown(self) -> None:
@@ -88,79 +92,71 @@ class MetaReferenceAgentsImpl(Agents):
     # OpenAI responses
     async def get_openai_response(
         self,
-        response_id: str,
+        request: RetrieveResponseRequest,
     ) -> OpenAIResponseObject:
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
-        return await self.openai_responses_impl.get_openai_response(response_id)
+        return await self.openai_responses_impl.get_openai_response(request.response_id)
 
     async def create_openai_response(
         self,
-        input: str | list[OpenAIResponseInput],
-        model: str,
-        prompt: OpenAIResponsePrompt | None = None,
-        instructions: str | None = None,
-        parallel_tool_calls: bool | None = True,
-        previous_response_id: str | None = None,
-        conversation: str | None = None,
-        store: bool | None = True,
-        stream: bool | None = False,
-        temperature: float | None = None,
-        text: OpenAIResponseText | None = None,
-        tool_choice: OpenAIResponseInputToolChoice | None = None,
-        tools: list[OpenAIResponseInputTool] | None = None,
-        include: list[str] | None = None,
-        max_infer_iters: int | None = 10,
-        guardrails: list[ResponseGuardrail] | None = None,
-        max_tool_calls: int | None = None,
-        metadata: dict[str, str] | None = None,
-    ) -> OpenAIResponseObject:
+        request: CreateResponseRequest,
+    ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
+        """Create an OpenAI response.
+
+        Returns either a single response object (non-streaming) or an async iterator
+        yielding response stream events (streaming).
+        """
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
         result = await self.openai_responses_impl.create_openai_response(
-            input,
-            model,
-            prompt,
-            instructions,
-            previous_response_id,
-            conversation,
-            store,
-            stream,
-            temperature,
-            text,
-            tool_choice,
-            tools,
-            include,
-            max_infer_iters,
-            guardrails,
-            parallel_tool_calls,
-            max_tool_calls,
-            metadata,
+            request.input,
+            request.model,
+            request.prompt,
+            request.instructions,
+            request.previous_response_id,
+            request.conversation,
+            request.store,
+            request.stream,
+            request.temperature,
+            request.text,
+            request.tool_choice,
+            request.tools,
+            request.include,
+            request.max_infer_iters,
+            request.guardrails,
+            request.parallel_tool_calls,
+            request.max_tool_calls,
+            request.max_output_tokens,
+            request.reasoning,
+            request.metadata,
         )
-        return result  # type: ignore[no-any-return]
+        return result
 
     async def list_openai_responses(
         self,
-        after: str | None = None,
-        limit: int | None = 50,
-        model: str | None = None,
-        order: Order | None = Order.desc,
+        request: ListResponsesRequest,
     ) -> ListOpenAIResponseObject:
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
-        return await self.openai_responses_impl.list_openai_responses(after, limit, model, order)
+        return await self.openai_responses_impl.list_openai_responses(
+            request.after, request.limit, request.model, request.order
+        )
 
     async def list_openai_response_input_items(
         self,
-        response_id: str,
-        after: str | None = None,
-        before: str | None = None,
-        include: list[str] | None = None,
-        limit: int | None = 20,
-        order: Order | None = Order.desc,
+        request: ListResponseInputItemsRequest,
     ) -> ListOpenAIResponseInputItem:
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
         return await self.openai_responses_impl.list_openai_response_input_items(
-            response_id, after, before, include, limit, order
+            request.response_id,
+            request.after,
+            request.before,
+            request.include,
+            request.limit,
+            request.order,
        )
 
-    async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
+    async def delete_openai_response(
+        self,
+        request: DeleteResponseRequest,
+    ) -> OpenAIDeleteResponseObject:
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
-        return await self.openai_responses_impl.delete_openai_response(response_id)
+        return await self.openai_responses_impl.delete_openai_response(request.response_id)
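Note: every Agents entry point above now takes a single request model in place of a flat parameter list. Below is a minimal usage sketch of the new convention, assuming the request classes exported by llama_stack_api are keyword-constructible (Pydantic-style) models with the field names visible in this diff; the model name is a placeholder.

from llama_stack_api import CreateResponseRequest, RetrieveResponseRequest

async def demo(agents) -> None:
    # Non-streaming call: one CreateResponseRequest replaces the old ~18 parameters.
    response = await agents.create_openai_response(
        CreateResponseRequest(model="example-model", input="Say hello", stream=False)
    )
    # Retrieval likewise wraps the bare response_id in a request object.
    fetched = await agents.get_openai_response(
        RetrieveResponseRequest(response_id=response.id)
    )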
llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py

@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-import asyncio
 import re
 import time
 import uuid
@@ -19,11 +18,14 @@ from llama_stack.providers.utils.responses.responses_store import (
 )
 from llama_stack.providers.utils.tools.mcp import MCPSessionManager
 from llama_stack_api import (
+    AddItemsRequest,
+    Connectors,
     ConversationItem,
     Conversations,
     Files,
     Inference,
     InvalidConversationIdError,
+    ListItemsRequest,
     ListOpenAIResponseInputItem,
     ListOpenAIResponseObject,
     OpenAIChatCompletionContentPartParam,
@@ -39,6 +41,7 @@ from llama_stack_api import (
     OpenAIResponseObject,
     OpenAIResponseObjectStream,
     OpenAIResponsePrompt,
+    OpenAIResponseReasoning,
     OpenAIResponseText,
     OpenAIResponseTextFormat,
     OpenAISystemMessageParam,
@@ -83,6 +86,7 @@ class OpenAIResponsesImpl:
         conversations_api: Conversations,
         prompts_api: Prompts,
         files_api: Files,
+        connectors_api: Connectors,
         vector_stores_config=None,
     ):
         self.inference_api = inference_api
@@ -100,6 +104,7 @@ class OpenAIResponsesImpl:
         )
         self.prompts_api = prompts_api
         self.files_api = files_api
+        self.connectors_api = connectors_api
 
     async def _prepend_previous_response(
         self,
@@ -150,7 +155,9 @@ class OpenAIResponsesImpl:
 
             tool_context.recover_tools_from_previous_response(previous_response)
         elif conversation is not None:
-            conversation_items = await self.conversations_api.list_items(conversation, order="asc")
+            conversation_items = await self.conversations_api.list_items(
+                ListItemsRequest(conversation_id=conversation, order="asc")
+            )
 
             # Use stored messages as source of truth (like previous_response.messages)
             stored_messages = await self.responses_store.get_conversation_messages(conversation)
@@ -324,6 +331,125 @@ class OpenAIResponsesImpl:
             messages=messages,
         )
 
+    def _prepare_input_items_for_storage(
+        self,
+        input: str | list[OpenAIResponseInput],
+    ) -> list[OpenAIResponseInput]:
+        """Prepare input items for storage, adding IDs where needed.
+
+        This method is called once at the start of streaming to prepare input items
+        that will be reused across multiple persistence calls during streaming.
+        """
+        new_input_id = f"msg_{uuid.uuid4()}"
+        input_items_data: list[OpenAIResponseInput] = []
+
+        if isinstance(input, str):
+            input_content = OpenAIResponseInputMessageContentText(text=input)
+            input_content_item = OpenAIResponseMessage(
+                role="user",
+                content=[input_content],
+                id=new_input_id,
+            )
+            input_items_data = [input_content_item]
+        else:
+            for input_item in input:
+                if isinstance(input_item, OpenAIResponseMessage):
+                    input_item_dict = input_item.model_dump()
+                    if "id" not in input_item_dict:
+                        input_item_dict["id"] = new_input_id
+                    input_items_data.append(OpenAIResponseMessage(**input_item_dict))
+                else:
+                    input_items_data.append(input_item)
+
+        return input_items_data
+
+    async def _persist_streaming_state(
+        self,
+        stream_chunk: OpenAIResponseObjectStream,
+        orchestrator,
+        input_items: list[OpenAIResponseInput],
+        output_items: list,
+    ) -> None:
+        """Persist response state at significant streaming events.
+
+        This enables clients to poll GET /v1/responses/{response_id} during streaming
+        to see in-progress turn state instead of empty results.
+
+        Persistence occurs at:
+        - response.in_progress: Initial INSERT with empty output
+        - response.output_item.done: UPDATE with accumulated output items
+        - response.completed/response.incomplete: Final UPDATE with complete state
+        - response.failed: UPDATE with error state
+
+        :param stream_chunk: The current streaming event.
+        :param orchestrator: The streaming orchestrator (for snapshotting response).
+        :param input_items: Pre-prepared input items for storage.
+        :param output_items: Accumulated output items so far.
+        """
+        try:
+            match stream_chunk.type:
+                case "response.in_progress":
+                    # Initial persistence when response starts
+                    in_progress_response = stream_chunk.response
+                    await self.responses_store.upsert_response_object(
+                        response_object=in_progress_response,
+                        input=input_items,
+                        messages=[],
+                    )
+
+                case "response.output_item.done":
+                    # Incremental update when an output item completes (tool call, message)
+                    current_snapshot = orchestrator._snapshot_response(
+                        status="in_progress",
+                        outputs=output_items,
+                    )
+                    # Get current messages (filter out system messages)
+                    messages_to_store = list(
+                        filter(
+                            lambda x: not isinstance(x, OpenAISystemMessageParam),
+                            orchestrator.final_messages or orchestrator.ctx.messages,
+                        )
+                    )
+                    await self.responses_store.upsert_response_object(
+                        response_object=current_snapshot,
+                        input=input_items,
+                        messages=messages_to_store,
+                    )
+
+                case "response.completed" | "response.incomplete":
+                    # Final persistence when response finishes
+                    final_response = stream_chunk.response
+                    messages_to_store = list(
+                        filter(
+                            lambda x: not isinstance(x, OpenAISystemMessageParam),
+                            orchestrator.final_messages,
+                        )
+                    )
+                    await self.responses_store.upsert_response_object(
+                        response_object=final_response,
+                        input=input_items,
+                        messages=messages_to_store,
+                    )
+
+                case "response.failed":
+                    # Persist failed state so GET shows error
+                    failed_response = stream_chunk.response
+                    # Preserve any accumulated non-system messages for failed responses
+                    messages_to_store = list(
+                        filter(
+                            lambda x: not isinstance(x, OpenAISystemMessageParam),
+                            orchestrator.final_messages or orchestrator.ctx.messages,
+                        )
+                    )
+                    await self.responses_store.upsert_response_object(
+                        response_object=failed_response,
+                        input=input_items,
+                        messages=messages_to_store,
+                    )
+        except Exception as e:
+            # Best-effort persistence: log error but don't fail the stream
+            logger.warning(f"Failed to persist streaming state for {stream_chunk.type}: {e}")
+
     async def create_openai_response(
         self,
         input: str | list[OpenAIResponseInput],
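The _persist_streaming_state docstring above promises that an in-flight response is visible via GET /v1/responses/{response_id}. A client-side sketch of that polling loop follows; it assumes a Llama Stack server on the default port 8321 and the status values named in the docstring, and is illustrative rather than part of the diff.

import asyncio

import httpx

async def poll_response(response_id: str) -> None:
    async with httpx.AsyncClient(base_url="http://localhost:8321") as client:
        while True:
            body = (await client.get(f"/v1/responses/{response_id}")).json()
            # While the server streams, the intermediate upserts keep this view fresh.
            print(body["status"], len(body.get("output", [])), "output items")
            if body["status"] in ("completed", "incomplete", "failed"):
                break
            await asyncio.sleep(0.5)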
@@ -343,6 +469,8 @@ class OpenAIResponsesImpl:
         guardrails: list[str | ResponseGuardrailSpec] | None = None,
         parallel_tool_calls: bool | None = None,
         max_tool_calls: int | None = None,
+        reasoning: OpenAIResponseReasoning | None = None,
+        max_output_tokens: int | None = None,
         metadata: dict[str, str] | None = None,
     ):
         stream = bool(stream)
@@ -380,9 +508,6 @@ class OpenAIResponsesImpl:
             if not conversation.startswith("conv_"):
                 raise InvalidConversationIdError(conversation)
 
-        if max_tool_calls is not None and max_tool_calls < 1:
-            raise ValueError(f"Invalid {max_tool_calls=}; should be >= 1")
-
         stream_gen = self._create_streaming_response(
             input=input,
             conversation=conversation,
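The second hunk above drops the inline max_tool_calls >= 1 check. The diff does not show where (or whether) that constraint now lives; one natural home under the new request-model API would be a field constraint, sketched below. This is a hypothetical stand-in, not the actual CreateResponseRequest from llama_stack_api.

from pydantic import BaseModel, Field

class CreateResponseRequestSketch(BaseModel):
    # Hypothetical subset of fields, for illustration only.
    model: str
    input: str
    max_tool_calls: int | None = Field(default=None, ge=1)  # rejects values < 1 at parse time

# CreateResponseRequestSketch(model="m", input="hi", max_tool_calls=0) raises a ValidationError.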
@@ -399,6 +524,8 @@ class OpenAIResponsesImpl:
             guardrail_ids=guardrail_ids,
             parallel_tool_calls=parallel_tool_calls,
             max_tool_calls=max_tool_calls,
+            reasoning=reasoning,
+            max_output_tokens=max_output_tokens,
             metadata=metadata,
             include=include,
         )
@@ -454,6 +581,8 @@ class OpenAIResponsesImpl:
         guardrail_ids: list[str] | None = None,
         parallel_tool_calls: bool | None = True,
         max_tool_calls: int | None = None,
+        reasoning: OpenAIResponseReasoning | None = None,
+        max_output_tokens: int | None = None,
         metadata: dict[str, str] | None = None,
         include: list[ResponseItemInclude] | None = None,
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
@@ -493,42 +622,44 @@ class OpenAIResponsesImpl:
 
         # Create a per-request MCP session manager for session reuse (fix for #4452)
        # This avoids redundant tools/list calls when making multiple MCP tool invocations
-        mcp_session_manager = MCPSessionManager()
-
-        # Create a per-request ToolExecutor with the session manager
-        request_tool_executor = ToolExecutor(
-            tool_groups_api=self.tool_groups_api,
-            tool_runtime_api=self.tool_runtime_api,
-            vector_io_api=self.vector_io_api,
-            vector_stores_config=self.tool_executor.vector_stores_config,
-            mcp_session_manager=mcp_session_manager,
-        )
+        async with MCPSessionManager() as mcp_session_manager:
+            request_tool_executor = ToolExecutor(
+                tool_groups_api=self.tool_groups_api,
+                tool_runtime_api=self.tool_runtime_api,
+                vector_io_api=self.vector_io_api,
+                vector_stores_config=self.tool_executor.vector_stores_config,
+                mcp_session_manager=mcp_session_manager,
+            )
 
-        orchestrator = StreamingResponseOrchestrator(
-            inference_api=self.inference_api,
-            ctx=ctx,
-            response_id=response_id,
-            created_at=created_at,
-            prompt=prompt,
-            text=text,
-            max_infer_iters=max_infer_iters,
-            parallel_tool_calls=parallel_tool_calls,
-            tool_executor=request_tool_executor,
-            safety_api=self.safety_api,
-            guardrail_ids=guardrail_ids,
-            instructions=instructions,
-            max_tool_calls=max_tool_calls,
-            metadata=metadata,
-            include=include,
-        )
+            orchestrator = StreamingResponseOrchestrator(
+                inference_api=self.inference_api,
+                ctx=ctx,
+                response_id=response_id,
+                created_at=created_at,
+                prompt=prompt,
+                text=text,
+                max_infer_iters=max_infer_iters,
+                parallel_tool_calls=parallel_tool_calls,
+                tool_executor=request_tool_executor,
+                safety_api=self.safety_api,
+                connectors_api=self.connectors_api,
+                guardrail_ids=guardrail_ids,
+                instructions=instructions,
+                max_tool_calls=max_tool_calls,
+                reasoning=reasoning,
+                max_output_tokens=max_output_tokens,
+                metadata=metadata,
+                include=include,
+                store=store,
+            )
 
-        # Stream the response
-        final_response = None
-        failed_response = None
+            final_response = None
+            failed_response = None
+
+            output_items: list[ConversationItem] = []
+
+            input_items_for_storage = self._prepare_input_items_for_storage(all_input)
 
-        # Type as ConversationItem to avoid list invariance issues
-        output_items: list[ConversationItem] = []
-        try:
             async for stream_chunk in orchestrator.create_response():
                 match stream_chunk.type:
                     case "response.completed" | "response.incomplete":
@@ -541,6 +672,16 @@ class OpenAIResponsesImpl:
                     case _:
                         pass  # Other event types
 
+                # Incremental persistence: persist on significant state changes
+                # This enables clients to poll GET /v1/responses/{response_id} during streaming
+                if store:
+                    await self._persist_streaming_state(
+                        stream_chunk=stream_chunk,
+                        orchestrator=orchestrator,
+                        input_items=input_items_for_storage,
+                        output_items=output_items,
+                    )
+
                 # Store and sync before yielding terminal events
                 # This ensures the storage/syncing happens even if the consumer breaks after receiving the event
                 if (
@@ -548,32 +689,14 @@ class OpenAIResponsesImpl:
                     and final_response
                     and failed_response is None
                 ):
-                    messages_to_store = list(
-                        filter(lambda x: not isinstance(x, OpenAISystemMessageParam), orchestrator.final_messages)
-                    )
-                    if store:
-                        # TODO: we really should work off of output_items instead of "final_messages"
-                        await self._store_response(
-                            response=final_response,
-                            input=all_input,
-                            messages=messages_to_store,
-                        )
-
                     if conversation:
+                        messages_to_store = list(
+                            filter(lambda x: not isinstance(x, OpenAISystemMessageParam), orchestrator.final_messages)
+                        )
                         await self._sync_response_to_conversation(conversation, input, output_items)
                         await self.responses_store.store_conversation_messages(conversation, messages_to_store)
 
                 yield stream_chunk
-        finally:
-            # Clean up MCP sessions at the end of the request (fix for #4452)
-            # Use shield() to prevent cancellation from interrupting cleanup and leaking resources
-            # Wrap in try/except as cleanup errors should not mask the original response
-            try:
-                await asyncio.shield(mcp_session_manager.close_all())
-            except BaseException as e:
-                # Debug level - cleanup errors are expected in streaming scenarios where
-                # anyio cancel scopes may be in a different task context
-                logger.debug(f"Error during MCP session cleanup: {e}")
 
     async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
         return await self.responses_store.delete_response_object(response_id)
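The MCP hunks above replace manual cleanup (a finally block shielding mcp_session_manager.close_all()) with an async with MCPSessionManager() block, so teardown runs on normal exit, on errors, and when the generator is closed. A generic sketch of that pattern, assuming nothing about the real MCPSessionManager beyond what the diff shows:

class SessionManagerSketch:
    """Hypothetical stand-in illustrating the context-manager cleanup pattern."""

    def __init__(self) -> None:
        self._sessions: list = []

    async def __aenter__(self) -> "SessionManagerSketch":
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        # Runs on normal completion, exceptions, and generator close,
        # replacing the old try/finally + asyncio.shield() bookkeeping.
        for session in self._sessions:
            try:
                await session.close()  # assumed per-session close method
            except Exception:
                pass  # cleanup errors must not mask the original response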
@@ -596,4 +719,4 @@ class OpenAIResponsesImpl:
 
         adapter = TypeAdapter(list[ConversationItem])
         validated_items = adapter.validate_python(conversation_items)
-        await self.conversations_api.add_items(conversation_id, validated_items)
+        await self.conversations_api.add_items(conversation_id, AddItemsRequest(items=validated_items))
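The list_items and add_items hunks show the Conversations API moving in the same direction as Agents. A hedged sketch of the new call shape, with field names taken only from this diff; note that add_items keeps conversation_id as a positional argument while list_items carries it inside the request.

from llama_stack_api import AddItemsRequest, ListItemsRequest

async def replay_items(conversations, conversation_id: str, items: list) -> None:
    # conversation_id rides inside the request model for list_items...
    listed = await conversations.list_items(
        ListItemsRequest(conversation_id=conversation_id, order="asc")
    )
    # ...but stays a separate positional argument for add_items.
    await conversations.add_items(conversation_id, AddItemsRequest(items=items))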