llama-stack 0.4.4__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_stack/cli/stack/_list_deps.py +11 -7
- llama_stack/cli/stack/run.py +3 -25
- llama_stack/core/access_control/datatypes.py +78 -0
- llama_stack/core/configure.py +2 -2
- llama_stack/{distributions/meta-reference-gpu → core/connectors}/__init__.py +3 -1
- llama_stack/core/connectors/connectors.py +162 -0
- llama_stack/core/conversations/conversations.py +61 -58
- llama_stack/core/datatypes.py +54 -8
- llama_stack/core/library_client.py +60 -13
- llama_stack/core/prompts/prompts.py +43 -42
- llama_stack/core/routers/datasets.py +20 -17
- llama_stack/core/routers/eval_scoring.py +143 -53
- llama_stack/core/routers/inference.py +20 -9
- llama_stack/core/routers/safety.py +30 -42
- llama_stack/core/routers/vector_io.py +15 -7
- llama_stack/core/routing_tables/models.py +42 -3
- llama_stack/core/routing_tables/scoring_functions.py +19 -19
- llama_stack/core/routing_tables/shields.py +20 -17
- llama_stack/core/routing_tables/vector_stores.py +8 -5
- llama_stack/core/server/auth.py +192 -17
- llama_stack/core/server/fastapi_router_registry.py +40 -5
- llama_stack/core/server/server.py +24 -5
- llama_stack/core/stack.py +54 -10
- llama_stack/core/storage/datatypes.py +9 -0
- llama_stack/core/store/registry.py +1 -1
- llama_stack/core/utils/exec.py +2 -2
- llama_stack/core/utils/type_inspection.py +16 -2
- llama_stack/distributions/dell/config.yaml +4 -1
- llama_stack/distributions/dell/run-with-safety.yaml +4 -1
- llama_stack/distributions/nvidia/config.yaml +4 -1
- llama_stack/distributions/nvidia/run-with-safety.yaml +4 -1
- llama_stack/distributions/oci/config.yaml +4 -1
- llama_stack/distributions/open-benchmark/config.yaml +9 -1
- llama_stack/distributions/postgres-demo/config.yaml +1 -1
- llama_stack/distributions/starter/build.yaml +62 -0
- llama_stack/distributions/starter/config.yaml +22 -3
- llama_stack/distributions/starter/run-with-postgres-store.yaml +22 -3
- llama_stack/distributions/starter/starter.py +13 -1
- llama_stack/distributions/starter-gpu/build.yaml +62 -0
- llama_stack/distributions/starter-gpu/config.yaml +22 -3
- llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +22 -3
- llama_stack/distributions/template.py +10 -2
- llama_stack/distributions/watsonx/config.yaml +4 -1
- llama_stack/log.py +1 -0
- llama_stack/providers/inline/agents/meta_reference/__init__.py +1 -0
- llama_stack/providers/inline/agents/meta_reference/agents.py +57 -61
- llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +49 -51
- llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +94 -22
- llama_stack/providers/inline/agents/meta_reference/responses/types.py +2 -1
- llama_stack/providers/inline/agents/meta_reference/responses/utils.py +4 -1
- llama_stack/providers/inline/agents/meta_reference/safety.py +2 -2
- llama_stack/providers/inline/batches/reference/batches.py +2 -1
- llama_stack/providers/inline/eval/meta_reference/eval.py +40 -32
- llama_stack/providers/inline/post_training/huggingface/post_training.py +33 -38
- llama_stack/providers/inline/post_training/huggingface/utils.py +2 -5
- llama_stack/providers/inline/post_training/torchtune/post_training.py +28 -33
- llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +2 -4
- llama_stack/providers/inline/safety/code_scanner/code_scanner.py +12 -15
- llama_stack/providers/inline/safety/llama_guard/llama_guard.py +15 -18
- llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +11 -17
- llama_stack/providers/inline/scoring/basic/scoring.py +13 -17
- llama_stack/providers/inline/scoring/braintrust/braintrust.py +15 -15
- llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +13 -17
- llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +1 -1
- llama_stack/providers/registry/agents.py +1 -0
- llama_stack/providers/registry/inference.py +1 -9
- llama_stack/providers/registry/vector_io.py +136 -16
- llama_stack/providers/remote/eval/nvidia/eval.py +22 -21
- llama_stack/providers/remote/files/s3/config.py +5 -3
- llama_stack/providers/remote/files/s3/files.py +2 -2
- llama_stack/providers/remote/inference/gemini/gemini.py +4 -0
- llama_stack/providers/remote/inference/openai/openai.py +2 -0
- llama_stack/providers/remote/inference/together/together.py +4 -0
- llama_stack/providers/remote/inference/vertexai/config.py +3 -3
- llama_stack/providers/remote/inference/vertexai/vertexai.py +5 -2
- llama_stack/providers/remote/inference/vllm/config.py +37 -18
- llama_stack/providers/remote/inference/vllm/vllm.py +0 -3
- llama_stack/providers/remote/inference/watsonx/watsonx.py +4 -0
- llama_stack/providers/remote/post_training/nvidia/post_training.py +31 -33
- llama_stack/providers/remote/safety/bedrock/bedrock.py +10 -27
- llama_stack/providers/remote/safety/nvidia/nvidia.py +9 -25
- llama_stack/providers/remote/safety/sambanova/sambanova.py +13 -11
- llama_stack/providers/remote/vector_io/elasticsearch/__init__.py +17 -0
- llama_stack/providers/remote/vector_io/elasticsearch/config.py +32 -0
- llama_stack/providers/remote/vector_io/elasticsearch/elasticsearch.py +463 -0
- llama_stack/providers/remote/vector_io/oci/__init__.py +22 -0
- llama_stack/providers/remote/vector_io/oci/config.py +41 -0
- llama_stack/providers/remote/vector_io/oci/oci26ai.py +595 -0
- llama_stack/providers/remote/vector_io/pgvector/config.py +69 -2
- llama_stack/providers/remote/vector_io/pgvector/pgvector.py +255 -6
- llama_stack/providers/remote/vector_io/qdrant/qdrant.py +62 -38
- llama_stack/providers/utils/bedrock/client.py +3 -3
- llama_stack/providers/utils/bedrock/config.py +7 -7
- llama_stack/providers/utils/inference/embedding_mixin.py +4 -0
- llama_stack/providers/utils/inference/http_client.py +239 -0
- llama_stack/providers/utils/inference/litellm_openai_mixin.py +5 -0
- llama_stack/providers/utils/inference/model_registry.py +148 -2
- llama_stack/providers/utils/inference/openai_compat.py +2 -1
- llama_stack/providers/utils/inference/openai_mixin.py +41 -2
- llama_stack/providers/utils/memory/openai_vector_store_mixin.py +92 -5
- llama_stack/providers/utils/memory/vector_store.py +46 -19
- llama_stack/providers/utils/responses/responses_store.py +7 -7
- llama_stack/providers/utils/safety.py +114 -0
- llama_stack/providers/utils/tools/mcp.py +44 -3
- llama_stack/testing/api_recorder.py +9 -3
- {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/METADATA +14 -2
- {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/RECORD +111 -144
- llama_stack/distributions/meta-reference-gpu/config.yaml +0 -140
- llama_stack/distributions/meta-reference-gpu/doc_template.md +0 -119
- llama_stack/distributions/meta-reference-gpu/meta_reference.py +0 -163
- llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +0 -155
- llama_stack/models/llama/hadamard_utils.py +0 -88
- llama_stack/models/llama/llama3/args.py +0 -74
- llama_stack/models/llama/llama3/dog.jpg +0 -0
- llama_stack/models/llama/llama3/generation.py +0 -378
- llama_stack/models/llama/llama3/model.py +0 -304
- llama_stack/models/llama/llama3/multimodal/__init__.py +0 -12
- llama_stack/models/llama/llama3/multimodal/encoder_utils.py +0 -180
- llama_stack/models/llama/llama3/multimodal/image_transform.py +0 -409
- llama_stack/models/llama/llama3/multimodal/model.py +0 -1430
- llama_stack/models/llama/llama3/multimodal/utils.py +0 -26
- llama_stack/models/llama/llama3/pasta.jpeg +0 -0
- llama_stack/models/llama/llama3/quantization/__init__.py +0 -5
- llama_stack/models/llama/llama3/quantization/loader.py +0 -316
- llama_stack/models/llama/llama3_1/__init__.py +0 -12
- llama_stack/models/llama/llama3_1/prompt_format.md +0 -358
- llama_stack/models/llama/llama3_1/prompts.py +0 -258
- llama_stack/models/llama/llama3_2/__init__.py +0 -5
- llama_stack/models/llama/llama3_2/prompts_text.py +0 -229
- llama_stack/models/llama/llama3_2/prompts_vision.py +0 -126
- llama_stack/models/llama/llama3_2/text_prompt_format.md +0 -286
- llama_stack/models/llama/llama3_2/vision_prompt_format.md +0 -141
- llama_stack/models/llama/llama3_3/__init__.py +0 -5
- llama_stack/models/llama/llama3_3/prompts.py +0 -259
- llama_stack/models/llama/llama4/args.py +0 -107
- llama_stack/models/llama/llama4/ffn.py +0 -58
- llama_stack/models/llama/llama4/moe.py +0 -214
- llama_stack/models/llama/llama4/preprocess.py +0 -435
- llama_stack/models/llama/llama4/quantization/__init__.py +0 -5
- llama_stack/models/llama/llama4/quantization/loader.py +0 -226
- llama_stack/models/llama/llama4/vision/__init__.py +0 -5
- llama_stack/models/llama/llama4/vision/embedding.py +0 -210
- llama_stack/models/llama/llama4/vision/encoder.py +0 -412
- llama_stack/models/llama/quantize_impls.py +0 -316
- llama_stack/providers/inline/inference/meta_reference/__init__.py +0 -20
- llama_stack/providers/inline/inference/meta_reference/common.py +0 -24
- llama_stack/providers/inline/inference/meta_reference/config.py +0 -68
- llama_stack/providers/inline/inference/meta_reference/generators.py +0 -201
- llama_stack/providers/inline/inference/meta_reference/inference.py +0 -542
- llama_stack/providers/inline/inference/meta_reference/model_parallel.py +0 -77
- llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +0 -353
- {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/WHEEL +0 -0
- {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/entry_points.txt +0 -0
- {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
- {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/top_level.txt +0 -0

llama_stack/providers/inline/agents/meta_reference/agents.py

@@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+from collections.abc import AsyncIterator
 
 from llama_stack.core.datatypes import AccessRule
 from llama_stack.core.storage.kvstore import InmemoryKVStoreImpl, kvstore_impl
@@ -11,21 +12,21 @@ from llama_stack.log import get_logger
 from llama_stack.providers.utils.responses.responses_store import ResponsesStore
 from llama_stack_api import (
     Agents,
+    Connectors,
     Conversations,
+    CreateResponseRequest,
+    DeleteResponseRequest,
     Files,
     Inference,
     ListOpenAIResponseInputItem,
     ListOpenAIResponseObject,
+    ListResponseInputItemsRequest,
+    ListResponsesRequest,
     OpenAIDeleteResponseObject,
-    OpenAIResponseInput,
-    OpenAIResponseInputTool,
-    OpenAIResponseInputToolChoice,
     OpenAIResponseObject,
-
-    OpenAIResponseText,
-    Order,
+    OpenAIResponseObjectStream,
     Prompts,
-
+    RetrieveResponseRequest,
     Safety,
     ToolGroups,
     ToolRuntime,
@@ -50,6 +51,7 @@ class MetaReferenceAgentsImpl(Agents):
         conversations_api: Conversations,
         prompts_api: Prompts,
         files_api: Files,
+        connectors_api: Connectors,
         policy: list[AccessRule],
     ):
         self.config = config
@@ -64,6 +66,7 @@ class MetaReferenceAgentsImpl(Agents):
         self.in_memory_store = InmemoryKVStoreImpl()
         self.openai_responses_impl: OpenAIResponsesImpl | None = None
         self.policy = policy
+        self.connectors_api = connectors_api
 
     async def initialize(self) -> None:
         self.persistence_store = await kvstore_impl(self.config.persistence.agent_state)
@@ -80,6 +83,7 @@ class MetaReferenceAgentsImpl(Agents):
             prompts_api=self.prompts_api,
             files_api=self.files_api,
             vector_stores_config=self.config.vector_stores_config,
+            connectors_api=self.connectors_api,
         )
 
     async def shutdown(self) -> None:
@@ -88,79 +92,71 @@ class MetaReferenceAgentsImpl(Agents):
     # OpenAI responses
     async def get_openai_response(
         self,
-
+        request: RetrieveResponseRequest,
     ) -> OpenAIResponseObject:
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
-        return await self.openai_responses_impl.get_openai_response(response_id)
+        return await self.openai_responses_impl.get_openai_response(request.response_id)
 
     async def create_openai_response(
         self,
-
-
-
-
-
-
-
-        store: bool | None = True,
-        stream: bool | None = False,
-        temperature: float | None = None,
-        text: OpenAIResponseText | None = None,
-        tool_choice: OpenAIResponseInputToolChoice | None = None,
-        tools: list[OpenAIResponseInputTool] | None = None,
-        include: list[str] | None = None,
-        max_infer_iters: int | None = 10,
-        guardrails: list[ResponseGuardrail] | None = None,
-        max_tool_calls: int | None = None,
-        metadata: dict[str, str] | None = None,
-    ) -> OpenAIResponseObject:
+        request: CreateResponseRequest,
+    ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
+        """Create an OpenAI response.
+
+        Returns either a single response object (non-streaming) or an async iterator
+        yielding response stream events (streaming).
+        """
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
         result = await self.openai_responses_impl.create_openai_response(
-            input,
-            model,
-            prompt,
-            instructions,
-            previous_response_id,
-            conversation,
-            store,
-            stream,
-            temperature,
-            text,
-            tool_choice,
-            tools,
-            include,
-            max_infer_iters,
-            guardrails,
-            parallel_tool_calls,
-            max_tool_calls,
-
+            request.input,
+            request.model,
+            request.prompt,
+            request.instructions,
+            request.previous_response_id,
+            request.conversation,
+            request.store,
+            request.stream,
+            request.temperature,
+            request.text,
+            request.tool_choice,
+            request.tools,
+            request.include,
+            request.max_infer_iters,
+            request.guardrails,
+            request.parallel_tool_calls,
+            request.max_tool_calls,
+            request.max_output_tokens,
+            request.reasoning,
+            request.metadata,
         )
-        return result
+        return result
 
     async def list_openai_responses(
         self,
-
-        limit: int | None = 50,
-        model: str | None = None,
-        order: Order | None = Order.desc,
+        request: ListResponsesRequest,
     ) -> ListOpenAIResponseObject:
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
-        return await self.openai_responses_impl.list_openai_responses(
+        return await self.openai_responses_impl.list_openai_responses(
+            request.after, request.limit, request.model, request.order
+        )
 
     async def list_openai_response_input_items(
         self,
-
-        after: str | None = None,
-        before: str | None = None,
-        include: list[str] | None = None,
-        limit: int | None = 20,
-        order: Order | None = Order.desc,
+        request: ListResponseInputItemsRequest,
    ) -> ListOpenAIResponseInputItem:
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
         return await self.openai_responses_impl.list_openai_response_input_items(
-            response_id,
+            request.response_id,
+            request.after,
+            request.before,
+            request.include,
+            request.limit,
+            request.order,
         )
 
-    async def delete_openai_response(
+    async def delete_openai_response(
+        self,
+        request: DeleteResponseRequest,
+    ) -> OpenAIDeleteResponseObject:
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
-        return await self.openai_responses_impl.delete_openai_response(response_id)
+        return await self.openai_responses_impl.delete_openai_response(request.response_id)
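
For orientation, below is a hedged sketch of how a caller might use the new request-object methods shown in the agents.py hunks above. The request class names (`CreateResponseRequest`, `RetrieveResponseRequest`) and the `request.*` field names come from the diff; the exact constructor signatures, the `agents` handle, and the model name are assumptions for illustration only, not part of the package.

```python
from llama_stack_api import CreateResponseRequest, RetrieveResponseRequest


async def fetch_roundtrip(agents):
    # `agents` is assumed to be a configured MetaReferenceAgentsImpl instance.
    # The keyword arguments mirror the request.* fields read in create_openai_response,
    # but the precise CreateResponseRequest constructor is an assumption.
    created = await agents.create_openai_response(
        CreateResponseRequest(model="example-model", input="Say hello", stream=False)
    )
    # Retrieval now also goes through a request object instead of a bare response_id.
    return await agents.get_openai_response(RetrieveResponseRequest(response_id=created.id))
```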

llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py

@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-import asyncio
 import re
 import time
 import uuid
@@ -19,11 +18,14 @@ from llama_stack.providers.utils.responses.responses_store import (
 )
 from llama_stack.providers.utils.tools.mcp import MCPSessionManager
 from llama_stack_api import (
+    AddItemsRequest,
+    Connectors,
     ConversationItem,
     Conversations,
     Files,
     Inference,
     InvalidConversationIdError,
+    ListItemsRequest,
     ListOpenAIResponseInputItem,
     ListOpenAIResponseObject,
     OpenAIChatCompletionContentPartParam,
@@ -39,6 +41,7 @@ from llama_stack_api import (
     OpenAIResponseObject,
     OpenAIResponseObjectStream,
     OpenAIResponsePrompt,
+    OpenAIResponseReasoning,
     OpenAIResponseText,
     OpenAIResponseTextFormat,
     OpenAISystemMessageParam,
@@ -83,6 +86,7 @@ class OpenAIResponsesImpl:
         conversations_api: Conversations,
         prompts_api: Prompts,
         files_api: Files,
+        connectors_api: Connectors,
         vector_stores_config=None,
     ):
         self.inference_api = inference_api
@@ -100,6 +104,7 @@ class OpenAIResponsesImpl:
         )
         self.prompts_api = prompts_api
         self.files_api = files_api
+        self.connectors_api = connectors_api
 
     async def _prepend_previous_response(
         self,
@@ -150,7 +155,9 @@ class OpenAIResponsesImpl:
 
             tool_context.recover_tools_from_previous_response(previous_response)
         elif conversation is not None:
-            conversation_items = await self.conversations_api.list_items(
+            conversation_items = await self.conversations_api.list_items(
+                ListItemsRequest(conversation_id=conversation, order="asc")
+            )
 
             # Use stored messages as source of truth (like previous_response.messages)
             stored_messages = await self.responses_store.get_conversation_messages(conversation)
@@ -462,6 +469,8 @@ class OpenAIResponsesImpl:
         guardrails: list[str | ResponseGuardrailSpec] | None = None,
         parallel_tool_calls: bool | None = None,
         max_tool_calls: int | None = None,
+        reasoning: OpenAIResponseReasoning | None = None,
+        max_output_tokens: int | None = None,
         metadata: dict[str, str] | None = None,
     ):
         stream = bool(stream)
@@ -499,9 +508,6 @@ class OpenAIResponsesImpl:
             if not conversation.startswith("conv_"):
                 raise InvalidConversationIdError(conversation)
 
-        if max_tool_calls is not None and max_tool_calls < 1:
-            raise ValueError(f"Invalid {max_tool_calls=}; should be >= 1")
-
         stream_gen = self._create_streaming_response(
             input=input,
             conversation=conversation,
@@ -518,6 +524,8 @@ class OpenAIResponsesImpl:
             guardrail_ids=guardrail_ids,
             parallel_tool_calls=parallel_tool_calls,
             max_tool_calls=max_tool_calls,
+            reasoning=reasoning,
+            max_output_tokens=max_output_tokens,
             metadata=metadata,
             include=include,
         )
@@ -573,6 +581,8 @@ class OpenAIResponsesImpl:
         guardrail_ids: list[str] | None = None,
         parallel_tool_calls: bool | None = True,
         max_tool_calls: int | None = None,
+        reasoning: OpenAIResponseReasoning | None = None,
+        max_output_tokens: int | None = None,
         metadata: dict[str, str] | None = None,
         include: list[ResponseItemInclude] | None = None,
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
@@ -612,46 +622,44 @@ class OpenAIResponsesImpl:
 
         # Create a per-request MCP session manager for session reuse (fix for #4452)
         # This avoids redundant tools/list calls when making multiple MCP tool invocations
-
-
-
-
-
-
-
-
-            mcp_session_manager=mcp_session_manager,
-        )
+        async with MCPSessionManager() as mcp_session_manager:
+            request_tool_executor = ToolExecutor(
+                tool_groups_api=self.tool_groups_api,
+                tool_runtime_api=self.tool_runtime_api,
+                vector_io_api=self.vector_io_api,
+                vector_stores_config=self.tool_executor.vector_stores_config,
+                mcp_session_manager=mcp_session_manager,
+            )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            orchestrator = StreamingResponseOrchestrator(
+                inference_api=self.inference_api,
+                ctx=ctx,
+                response_id=response_id,
+                created_at=created_at,
+                prompt=prompt,
+                text=text,
+                max_infer_iters=max_infer_iters,
+                parallel_tool_calls=parallel_tool_calls,
+                tool_executor=request_tool_executor,
+                safety_api=self.safety_api,
+                connectors_api=self.connectors_api,
+                guardrail_ids=guardrail_ids,
+                instructions=instructions,
+                max_tool_calls=max_tool_calls,
+                reasoning=reasoning,
+                max_output_tokens=max_output_tokens,
+                metadata=metadata,
+                include=include,
+                store=store,
+            )
 
-
-
-        failed_response = None
+            final_response = None
+            failed_response = None
 
-
-        output_items: list[ConversationItem] = []
+            output_items: list[ConversationItem] = []
 
-
-        input_items_for_storage = self._prepare_input_items_for_storage(all_input)
+            input_items_for_storage = self._prepare_input_items_for_storage(all_input)
 
-        try:
             async for stream_chunk in orchestrator.create_response():
                 match stream_chunk.type:
                     case "response.completed" | "response.incomplete":
@@ -689,16 +697,6 @@ class OpenAIResponsesImpl:
                             await self.responses_store.store_conversation_messages(conversation, messages_to_store)
 
                 yield stream_chunk
-        finally:
-            # Clean up MCP sessions at the end of the request (fix for #4452)
-            # Use shield() to prevent cancellation from interrupting cleanup and leaking resources
-            # Wrap in try/except as cleanup errors should not mask the original response
-            try:
-                await asyncio.shield(mcp_session_manager.close_all())
-            except BaseException as e:
-                # Debug level - cleanup errors are expected in streaming scenarios where
-                # anyio cancel scopes may be in a different task context
-                logger.debug(f"Error during MCP session cleanup: {e}")
 
     async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
         return await self.responses_store.delete_response_object(response_id)
@@ -721,4 +719,4 @@ class OpenAIResponsesImpl:
 
         adapter = TypeAdapter(list[ConversationItem])
         validated_items = adapter.validate_python(conversation_items)
-        await self.conversations_api.add_items(conversation_id, validated_items)
+        await self.conversations_api.add_items(conversation_id, AddItemsRequest(items=validated_items))
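
The openai_responses.py hunks above replace the manual try/finally cleanup with `asyncio.shield(mcp_session_manager.close_all())` by scoping the session manager in an `async with MCPSessionManager()` block. A minimal, self-contained sketch of that lifecycle pattern follows; it uses a stand-in manager rather than the real MCPSessionManager, whose internals are not shown in the diff.

```python
import asyncio


class FakeSessionManager:
    """Stand-in for an MCP session manager that owns per-request sessions."""

    async def close_all(self) -> None:
        print("closing sessions")

    async def __aenter__(self) -> "FakeSessionManager":
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        # Shielding cleanup mirrors the intent of the removed finally block:
        # cancellation should not leak sessions.
        await asyncio.shield(self.close_all())


async def handle_request() -> None:
    async with FakeSessionManager() as mgr:
        # All tool executions for this request reuse `mgr`; cleanup happens
        # automatically when the block exits, so no explicit finally is needed.
        _ = mgr


asyncio.run(handle_request())
```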

llama_stack/providers/inline/agents/meta_reference/responses/streaming.py

@@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+import time
 import uuid
 from collections.abc import AsyncIterator
 from typing import Any
@@ -16,6 +17,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import interleaved_con
 from llama_stack_api import (
     AllowedToolsFilter,
     ApprovalFilter,
+    Connectors,
     Inference,
     MCPListToolsTool,
     ModelNotFoundError,
@@ -30,6 +32,7 @@ from llama_stack_api import (
     OpenAIChatCompletionToolChoiceFunctionTool,
     OpenAIChoice,
     OpenAIChoiceLogprobs,
+    OpenAIFinishReason,
     OpenAIMessageParam,
     OpenAIResponseContentPartOutputText,
     OpenAIResponseContentPartReasoningText,
@@ -77,6 +80,7 @@ from llama_stack_api import (
     OpenAIResponseOutputMessageMCPListTools,
     OpenAIResponseOutputMessageWebSearchToolCall,
     OpenAIResponsePrompt,
+    OpenAIResponseReasoning,
     OpenAIResponseText,
     OpenAIResponseUsage,
     OpenAIResponseUsageInputTokensDetails,
@@ -133,11 +137,15 @@ class StreamingResponseOrchestrator:
         instructions: str | None,
         safety_api: Safety | None,
         guardrail_ids: list[str] | None = None,
+        connectors_api: Connectors | None = None,
         prompt: OpenAIResponsePrompt | None = None,
         parallel_tool_calls: bool | None = None,
         max_tool_calls: int | None = None,
+        reasoning: OpenAIResponseReasoning | None = None,
+        max_output_tokens: int | None = None,
         metadata: dict[str, str] | None = None,
         include: list[ResponseItemInclude] | None = None,
+        store: bool | None = True,
     ):
         self.inference_api = inference_api
         self.ctx = ctx
@@ -147,6 +155,7 @@ class StreamingResponseOrchestrator:
         self.max_infer_iters = max_infer_iters
         self.tool_executor = tool_executor
         self.safety_api = safety_api
+        self.connectors_api = connectors_api
         self.guardrail_ids = guardrail_ids or []
         self.prompt = prompt
         # System message that is inserted into the model's context
@@ -155,8 +164,13 @@ class StreamingResponseOrchestrator:
         self.parallel_tool_calls = parallel_tool_calls
         # Max number of total calls to built-in tools that can be processed in a response
         self.max_tool_calls = max_tool_calls
+        self.reasoning = reasoning
+        # An upper bound for the number of tokens that can be generated for a response
+        self.max_output_tokens = max_output_tokens
         self.metadata = metadata
+        self.store = store
         self.include = include
+        self.store = bool(store) if store is not None else True
         self.sequence_number = 0
         # Store MCP tool mapping that gets built during tool processing
         self.mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] = (
@@ -179,6 +193,8 @@ class StreamingResponseOrchestrator:
         self.violation_detected = False
         # Track total calls made to built-in tools
         self.accumulated_builtin_tool_calls = 0
+        # Track total output tokens generated across inference calls
+        self.accumulated_builtin_output_tokens = 0
 
     async def _create_refusal_response(self, violation_message: str) -> OpenAIResponseObjectStream:
         """Create a refusal response to replace streaming content."""
@@ -191,7 +207,9 @@ class StreamingResponseOrchestrator:
             model=self.ctx.model,
             status="completed",
             output=[OpenAIResponseMessage(role="assistant", content=[refusal_content], type="message")],
+            max_output_tokens=self.max_output_tokens,
             metadata=self.metadata,
+            store=self.store,
         )
 
         return OpenAIResponseObjectStreamResponseCompleted(response=refusal_response)
@@ -212,8 +230,10 @@ class StreamingResponseOrchestrator:
         *,
         error: OpenAIResponseError | None = None,
     ) -> OpenAIResponseObject:
+        completed_at = int(time.time()) if status == "completed" else None
         return OpenAIResponseObject(
             created_at=self.created_at,
+            completed_at=completed_at,
             id=self.response_id,
             model=self.ctx.model,
             object="response",
@@ -228,7 +248,10 @@ class StreamingResponseOrchestrator:
             prompt=self.prompt,
             parallel_tool_calls=self.parallel_tool_calls,
             max_tool_calls=self.max_tool_calls,
+            reasoning=self.reasoning,
+            max_output_tokens=self.max_output_tokens,
             metadata=self.metadata,
+            store=self.store,
         )
 
     async def create_response(self) -> AsyncIterator[OpenAIResponseObjectStream]:
@@ -292,6 +315,22 @@ class StreamingResponseOrchestrator:
 
         try:
             while True:
+                if (
+                    self.max_output_tokens is not None
+                    and self.accumulated_builtin_output_tokens >= self.max_output_tokens
+                ):
+                    logger.info(
+                        "Skipping inference call since max_output_tokens reached: "
+                        f"{self.accumulated_builtin_output_tokens}/{self.max_output_tokens}"
+                    )
+                    final_status = "incomplete"
+                    break
+
+                remaining_output_tokens = (
+                    self.max_output_tokens - self.accumulated_builtin_output_tokens
+                    if self.max_output_tokens is not None
+                    else None
+                )
                 # Text is the default response format for chat completion so don't need to pass it
                 # (some providers don't support non-empty response_format when tools are present)
                 response_format = (
@@ -311,6 +350,11 @@ class StreamingResponseOrchestrator:
                     True if self.include and ResponseItemInclude.message_output_text_logprobs in self.include else None
                 )
 
+                # In OpenAI, parallel_tool_calls is only allowed when 'tools' are specified.
+                effective_parallel_tool_calls = (
+                    self.parallel_tool_calls if effective_tools is not None and len(effective_tools) > 0 else None
+                )
+
                 params = OpenAIChatCompletionRequestWithExtraBody(
                     model=self.ctx.model,
                     messages=messages,
@@ -324,6 +368,9 @@ class StreamingResponseOrchestrator:
                         "include_usage": True,
                     },
                     logprobs=logprobs,
+                    parallel_tool_calls=effective_parallel_tool_calls,
+                    reasoning_effort=self.reasoning.effort if self.reasoning else None,
+                    max_completion_tokens=remaining_output_tokens,
                 )
                 completion_result = await self.inference_api.openai_chat_completion(params)
 
@@ -480,23 +527,24 @@ class StreamingResponseOrchestrator:
         if not chunk.usage:
             return
 
+        self.accumulated_builtin_output_tokens += chunk.usage.completion_tokens
+
         if self.accumulated_usage is None:
             # Convert from chat completion format to response format
             self.accumulated_usage = OpenAIResponseUsage(
                 input_tokens=chunk.usage.prompt_tokens,
                 output_tokens=chunk.usage.completion_tokens,
                 total_tokens=chunk.usage.total_tokens,
-                input_tokens_details=(
-
-                    if chunk.usage.prompt_tokens_details
-                    else
+                input_tokens_details=OpenAIResponseUsageInputTokensDetails(
+                    cached_tokens=chunk.usage.prompt_tokens_details.cached_tokens
+                    if chunk.usage.prompt_tokens_details and chunk.usage.prompt_tokens_details.cached_tokens is not None
+                    else 0
                 ),
-                output_tokens_details=(
-
-                    reasoning_tokens=chunk.usage.completion_tokens_details.reasoning_tokens
-                )
+                output_tokens_details=OpenAIResponseUsageOutputTokensDetails(
+                    reasoning_tokens=chunk.usage.completion_tokens_details.reasoning_tokens
                     if chunk.usage.completion_tokens_details
-
+                    and chunk.usage.completion_tokens_details.reasoning_tokens is not None
+                    else 0
                 ),
             )
         else:
@@ -506,17 +554,16 @@ class StreamingResponseOrchestrator:
                 output_tokens=self.accumulated_usage.output_tokens + chunk.usage.completion_tokens,
                 total_tokens=self.accumulated_usage.total_tokens + chunk.usage.total_tokens,
                 # Use latest non-null details
-                input_tokens_details=(
-
-                    if chunk.usage.prompt_tokens_details
-                    else self.accumulated_usage.input_tokens_details
+                input_tokens_details=OpenAIResponseUsageInputTokensDetails(
+                    cached_tokens=chunk.usage.prompt_tokens_details.cached_tokens
+                    if chunk.usage.prompt_tokens_details and chunk.usage.prompt_tokens_details.cached_tokens is not None
+                    else self.accumulated_usage.input_tokens_details.cached_tokens
                 ),
-                output_tokens_details=(
-
-                    reasoning_tokens=chunk.usage.completion_tokens_details.reasoning_tokens
-                )
+                output_tokens_details=OpenAIResponseUsageOutputTokensDetails(
+                    reasoning_tokens=chunk.usage.completion_tokens_details.reasoning_tokens
                     if chunk.usage.completion_tokens_details
-
+                    and chunk.usage.completion_tokens_details.reasoning_tokens is not None
+                    else self.accumulated_usage.output_tokens_details.reasoning_tokens
                 ),
             )
 
@@ -652,7 +699,7 @@ class StreamingResponseOrchestrator:
         chat_response_tool_calls: dict[int, OpenAIChatCompletionToolCall] = {}
         chunk_created = 0
         chunk_model = ""
-        chunk_finish_reason = ""
+        chunk_finish_reason: OpenAIFinishReason = "stop"
         chat_response_logprobs = []
 
         # Create a placeholder message item for delta events
@@ -744,9 +791,9 @@ class StreamingResponseOrchestrator:
                     chunk_finish_reason = chunk_choice.finish_reason
 
                 # Handle reasoning content if present (non-standard field for o1/o3 models)
-                if hasattr(chunk_choice.delta, "
+                if hasattr(chunk_choice.delta, "reasoning") and chunk_choice.delta.reasoning:
                     async for event in self._handle_reasoning_content_chunk(
-                        reasoning_content=chunk_choice.delta.
+                        reasoning_content=chunk_choice.delta.reasoning,
                         reasoning_part_emitted=reasoning_part_emitted,
                         reasoning_content_index=reasoning_content_index,
                         message_item_id=message_item_id,
@@ -758,7 +805,7 @@ class StreamingResponseOrchestrator:
                         else:
                             yield event
                     reasoning_part_emitted = True
-                    reasoning_text_accumulated.append(chunk_choice.delta.
+                    reasoning_text_accumulated.append(chunk_choice.delta.reasoning)
 
                 # Handle refusal content if present
                 if chunk_choice.delta.refusal:
@@ -1175,6 +1222,9 @@ class StreamingResponseOrchestrator:
         """Process an MCP tool configuration and emit appropriate streaming events."""
         from llama_stack.providers.utils.tools.mcp import list_mcp_tools
 
+        # Resolve connector_id to server_url if provided
+        mcp_tool = await resolve_mcp_connector_id(mcp_tool, self.connectors_api)
+
         # Emit mcp_list_tools.in_progress
         self.sequence_number += 1
         yield OpenAIResponseObjectStreamResponseMcpListToolsInProgress(
@@ -1489,3 +1539,25 @@ async def _process_tool_choice(
         tools=tool_choice,
         mode="required",
     )
+
+
+async def resolve_mcp_connector_id(
+    mcp_tool: OpenAIResponseInputToolMCP,
+    connectors_api: Connectors,
+) -> OpenAIResponseInputToolMCP:
+    """Resolve connector_id to server_url for an MCP tool.
+
+    If the mcp_tool has a connector_id but no server_url, this function
+    looks up the connector and populates the server_url from it.
+
+    Args:
+        mcp_tool: The MCP tool configuration to resolve
+        connectors_api: The connectors API for looking up connectors
+
+    Returns:
+        The mcp_tool with server_url populated (may be same instance if already set)
+    """
+    if mcp_tool.connector_id and not mcp_tool.server_url:
+        connector = await connectors_api.get_connector(mcp_tool.connector_id)
+        return mcp_tool.model_copy(update={"server_url": connector.url})
+    return mcp_tool