llama-stack 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. llama_stack/cli/stack/_list_deps.py +11 -7
  2. llama_stack/cli/stack/run.py +3 -25
  3. llama_stack/core/access_control/datatypes.py +78 -0
  4. llama_stack/core/configure.py +2 -2
  5. llama_stack/{distributions/meta-reference-gpu → core/connectors}/__init__.py +3 -1
  6. llama_stack/core/connectors/connectors.py +162 -0
  7. llama_stack/core/conversations/conversations.py +61 -58
  8. llama_stack/core/datatypes.py +54 -8
  9. llama_stack/core/library_client.py +60 -13
  10. llama_stack/core/prompts/prompts.py +43 -42
  11. llama_stack/core/routers/datasets.py +20 -17
  12. llama_stack/core/routers/eval_scoring.py +143 -53
  13. llama_stack/core/routers/inference.py +20 -9
  14. llama_stack/core/routers/safety.py +30 -42
  15. llama_stack/core/routers/vector_io.py +15 -7
  16. llama_stack/core/routing_tables/models.py +42 -3
  17. llama_stack/core/routing_tables/scoring_functions.py +19 -19
  18. llama_stack/core/routing_tables/shields.py +20 -17
  19. llama_stack/core/routing_tables/vector_stores.py +8 -5
  20. llama_stack/core/server/auth.py +192 -17
  21. llama_stack/core/server/fastapi_router_registry.py +40 -5
  22. llama_stack/core/server/server.py +24 -5
  23. llama_stack/core/stack.py +54 -10
  24. llama_stack/core/storage/datatypes.py +9 -0
  25. llama_stack/core/store/registry.py +1 -1
  26. llama_stack/core/utils/exec.py +2 -2
  27. llama_stack/core/utils/type_inspection.py +16 -2
  28. llama_stack/distributions/dell/config.yaml +4 -1
  29. llama_stack/distributions/dell/run-with-safety.yaml +4 -1
  30. llama_stack/distributions/nvidia/config.yaml +4 -1
  31. llama_stack/distributions/nvidia/run-with-safety.yaml +4 -1
  32. llama_stack/distributions/oci/config.yaml +4 -1
  33. llama_stack/distributions/open-benchmark/config.yaml +9 -1
  34. llama_stack/distributions/postgres-demo/config.yaml +1 -1
  35. llama_stack/distributions/starter/build.yaml +62 -0
  36. llama_stack/distributions/starter/config.yaml +22 -3
  37. llama_stack/distributions/starter/run-with-postgres-store.yaml +22 -3
  38. llama_stack/distributions/starter/starter.py +13 -1
  39. llama_stack/distributions/starter-gpu/build.yaml +62 -0
  40. llama_stack/distributions/starter-gpu/config.yaml +22 -3
  41. llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +22 -3
  42. llama_stack/distributions/template.py +10 -2
  43. llama_stack/distributions/watsonx/config.yaml +4 -1
  44. llama_stack/log.py +1 -0
  45. llama_stack/providers/inline/agents/meta_reference/__init__.py +1 -0
  46. llama_stack/providers/inline/agents/meta_reference/agents.py +58 -61
  47. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +53 -51
  48. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +99 -22
  49. llama_stack/providers/inline/agents/meta_reference/responses/types.py +2 -1
  50. llama_stack/providers/inline/agents/meta_reference/responses/utils.py +4 -1
  51. llama_stack/providers/inline/agents/meta_reference/safety.py +2 -2
  52. llama_stack/providers/inline/batches/reference/batches.py +2 -1
  53. llama_stack/providers/inline/eval/meta_reference/eval.py +40 -32
  54. llama_stack/providers/inline/post_training/huggingface/post_training.py +33 -38
  55. llama_stack/providers/inline/post_training/huggingface/utils.py +2 -5
  56. llama_stack/providers/inline/post_training/torchtune/common/utils.py +5 -9
  57. llama_stack/providers/inline/post_training/torchtune/post_training.py +28 -33
  58. llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +2 -4
  59. llama_stack/providers/inline/safety/code_scanner/code_scanner.py +12 -15
  60. llama_stack/providers/inline/safety/llama_guard/llama_guard.py +20 -24
  61. llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +11 -17
  62. llama_stack/providers/inline/scoring/basic/scoring.py +13 -17
  63. llama_stack/providers/inline/scoring/braintrust/braintrust.py +15 -15
  64. llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +13 -17
  65. llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +1 -1
  66. llama_stack/providers/registry/agents.py +1 -0
  67. llama_stack/providers/registry/inference.py +1 -9
  68. llama_stack/providers/registry/vector_io.py +136 -16
  69. llama_stack/providers/remote/eval/nvidia/eval.py +22 -21
  70. llama_stack/providers/remote/files/s3/config.py +5 -3
  71. llama_stack/providers/remote/files/s3/files.py +2 -2
  72. llama_stack/providers/remote/inference/gemini/gemini.py +4 -0
  73. llama_stack/providers/remote/inference/openai/openai.py +2 -0
  74. llama_stack/providers/remote/inference/together/together.py +4 -0
  75. llama_stack/providers/remote/inference/vertexai/config.py +3 -3
  76. llama_stack/providers/remote/inference/vertexai/vertexai.py +5 -2
  77. llama_stack/providers/remote/inference/vllm/config.py +37 -18
  78. llama_stack/providers/remote/inference/vllm/vllm.py +0 -3
  79. llama_stack/providers/remote/inference/watsonx/watsonx.py +4 -0
  80. llama_stack/providers/remote/post_training/nvidia/models.py +3 -11
  81. llama_stack/providers/remote/post_training/nvidia/post_training.py +31 -33
  82. llama_stack/providers/remote/safety/bedrock/bedrock.py +10 -27
  83. llama_stack/providers/remote/safety/nvidia/nvidia.py +9 -25
  84. llama_stack/providers/remote/safety/sambanova/sambanova.py +13 -11
  85. llama_stack/providers/remote/vector_io/elasticsearch/__init__.py +17 -0
  86. llama_stack/providers/remote/vector_io/elasticsearch/config.py +32 -0
  87. llama_stack/providers/remote/vector_io/elasticsearch/elasticsearch.py +463 -0
  88. llama_stack/providers/remote/vector_io/oci/__init__.py +22 -0
  89. llama_stack/providers/remote/vector_io/oci/config.py +41 -0
  90. llama_stack/providers/remote/vector_io/oci/oci26ai.py +595 -0
  91. llama_stack/providers/remote/vector_io/pgvector/config.py +69 -2
  92. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +255 -6
  93. llama_stack/providers/remote/vector_io/qdrant/qdrant.py +62 -38
  94. llama_stack/providers/utils/bedrock/client.py +3 -3
  95. llama_stack/providers/utils/bedrock/config.py +7 -7
  96. llama_stack/providers/utils/inference/__init__.py +0 -25
  97. llama_stack/providers/utils/inference/embedding_mixin.py +4 -0
  98. llama_stack/providers/utils/inference/http_client.py +239 -0
  99. llama_stack/providers/utils/inference/litellm_openai_mixin.py +6 -0
  100. llama_stack/providers/utils/inference/model_registry.py +148 -2
  101. llama_stack/providers/utils/inference/openai_compat.py +1 -158
  102. llama_stack/providers/utils/inference/openai_mixin.py +42 -2
  103. llama_stack/providers/utils/inference/prompt_adapter.py +0 -209
  104. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +92 -5
  105. llama_stack/providers/utils/memory/vector_store.py +46 -19
  106. llama_stack/providers/utils/responses/responses_store.py +7 -7
  107. llama_stack/providers/utils/safety.py +114 -0
  108. llama_stack/providers/utils/tools/mcp.py +44 -3
  109. llama_stack/testing/api_recorder.py +9 -3
  110. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0.dist-info}/METADATA +14 -2
  111. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0.dist-info}/RECORD +115 -148
  112. llama_stack/distributions/meta-reference-gpu/config.yaml +0 -140
  113. llama_stack/distributions/meta-reference-gpu/doc_template.md +0 -119
  114. llama_stack/distributions/meta-reference-gpu/meta_reference.py +0 -163
  115. llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +0 -155
  116. llama_stack/models/llama/hadamard_utils.py +0 -88
  117. llama_stack/models/llama/llama3/args.py +0 -74
  118. llama_stack/models/llama/llama3/dog.jpg +0 -0
  119. llama_stack/models/llama/llama3/generation.py +0 -378
  120. llama_stack/models/llama/llama3/model.py +0 -304
  121. llama_stack/models/llama/llama3/multimodal/__init__.py +0 -12
  122. llama_stack/models/llama/llama3/multimodal/encoder_utils.py +0 -180
  123. llama_stack/models/llama/llama3/multimodal/image_transform.py +0 -409
  124. llama_stack/models/llama/llama3/multimodal/model.py +0 -1430
  125. llama_stack/models/llama/llama3/multimodal/utils.py +0 -26
  126. llama_stack/models/llama/llama3/pasta.jpeg +0 -0
  127. llama_stack/models/llama/llama3/quantization/__init__.py +0 -5
  128. llama_stack/models/llama/llama3/quantization/loader.py +0 -316
  129. llama_stack/models/llama/llama3_1/__init__.py +0 -12
  130. llama_stack/models/llama/llama3_1/prompt_format.md +0 -358
  131. llama_stack/models/llama/llama3_1/prompts.py +0 -258
  132. llama_stack/models/llama/llama3_2/__init__.py +0 -5
  133. llama_stack/models/llama/llama3_2/prompts_text.py +0 -229
  134. llama_stack/models/llama/llama3_2/prompts_vision.py +0 -126
  135. llama_stack/models/llama/llama3_2/text_prompt_format.md +0 -286
  136. llama_stack/models/llama/llama3_2/vision_prompt_format.md +0 -141
  137. llama_stack/models/llama/llama3_3/__init__.py +0 -5
  138. llama_stack/models/llama/llama3_3/prompts.py +0 -259
  139. llama_stack/models/llama/llama4/args.py +0 -107
  140. llama_stack/models/llama/llama4/ffn.py +0 -58
  141. llama_stack/models/llama/llama4/moe.py +0 -214
  142. llama_stack/models/llama/llama4/preprocess.py +0 -435
  143. llama_stack/models/llama/llama4/quantization/__init__.py +0 -5
  144. llama_stack/models/llama/llama4/quantization/loader.py +0 -226
  145. llama_stack/models/llama/llama4/vision/__init__.py +0 -5
  146. llama_stack/models/llama/llama4/vision/embedding.py +0 -210
  147. llama_stack/models/llama/llama4/vision/encoder.py +0 -412
  148. llama_stack/models/llama/quantize_impls.py +0 -316
  149. llama_stack/providers/inline/inference/meta_reference/__init__.py +0 -20
  150. llama_stack/providers/inline/inference/meta_reference/common.py +0 -24
  151. llama_stack/providers/inline/inference/meta_reference/config.py +0 -68
  152. llama_stack/providers/inline/inference/meta_reference/generators.py +0 -201
  153. llama_stack/providers/inline/inference/meta_reference/inference.py +0 -542
  154. llama_stack/providers/inline/inference/meta_reference/model_parallel.py +0 -77
  155. llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +0 -353
  156. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0.dist-info}/WHEEL +0 -0
  157. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0.dist-info}/entry_points.txt +0 -0
  158. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0.dist-info}/licenses/LICENSE +0 -0
  159. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0.dist-info}/top_level.txt +0 -0
llama_stack/providers/inline/agents/meta_reference/__init__.py

@@ -28,6 +28,7 @@ async def get_provider_impl(
         deps[Api.conversations],
         deps[Api.prompts],
         deps[Api.files],
+        deps[Api.connectors],
         policy,
     )
     await impl.initialize()
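For orientation, the hunk above wires a new `Api.connectors` dependency into the agents provider factory. Below is a hedged sketch of the general deps-dictionary pattern that `get_provider_impl` relies on; `Api`, `StubAgentsImpl`, and the config type are illustrative stand-ins, not the real llama_stack classes.

```python
# Hypothetical sketch of how a provider factory receives dependencies keyed by an Api enum.
from enum import Enum
from typing import Any


class Api(Enum):
    conversations = "conversations"
    prompts = "prompts"
    files = "files"
    connectors = "connectors"  # newly required by the agents provider in 0.5.0


class StubAgentsImpl:
    def __init__(self, config: dict[str, Any], deps: dict[Api, Any]) -> None:
        self.config = config
        # Each declared dependency is looked up by its Api key.
        self.connectors_api = deps[Api.connectors]

    async def initialize(self) -> None:
        # Real providers open persistence stores and build sub-components here.
        pass


async def get_provider_impl(config: dict[str, Any], deps: dict[Api, Any]) -> StubAgentsImpl:
    impl = StubAgentsImpl(config, deps)
    await impl.initialize()
    return impl
```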
llama_stack/providers/inline/agents/meta_reference/agents.py

@@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+from collections.abc import AsyncIterator
 
 from llama_stack.core.datatypes import AccessRule
 from llama_stack.core.storage.kvstore import InmemoryKVStoreImpl, kvstore_impl
@@ -11,21 +12,21 @@ from llama_stack.log import get_logger
 from llama_stack.providers.utils.responses.responses_store import ResponsesStore
 from llama_stack_api import (
     Agents,
+    Connectors,
     Conversations,
+    CreateResponseRequest,
+    DeleteResponseRequest,
     Files,
     Inference,
     ListOpenAIResponseInputItem,
     ListOpenAIResponseObject,
+    ListResponseInputItemsRequest,
+    ListResponsesRequest,
     OpenAIDeleteResponseObject,
-    OpenAIResponseInput,
-    OpenAIResponseInputTool,
-    OpenAIResponseInputToolChoice,
     OpenAIResponseObject,
-    OpenAIResponsePrompt,
-    OpenAIResponseText,
-    Order,
+    OpenAIResponseObjectStream,
     Prompts,
-    ResponseGuardrail,
+    RetrieveResponseRequest,
     Safety,
     ToolGroups,
     ToolRuntime,
@@ -50,6 +51,7 @@ class MetaReferenceAgentsImpl(Agents):
         conversations_api: Conversations,
         prompts_api: Prompts,
         files_api: Files,
+        connectors_api: Connectors,
         policy: list[AccessRule],
     ):
         self.config = config
@@ -64,6 +66,7 @@ class MetaReferenceAgentsImpl(Agents):
         self.in_memory_store = InmemoryKVStoreImpl()
         self.openai_responses_impl: OpenAIResponsesImpl | None = None
         self.policy = policy
+        self.connectors_api = connectors_api
 
     async def initialize(self) -> None:
         self.persistence_store = await kvstore_impl(self.config.persistence.agent_state)
@@ -80,6 +83,7 @@ class MetaReferenceAgentsImpl(Agents):
             prompts_api=self.prompts_api,
             files_api=self.files_api,
             vector_stores_config=self.config.vector_stores_config,
+            connectors_api=self.connectors_api,
         )
 
     async def shutdown(self) -> None:
@@ -88,79 +92,72 @@ class MetaReferenceAgentsImpl(Agents):
     # OpenAI responses
     async def get_openai_response(
         self,
-        response_id: str,
+        request: RetrieveResponseRequest,
     ) -> OpenAIResponseObject:
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
-        return await self.openai_responses_impl.get_openai_response(response_id)
+        return await self.openai_responses_impl.get_openai_response(request.response_id)
 
     async def create_openai_response(
         self,
-        input: str | list[OpenAIResponseInput],
-        model: str,
-        prompt: OpenAIResponsePrompt | None = None,
-        instructions: str | None = None,
-        parallel_tool_calls: bool | None = True,
-        previous_response_id: str | None = None,
-        conversation: str | None = None,
-        store: bool | None = True,
-        stream: bool | None = False,
-        temperature: float | None = None,
-        text: OpenAIResponseText | None = None,
-        tool_choice: OpenAIResponseInputToolChoice | None = None,
-        tools: list[OpenAIResponseInputTool] | None = None,
-        include: list[str] | None = None,
-        max_infer_iters: int | None = 10,
-        guardrails: list[ResponseGuardrail] | None = None,
-        max_tool_calls: int | None = None,
-        metadata: dict[str, str] | None = None,
-    ) -> OpenAIResponseObject:
+        request: CreateResponseRequest,
+    ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
+        """Create an OpenAI response.
+
+        Returns either a single response object (non-streaming) or an async iterator
+        yielding response stream events (streaming).
+        """
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
         result = await self.openai_responses_impl.create_openai_response(
-            input,
-            model,
-            prompt,
-            instructions,
-            previous_response_id,
-            conversation,
-            store,
-            stream,
-            temperature,
-            text,
-            tool_choice,
-            tools,
-            include,
-            max_infer_iters,
-            guardrails,
-            parallel_tool_calls,
-            max_tool_calls,
-            metadata,
+            request.input,
+            request.model,
+            request.prompt,
+            request.instructions,
+            request.previous_response_id,
+            request.conversation,
+            request.store,
+            request.stream,
+            request.temperature,
+            request.text,
+            request.tool_choice,
+            request.tools,
+            request.include,
+            request.max_infer_iters,
+            request.guardrails,
+            request.parallel_tool_calls,
+            request.max_tool_calls,
+            request.max_output_tokens,
+            request.reasoning,
+            request.safety_identifier,
+            request.metadata,
        )
-        return result  # type: ignore[no-any-return]
+        return result
 
     async def list_openai_responses(
         self,
-        after: str | None = None,
-        limit: int | None = 50,
-        model: str | None = None,
-        order: Order | None = Order.desc,
+        request: ListResponsesRequest,
     ) -> ListOpenAIResponseObject:
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
-        return await self.openai_responses_impl.list_openai_responses(after, limit, model, order)
+        return await self.openai_responses_impl.list_openai_responses(
+            request.after, request.limit, request.model, request.order
+        )
 
     async def list_openai_response_input_items(
         self,
-        response_id: str,
-        after: str | None = None,
-        before: str | None = None,
-        include: list[str] | None = None,
-        limit: int | None = 20,
-        order: Order | None = Order.desc,
+        request: ListResponseInputItemsRequest,
     ) -> ListOpenAIResponseInputItem:
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
         return await self.openai_responses_impl.list_openai_response_input_items(
-            response_id, after, before, include, limit, order
+            request.response_id,
+            request.after,
+            request.before,
+            request.include,
+            request.limit,
+            request.order,
        )
 
-    async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
+    async def delete_openai_response(
+        self,
+        request: DeleteResponseRequest,
+    ) -> OpenAIDeleteResponseObject:
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
-        return await self.openai_responses_impl.delete_openai_response(response_id)
+        return await self.openai_responses_impl.delete_openai_response(request.response_id)
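The agents.py changes above collapse wide keyword signatures into single request objects (CreateResponseRequest, ListResponsesRequest, and so on) and unpack them before delegating. Here is a minimal sketch of that pattern, assuming hypothetical Pydantic models rather than the real llama_stack_api types.

```python
# Hedged sketch of the request-object facade pattern; models are illustrative stand-ins.
from typing import Any

from pydantic import BaseModel


class RetrieveResponseRequest(BaseModel):
    response_id: str


class ListResponsesRequest(BaseModel):
    after: str | None = None
    limit: int | None = 50
    model: str | None = None
    order: str | None = "desc"


class ResponsesFacade:
    """Thin facade that validates one typed request object, then unpacks it for the inner impl."""

    def __init__(self, impl: Any) -> None:
        self._impl = impl

    async def get_openai_response(self, request: RetrieveResponseRequest) -> Any:
        # The wire layer deals in a single object; the inner impl keeps its flat signature.
        return await self._impl.get_openai_response(request.response_id)

    async def list_openai_responses(self, request: ListResponsesRequest) -> Any:
        return await self._impl.list_openai_responses(
            request.after, request.limit, request.model, request.order
        )
```

One practical benefit of this shape is that adding a field (as 0.5.0 does with `max_output_tokens`, `reasoning`, and `safety_identifier`) extends the request model without changing every method signature along the call chain.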
llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py

@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-import asyncio
 import re
 import time
 import uuid
@@ -19,11 +18,14 @@ from llama_stack.providers.utils.responses.responses_store import (
 )
 from llama_stack.providers.utils.tools.mcp import MCPSessionManager
 from llama_stack_api import (
+    AddItemsRequest,
+    Connectors,
     ConversationItem,
     Conversations,
     Files,
     Inference,
     InvalidConversationIdError,
+    ListItemsRequest,
     ListOpenAIResponseInputItem,
     ListOpenAIResponseObject,
     OpenAIChatCompletionContentPartParam,
@@ -39,6 +41,7 @@ from llama_stack_api import (
     OpenAIResponseObject,
     OpenAIResponseObjectStream,
     OpenAIResponsePrompt,
+    OpenAIResponseReasoning,
     OpenAIResponseText,
     OpenAIResponseTextFormat,
     OpenAISystemMessageParam,
@@ -83,6 +86,7 @@ class OpenAIResponsesImpl:
         conversations_api: Conversations,
         prompts_api: Prompts,
         files_api: Files,
+        connectors_api: Connectors,
         vector_stores_config=None,
     ):
         self.inference_api = inference_api
@@ -100,6 +104,7 @@ class OpenAIResponsesImpl:
         )
         self.prompts_api = prompts_api
         self.files_api = files_api
+        self.connectors_api = connectors_api
 
     async def _prepend_previous_response(
         self,
@@ -150,7 +155,9 @@ class OpenAIResponsesImpl:
 
             tool_context.recover_tools_from_previous_response(previous_response)
         elif conversation is not None:
-            conversation_items = await self.conversations_api.list_items(conversation, order="asc")
+            conversation_items = await self.conversations_api.list_items(
+                ListItemsRequest(conversation_id=conversation, order="asc")
+            )
 
             # Use stored messages as source of truth (like previous_response.messages)
             stored_messages = await self.responses_store.get_conversation_messages(conversation)
@@ -462,6 +469,9 @@ class OpenAIResponsesImpl:
         guardrails: list[str | ResponseGuardrailSpec] | None = None,
         parallel_tool_calls: bool | None = None,
         max_tool_calls: int | None = None,
+        reasoning: OpenAIResponseReasoning | None = None,
+        max_output_tokens: int | None = None,
+        safety_identifier: str | None = None,
         metadata: dict[str, str] | None = None,
     ):
         stream = bool(stream)
@@ -499,9 +509,6 @@ class OpenAIResponsesImpl:
             if not conversation.startswith("conv_"):
                 raise InvalidConversationIdError(conversation)
 
-        if max_tool_calls is not None and max_tool_calls < 1:
-            raise ValueError(f"Invalid {max_tool_calls=}; should be >= 1")
-
         stream_gen = self._create_streaming_response(
             input=input,
             conversation=conversation,
@@ -518,6 +525,9 @@
             guardrail_ids=guardrail_ids,
             parallel_tool_calls=parallel_tool_calls,
             max_tool_calls=max_tool_calls,
+            reasoning=reasoning,
+            max_output_tokens=max_output_tokens,
+            safety_identifier=safety_identifier,
             metadata=metadata,
             include=include,
         )
@@ -573,6 +583,9 @@
         guardrail_ids: list[str] | None = None,
         parallel_tool_calls: bool | None = True,
         max_tool_calls: int | None = None,
+        reasoning: OpenAIResponseReasoning | None = None,
+        max_output_tokens: int | None = None,
+        safety_identifier: str | None = None,
         metadata: dict[str, str] | None = None,
         include: list[ResponseItemInclude] | None = None,
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
@@ -612,46 +625,45 @@
 
         # Create a per-request MCP session manager for session reuse (fix for #4452)
         # This avoids redundant tools/list calls when making multiple MCP tool invocations
-        mcp_session_manager = MCPSessionManager()
-
-        # Create a per-request ToolExecutor with the session manager
-        request_tool_executor = ToolExecutor(
-            tool_groups_api=self.tool_groups_api,
-            tool_runtime_api=self.tool_runtime_api,
-            vector_io_api=self.vector_io_api,
-            vector_stores_config=self.tool_executor.vector_stores_config,
-            mcp_session_manager=mcp_session_manager,
-        )
+        async with MCPSessionManager() as mcp_session_manager:
+            request_tool_executor = ToolExecutor(
+                tool_groups_api=self.tool_groups_api,
+                tool_runtime_api=self.tool_runtime_api,
+                vector_io_api=self.vector_io_api,
+                vector_stores_config=self.tool_executor.vector_stores_config,
+                mcp_session_manager=mcp_session_manager,
+            )
 
-        orchestrator = StreamingResponseOrchestrator(
-            inference_api=self.inference_api,
-            ctx=ctx,
-            response_id=response_id,
-            created_at=created_at,
-            prompt=prompt,
-            text=text,
-            max_infer_iters=max_infer_iters,
-            parallel_tool_calls=parallel_tool_calls,
-            tool_executor=request_tool_executor,
-            safety_api=self.safety_api,
-            guardrail_ids=guardrail_ids,
-            instructions=instructions,
-            max_tool_calls=max_tool_calls,
-            metadata=metadata,
-            include=include,
-        )
+            orchestrator = StreamingResponseOrchestrator(
+                inference_api=self.inference_api,
+                ctx=ctx,
+                response_id=response_id,
+                created_at=created_at,
+                prompt=prompt,
+                text=text,
+                max_infer_iters=max_infer_iters,
+                parallel_tool_calls=parallel_tool_calls,
+                tool_executor=request_tool_executor,
+                safety_api=self.safety_api,
+                connectors_api=self.connectors_api,
+                guardrail_ids=guardrail_ids,
+                instructions=instructions,
+                max_tool_calls=max_tool_calls,
+                reasoning=reasoning,
+                max_output_tokens=max_output_tokens,
+                safety_identifier=safety_identifier,
+                metadata=metadata,
+                include=include,
+                store=store,
+            )
 
-        # Stream the response
-        final_response = None
-        failed_response = None
+            final_response = None
+            failed_response = None
 
-        # Type as ConversationItem to avoid list invariance issues
-        output_items: list[ConversationItem] = []
+            output_items: list[ConversationItem] = []
 
-        # Prepare input items for storage once (used by all persistence calls)
-        input_items_for_storage = self._prepare_input_items_for_storage(all_input)
+            input_items_for_storage = self._prepare_input_items_for_storage(all_input)
 
-        try:
             async for stream_chunk in orchestrator.create_response():
                 match stream_chunk.type:
                     case "response.completed" | "response.incomplete":
@@ -689,16 +701,6 @@ class OpenAIResponsesImpl:
                             await self.responses_store.store_conversation_messages(conversation, messages_to_store)
 
                 yield stream_chunk
-        finally:
-            # Clean up MCP sessions at the end of the request (fix for #4452)
-            # Use shield() to prevent cancellation from interrupting cleanup and leaking resources
-            # Wrap in try/except as cleanup errors should not mask the original response
-            try:
-                await asyncio.shield(mcp_session_manager.close_all())
-            except BaseException as e:
-                # Debug level - cleanup errors are expected in streaming scenarios where
-                # anyio cancel scopes may be in a different task context
-                logger.debug(f"Error during MCP session cleanup: {e}")
 
     async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
         return await self.responses_store.delete_response_object(response_id)
@@ -721,4 +723,4 @@
 
         adapter = TypeAdapter(list[ConversationItem])
         validated_items = adapter.validate_python(conversation_items)
-        await self.conversations_api.add_items(conversation_id, validated_items)
+        await self.conversations_api.add_items(conversation_id, AddItemsRequest(items=validated_items))
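The openai_responses.py hunks above move MCP session cleanup from a hand-written try/finally with asyncio.shield into an `async with MCPSessionManager()` block, so cleanup runs on every exit path of the streaming generator. A self-contained sketch of that idea follows; `DemoSessionManager` is a hypothetical stand-in, not the real MCPSessionManager.

```python
import asyncio


class DemoSessionManager:
    """Hypothetical per-request session pool illustrating the async context-manager shape."""

    def __init__(self) -> None:
        self.sessions: dict[str, object] = {}

    async def get_session(self, server_url: str) -> object:
        # Reuse one session per server URL to avoid repeated tools/list round-trips.
        if server_url not in self.sessions:
            self.sessions[server_url] = object()  # stand-in for an MCP client session
        return self.sessions[server_url]

    async def close_all(self) -> None:
        self.sessions.clear()

    async def __aenter__(self) -> "DemoSessionManager":
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        # Cleanup runs on normal exit, exceptions, and generator close alike,
        # which is what the removed try/finally block had to arrange by hand.
        await asyncio.shield(self.close_all())


async def handle_request() -> None:
    async with DemoSessionManager() as sessions:
        a = await sessions.get_session("http://mcp.example/sse")
        b = await sessions.get_session("http://mcp.example/sse")
        print(a is b)  # True: the second call reuses the pooled session


asyncio.run(handle_request())
```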
llama_stack/providers/inline/agents/meta_reference/responses/streaming.py

@@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+import time
 import uuid
 from collections.abc import AsyncIterator
 from typing import Any
@@ -16,6 +17,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import interleaved_con
 from llama_stack_api import (
     AllowedToolsFilter,
     ApprovalFilter,
+    Connectors,
     Inference,
     MCPListToolsTool,
     ModelNotFoundError,
@@ -30,6 +32,7 @@ from llama_stack_api import (
     OpenAIChatCompletionToolChoiceFunctionTool,
     OpenAIChoice,
     OpenAIChoiceLogprobs,
+    OpenAIFinishReason,
     OpenAIMessageParam,
     OpenAIResponseContentPartOutputText,
     OpenAIResponseContentPartReasoningText,
@@ -77,6 +80,7 @@ from llama_stack_api import (
     OpenAIResponseOutputMessageMCPListTools,
     OpenAIResponseOutputMessageWebSearchToolCall,
     OpenAIResponsePrompt,
+    OpenAIResponseReasoning,
     OpenAIResponseText,
     OpenAIResponseUsage,
     OpenAIResponseUsageInputTokensDetails,
@@ -133,11 +137,16 @@ class StreamingResponseOrchestrator:
         instructions: str | None,
         safety_api: Safety | None,
         guardrail_ids: list[str] | None = None,
+        connectors_api: Connectors | None = None,
         prompt: OpenAIResponsePrompt | None = None,
         parallel_tool_calls: bool | None = None,
         max_tool_calls: int | None = None,
+        reasoning: OpenAIResponseReasoning | None = None,
+        max_output_tokens: int | None = None,
+        safety_identifier: str | None = None,
         metadata: dict[str, str] | None = None,
         include: list[ResponseItemInclude] | None = None,
+        store: bool | None = True,
     ):
         self.inference_api = inference_api
         self.ctx = ctx
@@ -147,6 +156,7 @@ class StreamingResponseOrchestrator:
         self.max_infer_iters = max_infer_iters
         self.tool_executor = tool_executor
         self.safety_api = safety_api
+        self.connectors_api = connectors_api
         self.guardrail_ids = guardrail_ids or []
         self.prompt = prompt
         # System message that is inserted into the model's context
@@ -155,8 +165,14 @@
         self.parallel_tool_calls = parallel_tool_calls
         # Max number of total calls to built-in tools that can be processed in a response
         self.max_tool_calls = max_tool_calls
+        self.reasoning = reasoning
+        # An upper bound for the number of tokens that can be generated for a response
+        self.max_output_tokens = max_output_tokens
+        self.safety_identifier = safety_identifier
         self.metadata = metadata
+        self.store = store
         self.include = include
+        self.store = bool(store) if store is not None else True
         self.sequence_number = 0
         # Store MCP tool mapping that gets built during tool processing
         self.mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] = (
@@ -179,6 +195,8 @@
         self.violation_detected = False
         # Track total calls made to built-in tools
         self.accumulated_builtin_tool_calls = 0
+        # Track total output tokens generated across inference calls
+        self.accumulated_builtin_output_tokens = 0
 
     async def _create_refusal_response(self, violation_message: str) -> OpenAIResponseObjectStream:
         """Create a refusal response to replace streaming content."""
@@ -191,7 +209,10 @@
             model=self.ctx.model,
             status="completed",
             output=[OpenAIResponseMessage(role="assistant", content=[refusal_content], type="message")],
+            max_output_tokens=self.max_output_tokens,
+            safety_identifier=self.safety_identifier,
             metadata=self.metadata,
+            store=self.store,
         )
 
         return OpenAIResponseObjectStreamResponseCompleted(response=refusal_response)
@@ -212,8 +233,10 @@
         *,
         error: OpenAIResponseError | None = None,
     ) -> OpenAIResponseObject:
+        completed_at = int(time.time()) if status == "completed" else None
         return OpenAIResponseObject(
             created_at=self.created_at,
+            completed_at=completed_at,
             id=self.response_id,
             model=self.ctx.model,
             object="response",
@@ -228,7 +251,11 @@
             prompt=self.prompt,
             parallel_tool_calls=self.parallel_tool_calls,
             max_tool_calls=self.max_tool_calls,
+            reasoning=self.reasoning,
+            max_output_tokens=self.max_output_tokens,
+            safety_identifier=self.safety_identifier,
             metadata=self.metadata,
+            store=self.store,
         )
 
     async def create_response(self) -> AsyncIterator[OpenAIResponseObjectStream]:
@@ -292,6 +319,22 @@
 
         try:
             while True:
+                if (
+                    self.max_output_tokens is not None
+                    and self.accumulated_builtin_output_tokens >= self.max_output_tokens
+                ):
+                    logger.info(
+                        "Skipping inference call since max_output_tokens reached: "
+                        f"{self.accumulated_builtin_output_tokens}/{self.max_output_tokens}"
+                    )
+                    final_status = "incomplete"
+                    break
+
+                remaining_output_tokens = (
+                    self.max_output_tokens - self.accumulated_builtin_output_tokens
+                    if self.max_output_tokens is not None
+                    else None
+                )
                 # Text is the default response format for chat completion so don't need to pass it
                 # (some providers don't support non-empty response_format when tools are present)
                 response_format = (
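The hunk above turns max_output_tokens into a running budget: before each pass through the inference loop the orchestrator stops with an incomplete status once the accumulated completion tokens reach the cap, and otherwise computes the remaining allowance (which the following hunks pass down to the chat completion as max_completion_tokens). A simplified, hypothetical version of that control flow:

```python
import asyncio
from dataclasses import dataclass


@dataclass
class FakeUsage:
    completion_tokens: int


async def fake_inference(max_completion_tokens: int | None) -> FakeUsage:
    # Stand-in for a chat-completion call; pretend each turn wants 40 tokens,
    # clipped to whatever budget remains.
    used = 40 if max_completion_tokens is None else min(40, max_completion_tokens)
    return FakeUsage(completion_tokens=used)


async def run_turns(max_output_tokens: int | None, turns: int) -> tuple[str, int]:
    accumulated = 0
    status = "completed"
    for _ in range(turns):
        if max_output_tokens is not None and accumulated >= max_output_tokens:
            # Budget exhausted before this turn: stop and mark the response incomplete.
            status = "incomplete"
            break
        remaining = None if max_output_tokens is None else max_output_tokens - accumulated
        usage = await fake_inference(max_completion_tokens=remaining)
        accumulated += usage.completion_tokens
    return status, accumulated


print(asyncio.run(run_turns(max_output_tokens=100, turns=5)))  # ('incomplete', 100)
```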
@@ -311,6 +354,11 @@
                     True if self.include and ResponseItemInclude.message_output_text_logprobs in self.include else None
                 )
 
+                # In OpenAI, parallel_tool_calls is only allowed when 'tools' are specified.
+                effective_parallel_tool_calls = (
+                    self.parallel_tool_calls if effective_tools is not None and len(effective_tools) > 0 else None
+                )
+
                 params = OpenAIChatCompletionRequestWithExtraBody(
                     model=self.ctx.model,
                     messages=messages,
@@ -324,6 +372,10 @@
                         "include_usage": True,
                     },
                     logprobs=logprobs,
+                    parallel_tool_calls=effective_parallel_tool_calls,
+                    reasoning_effort=self.reasoning.effort if self.reasoning else None,
+                    safety_identifier=self.safety_identifier,
+                    max_completion_tokens=remaining_output_tokens,
                 )
                 completion_result = await self.inference_api.openai_chat_completion(params)
 
@@ -480,23 +532,24 @@
         if not chunk.usage:
             return
 
+        self.accumulated_builtin_output_tokens += chunk.usage.completion_tokens
+
         if self.accumulated_usage is None:
             # Convert from chat completion format to response format
             self.accumulated_usage = OpenAIResponseUsage(
                 input_tokens=chunk.usage.prompt_tokens,
                 output_tokens=chunk.usage.completion_tokens,
                 total_tokens=chunk.usage.total_tokens,
-                input_tokens_details=(
-                    OpenAIResponseUsageInputTokensDetails(cached_tokens=chunk.usage.prompt_tokens_details.cached_tokens)
-                    if chunk.usage.prompt_tokens_details
-                    else None
+                input_tokens_details=OpenAIResponseUsageInputTokensDetails(
+                    cached_tokens=chunk.usage.prompt_tokens_details.cached_tokens
+                    if chunk.usage.prompt_tokens_details and chunk.usage.prompt_tokens_details.cached_tokens is not None
+                    else 0
                 ),
-                output_tokens_details=(
-                    OpenAIResponseUsageOutputTokensDetails(
-                        reasoning_tokens=chunk.usage.completion_tokens_details.reasoning_tokens
-                    )
+                output_tokens_details=OpenAIResponseUsageOutputTokensDetails(
+                    reasoning_tokens=chunk.usage.completion_tokens_details.reasoning_tokens
                     if chunk.usage.completion_tokens_details
-                    else None
+                    and chunk.usage.completion_tokens_details.reasoning_tokens is not None
+                    else 0
                 ),
             )
         else:
@@ -506,17 +559,16 @@
                 output_tokens=self.accumulated_usage.output_tokens + chunk.usage.completion_tokens,
                 total_tokens=self.accumulated_usage.total_tokens + chunk.usage.total_tokens,
                 # Use latest non-null details
-                input_tokens_details=(
-                    OpenAIResponseUsageInputTokensDetails(cached_tokens=chunk.usage.prompt_tokens_details.cached_tokens)
-                    if chunk.usage.prompt_tokens_details
-                    else self.accumulated_usage.input_tokens_details
+                input_tokens_details=OpenAIResponseUsageInputTokensDetails(
+                    cached_tokens=chunk.usage.prompt_tokens_details.cached_tokens
+                    if chunk.usage.prompt_tokens_details and chunk.usage.prompt_tokens_details.cached_tokens is not None
+                    else self.accumulated_usage.input_tokens_details.cached_tokens
                 ),
-                output_tokens_details=(
-                    OpenAIResponseUsageOutputTokensDetails(
-                        reasoning_tokens=chunk.usage.completion_tokens_details.reasoning_tokens
-                    )
+                output_tokens_details=OpenAIResponseUsageOutputTokensDetails(
+                    reasoning_tokens=chunk.usage.completion_tokens_details.reasoning_tokens
                     if chunk.usage.completion_tokens_details
-                    else self.accumulated_usage.output_tokens_details
+                    and chunk.usage.completion_tokens_details.reasoning_tokens is not None
+                    else self.accumulated_usage.output_tokens_details.reasoning_tokens
                 ),
             )
 
@@ -652,7 +704,7 @@
         chat_response_tool_calls: dict[int, OpenAIChatCompletionToolCall] = {}
         chunk_created = 0
         chunk_model = ""
-        chunk_finish_reason = ""
+        chunk_finish_reason: OpenAIFinishReason = "stop"
         chat_response_logprobs = []
 
         # Create a placeholder message item for delta events
@@ -744,9 +796,9 @@
                     chunk_finish_reason = chunk_choice.finish_reason
 
                 # Handle reasoning content if present (non-standard field for o1/o3 models)
-                if hasattr(chunk_choice.delta, "reasoning_content") and chunk_choice.delta.reasoning_content:
+                if hasattr(chunk_choice.delta, "reasoning") and chunk_choice.delta.reasoning:
                     async for event in self._handle_reasoning_content_chunk(
-                        reasoning_content=chunk_choice.delta.reasoning_content,
+                        reasoning_content=chunk_choice.delta.reasoning,
                         reasoning_part_emitted=reasoning_part_emitted,
                         reasoning_content_index=reasoning_content_index,
                         message_item_id=message_item_id,
@@ -758,7 +810,7 @@
                         else:
                             yield event
                     reasoning_part_emitted = True
-                    reasoning_text_accumulated.append(chunk_choice.delta.reasoning_content)
+                    reasoning_text_accumulated.append(chunk_choice.delta.reasoning)
 
                 # Handle refusal content if present
                 if chunk_choice.delta.refusal:
@@ -1175,6 +1227,9 @@
         """Process an MCP tool configuration and emit appropriate streaming events."""
         from llama_stack.providers.utils.tools.mcp import list_mcp_tools
 
+        # Resolve connector_id to server_url if provided
+        mcp_tool = await resolve_mcp_connector_id(mcp_tool, self.connectors_api)
+
         # Emit mcp_list_tools.in_progress
         self.sequence_number += 1
         yield OpenAIResponseObjectStreamResponseMcpListToolsInProgress(
@@ -1489,3 +1544,25 @@ async def _process_tool_choice(
         tools=tool_choice,
         mode="required",
     )
+
+
+async def resolve_mcp_connector_id(
+    mcp_tool: OpenAIResponseInputToolMCP,
+    connectors_api: Connectors,
+) -> OpenAIResponseInputToolMCP:
+    """Resolve connector_id to server_url for an MCP tool.
+
+    If the mcp_tool has a connector_id but no server_url, this function
+    looks up the connector and populates the server_url from it.
+
+    Args:
+        mcp_tool: The MCP tool configuration to resolve
+        connectors_api: The connectors API for looking up connectors
+
+    Returns:
+        The mcp_tool with server_url populated (may be same instance if already set)
+    """
+    if mcp_tool.connector_id and not mcp_tool.server_url:
+        connector = await connectors_api.get_connector(mcp_tool.connector_id)
+        return mcp_tool.model_copy(update={"server_url": connector.url})
+    return mcp_tool
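To show how a connector lookup like the new `resolve_mcp_connector_id` helper is meant to behave, here is a hedged, self-contained sketch: a tool that names a connector_id but no server_url gets the URL filled in from a connectors lookup, while a tool that already has a server_url passes through unchanged. The models and registry below are illustrative, not the llama_stack_api ones.

```python
import asyncio

from pydantic import BaseModel


class MCPToolConfig(BaseModel):
    connector_id: str | None = None
    server_url: str | None = None


class Connector(BaseModel):
    id: str
    url: str


class InMemoryConnectors:
    """Hypothetical connectors registry keyed by connector id."""

    def __init__(self, connectors: list[Connector]) -> None:
        self._by_id = {c.id: c for c in connectors}

    async def get_connector(self, connector_id: str) -> Connector:
        return self._by_id[connector_id]


async def resolve(tool: MCPToolConfig, connectors: InMemoryConnectors) -> MCPToolConfig:
    if tool.connector_id and not tool.server_url:
        connector = await connectors.get_connector(tool.connector_id)
        # model_copy leaves the original untouched and returns an updated copy.
        return tool.model_copy(update={"server_url": connector.url})
    return tool


async def main() -> None:
    registry = InMemoryConnectors([Connector(id="conn_github", url="https://mcp.example/github")])
    tool = MCPToolConfig(connector_id="conn_github")
    resolved = await resolve(tool, registry)
    print(resolved.server_url)  # https://mcp.example/github


asyncio.run(main())
```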