llama-stack 0.4.4__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. llama_stack/cli/stack/_list_deps.py +11 -7
  2. llama_stack/cli/stack/run.py +3 -25
  3. llama_stack/core/access_control/datatypes.py +78 -0
  4. llama_stack/core/configure.py +2 -2
  5. llama_stack/{distributions/meta-reference-gpu → core/connectors}/__init__.py +3 -1
  6. llama_stack/core/connectors/connectors.py +162 -0
  7. llama_stack/core/conversations/conversations.py +61 -58
  8. llama_stack/core/datatypes.py +54 -8
  9. llama_stack/core/library_client.py +60 -13
  10. llama_stack/core/prompts/prompts.py +43 -42
  11. llama_stack/core/routers/datasets.py +20 -17
  12. llama_stack/core/routers/eval_scoring.py +143 -53
  13. llama_stack/core/routers/inference.py +20 -9
  14. llama_stack/core/routers/safety.py +30 -42
  15. llama_stack/core/routers/vector_io.py +15 -7
  16. llama_stack/core/routing_tables/models.py +42 -3
  17. llama_stack/core/routing_tables/scoring_functions.py +19 -19
  18. llama_stack/core/routing_tables/shields.py +20 -17
  19. llama_stack/core/routing_tables/vector_stores.py +8 -5
  20. llama_stack/core/server/auth.py +192 -17
  21. llama_stack/core/server/fastapi_router_registry.py +40 -5
  22. llama_stack/core/server/server.py +24 -5
  23. llama_stack/core/stack.py +54 -10
  24. llama_stack/core/storage/datatypes.py +9 -0
  25. llama_stack/core/store/registry.py +1 -1
  26. llama_stack/core/utils/exec.py +2 -2
  27. llama_stack/core/utils/type_inspection.py +16 -2
  28. llama_stack/distributions/dell/config.yaml +4 -1
  29. llama_stack/distributions/dell/run-with-safety.yaml +4 -1
  30. llama_stack/distributions/nvidia/config.yaml +4 -1
  31. llama_stack/distributions/nvidia/run-with-safety.yaml +4 -1
  32. llama_stack/distributions/oci/config.yaml +4 -1
  33. llama_stack/distributions/open-benchmark/config.yaml +9 -1
  34. llama_stack/distributions/postgres-demo/config.yaml +1 -1
  35. llama_stack/distributions/starter/build.yaml +62 -0
  36. llama_stack/distributions/starter/config.yaml +22 -3
  37. llama_stack/distributions/starter/run-with-postgres-store.yaml +22 -3
  38. llama_stack/distributions/starter/starter.py +13 -1
  39. llama_stack/distributions/starter-gpu/build.yaml +62 -0
  40. llama_stack/distributions/starter-gpu/config.yaml +22 -3
  41. llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +22 -3
  42. llama_stack/distributions/template.py +10 -2
  43. llama_stack/distributions/watsonx/config.yaml +4 -1
  44. llama_stack/log.py +1 -0
  45. llama_stack/providers/inline/agents/meta_reference/__init__.py +1 -0
  46. llama_stack/providers/inline/agents/meta_reference/agents.py +57 -61
  47. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +49 -51
  48. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +94 -22
  49. llama_stack/providers/inline/agents/meta_reference/responses/types.py +2 -1
  50. llama_stack/providers/inline/agents/meta_reference/responses/utils.py +4 -1
  51. llama_stack/providers/inline/agents/meta_reference/safety.py +2 -2
  52. llama_stack/providers/inline/batches/reference/batches.py +2 -1
  53. llama_stack/providers/inline/eval/meta_reference/eval.py +40 -32
  54. llama_stack/providers/inline/post_training/huggingface/post_training.py +33 -38
  55. llama_stack/providers/inline/post_training/huggingface/utils.py +2 -5
  56. llama_stack/providers/inline/post_training/torchtune/post_training.py +28 -33
  57. llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +2 -4
  58. llama_stack/providers/inline/safety/code_scanner/code_scanner.py +12 -15
  59. llama_stack/providers/inline/safety/llama_guard/llama_guard.py +15 -18
  60. llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +11 -17
  61. llama_stack/providers/inline/scoring/basic/scoring.py +13 -17
  62. llama_stack/providers/inline/scoring/braintrust/braintrust.py +15 -15
  63. llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +13 -17
  64. llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +1 -1
  65. llama_stack/providers/registry/agents.py +1 -0
  66. llama_stack/providers/registry/inference.py +1 -9
  67. llama_stack/providers/registry/vector_io.py +136 -16
  68. llama_stack/providers/remote/eval/nvidia/eval.py +22 -21
  69. llama_stack/providers/remote/files/s3/config.py +5 -3
  70. llama_stack/providers/remote/files/s3/files.py +2 -2
  71. llama_stack/providers/remote/inference/gemini/gemini.py +4 -0
  72. llama_stack/providers/remote/inference/openai/openai.py +2 -0
  73. llama_stack/providers/remote/inference/together/together.py +4 -0
  74. llama_stack/providers/remote/inference/vertexai/config.py +3 -3
  75. llama_stack/providers/remote/inference/vertexai/vertexai.py +5 -2
  76. llama_stack/providers/remote/inference/vllm/config.py +37 -18
  77. llama_stack/providers/remote/inference/vllm/vllm.py +0 -3
  78. llama_stack/providers/remote/inference/watsonx/watsonx.py +4 -0
  79. llama_stack/providers/remote/post_training/nvidia/post_training.py +31 -33
  80. llama_stack/providers/remote/safety/bedrock/bedrock.py +10 -27
  81. llama_stack/providers/remote/safety/nvidia/nvidia.py +9 -25
  82. llama_stack/providers/remote/safety/sambanova/sambanova.py +13 -11
  83. llama_stack/providers/remote/vector_io/elasticsearch/__init__.py +17 -0
  84. llama_stack/providers/remote/vector_io/elasticsearch/config.py +32 -0
  85. llama_stack/providers/remote/vector_io/elasticsearch/elasticsearch.py +463 -0
  86. llama_stack/providers/remote/vector_io/oci/__init__.py +22 -0
  87. llama_stack/providers/remote/vector_io/oci/config.py +41 -0
  88. llama_stack/providers/remote/vector_io/oci/oci26ai.py +595 -0
  89. llama_stack/providers/remote/vector_io/pgvector/config.py +69 -2
  90. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +255 -6
  91. llama_stack/providers/remote/vector_io/qdrant/qdrant.py +62 -38
  92. llama_stack/providers/utils/bedrock/client.py +3 -3
  93. llama_stack/providers/utils/bedrock/config.py +7 -7
  94. llama_stack/providers/utils/inference/embedding_mixin.py +4 -0
  95. llama_stack/providers/utils/inference/http_client.py +239 -0
  96. llama_stack/providers/utils/inference/litellm_openai_mixin.py +5 -0
  97. llama_stack/providers/utils/inference/model_registry.py +148 -2
  98. llama_stack/providers/utils/inference/openai_compat.py +2 -1
  99. llama_stack/providers/utils/inference/openai_mixin.py +41 -2
  100. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +92 -5
  101. llama_stack/providers/utils/memory/vector_store.py +46 -19
  102. llama_stack/providers/utils/responses/responses_store.py +7 -7
  103. llama_stack/providers/utils/safety.py +114 -0
  104. llama_stack/providers/utils/tools/mcp.py +44 -3
  105. llama_stack/testing/api_recorder.py +9 -3
  106. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/METADATA +14 -2
  107. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/RECORD +111 -144
  108. llama_stack/distributions/meta-reference-gpu/config.yaml +0 -140
  109. llama_stack/distributions/meta-reference-gpu/doc_template.md +0 -119
  110. llama_stack/distributions/meta-reference-gpu/meta_reference.py +0 -163
  111. llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +0 -155
  112. llama_stack/models/llama/hadamard_utils.py +0 -88
  113. llama_stack/models/llama/llama3/args.py +0 -74
  114. llama_stack/models/llama/llama3/dog.jpg +0 -0
  115. llama_stack/models/llama/llama3/generation.py +0 -378
  116. llama_stack/models/llama/llama3/model.py +0 -304
  117. llama_stack/models/llama/llama3/multimodal/__init__.py +0 -12
  118. llama_stack/models/llama/llama3/multimodal/encoder_utils.py +0 -180
  119. llama_stack/models/llama/llama3/multimodal/image_transform.py +0 -409
  120. llama_stack/models/llama/llama3/multimodal/model.py +0 -1430
  121. llama_stack/models/llama/llama3/multimodal/utils.py +0 -26
  122. llama_stack/models/llama/llama3/pasta.jpeg +0 -0
  123. llama_stack/models/llama/llama3/quantization/__init__.py +0 -5
  124. llama_stack/models/llama/llama3/quantization/loader.py +0 -316
  125. llama_stack/models/llama/llama3_1/__init__.py +0 -12
  126. llama_stack/models/llama/llama3_1/prompt_format.md +0 -358
  127. llama_stack/models/llama/llama3_1/prompts.py +0 -258
  128. llama_stack/models/llama/llama3_2/__init__.py +0 -5
  129. llama_stack/models/llama/llama3_2/prompts_text.py +0 -229
  130. llama_stack/models/llama/llama3_2/prompts_vision.py +0 -126
  131. llama_stack/models/llama/llama3_2/text_prompt_format.md +0 -286
  132. llama_stack/models/llama/llama3_2/vision_prompt_format.md +0 -141
  133. llama_stack/models/llama/llama3_3/__init__.py +0 -5
  134. llama_stack/models/llama/llama3_3/prompts.py +0 -259
  135. llama_stack/models/llama/llama4/args.py +0 -107
  136. llama_stack/models/llama/llama4/ffn.py +0 -58
  137. llama_stack/models/llama/llama4/moe.py +0 -214
  138. llama_stack/models/llama/llama4/preprocess.py +0 -435
  139. llama_stack/models/llama/llama4/quantization/__init__.py +0 -5
  140. llama_stack/models/llama/llama4/quantization/loader.py +0 -226
  141. llama_stack/models/llama/llama4/vision/__init__.py +0 -5
  142. llama_stack/models/llama/llama4/vision/embedding.py +0 -210
  143. llama_stack/models/llama/llama4/vision/encoder.py +0 -412
  144. llama_stack/models/llama/quantize_impls.py +0 -316
  145. llama_stack/providers/inline/inference/meta_reference/__init__.py +0 -20
  146. llama_stack/providers/inline/inference/meta_reference/common.py +0 -24
  147. llama_stack/providers/inline/inference/meta_reference/config.py +0 -68
  148. llama_stack/providers/inline/inference/meta_reference/generators.py +0 -201
  149. llama_stack/providers/inline/inference/meta_reference/inference.py +0 -542
  150. llama_stack/providers/inline/inference/meta_reference/model_parallel.py +0 -77
  151. llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +0 -353
  152. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/WHEEL +0 -0
  153. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/entry_points.txt +0 -0
  154. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
  155. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/top_level.txt +0 -0

llama_stack/providers/inline/agents/meta_reference/__init__.py
@@ -28,6 +28,7 @@ async def get_provider_impl(
  deps[Api.conversations],
  deps[Api.prompts],
  deps[Api.files],
+ deps[Api.connectors],
  policy,
  )
  await impl.initialize()

llama_stack/providers/inline/agents/meta_reference/agents.py
@@ -4,6 +4,7 @@
  # This source code is licensed under the terms described in the LICENSE file in
  # the root directory of this source tree.

+ from collections.abc import AsyncIterator

  from llama_stack.core.datatypes import AccessRule
  from llama_stack.core.storage.kvstore import InmemoryKVStoreImpl, kvstore_impl
@@ -11,21 +12,21 @@ from llama_stack.log import get_logger
  from llama_stack.providers.utils.responses.responses_store import ResponsesStore
  from llama_stack_api import (
  Agents,
+ Connectors,
  Conversations,
+ CreateResponseRequest,
+ DeleteResponseRequest,
  Files,
  Inference,
  ListOpenAIResponseInputItem,
  ListOpenAIResponseObject,
+ ListResponseInputItemsRequest,
+ ListResponsesRequest,
  OpenAIDeleteResponseObject,
- OpenAIResponseInput,
- OpenAIResponseInputTool,
- OpenAIResponseInputToolChoice,
  OpenAIResponseObject,
- OpenAIResponsePrompt,
- OpenAIResponseText,
- Order,
+ OpenAIResponseObjectStream,
  Prompts,
- ResponseGuardrail,
+ RetrieveResponseRequest,
  Safety,
  ToolGroups,
  ToolRuntime,
@@ -50,6 +51,7 @@ class MetaReferenceAgentsImpl(Agents):
  conversations_api: Conversations,
  prompts_api: Prompts,
  files_api: Files,
+ connectors_api: Connectors,
  policy: list[AccessRule],
  ):
  self.config = config
@@ -64,6 +66,7 @@ class MetaReferenceAgentsImpl(Agents):
  self.in_memory_store = InmemoryKVStoreImpl()
  self.openai_responses_impl: OpenAIResponsesImpl | None = None
  self.policy = policy
+ self.connectors_api = connectors_api

  async def initialize(self) -> None:
  self.persistence_store = await kvstore_impl(self.config.persistence.agent_state)
@@ -80,6 +83,7 @@ class MetaReferenceAgentsImpl(Agents):
  prompts_api=self.prompts_api,
  files_api=self.files_api,
  vector_stores_config=self.config.vector_stores_config,
+ connectors_api=self.connectors_api,
  )

  async def shutdown(self) -> None:
@@ -88,79 +92,71 @@ class MetaReferenceAgentsImpl(Agents):
  # OpenAI responses
  async def get_openai_response(
  self,
- response_id: str,
+ request: RetrieveResponseRequest,
  ) -> OpenAIResponseObject:
  assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
- return await self.openai_responses_impl.get_openai_response(response_id)
+ return await self.openai_responses_impl.get_openai_response(request.response_id)

  async def create_openai_response(
  self,
- input: str | list[OpenAIResponseInput],
- model: str,
- prompt: OpenAIResponsePrompt | None = None,
- instructions: str | None = None,
- parallel_tool_calls: bool | None = True,
- previous_response_id: str | None = None,
- conversation: str | None = None,
- store: bool | None = True,
- stream: bool | None = False,
- temperature: float | None = None,
- text: OpenAIResponseText | None = None,
- tool_choice: OpenAIResponseInputToolChoice | None = None,
- tools: list[OpenAIResponseInputTool] | None = None,
- include: list[str] | None = None,
- max_infer_iters: int | None = 10,
- guardrails: list[ResponseGuardrail] | None = None,
- max_tool_calls: int | None = None,
- metadata: dict[str, str] | None = None,
- ) -> OpenAIResponseObject:
+ request: CreateResponseRequest,
+ ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
+ """Create an OpenAI response.
+
+ Returns either a single response object (non-streaming) or an async iterator
+ yielding response stream events (streaming).
+ """
  assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
  result = await self.openai_responses_impl.create_openai_response(
- input,
- model,
- prompt,
- instructions,
- previous_response_id,
- conversation,
- store,
- stream,
- temperature,
- text,
- tool_choice,
- tools,
- include,
- max_infer_iters,
- guardrails,
- parallel_tool_calls,
- max_tool_calls,
- metadata,
+ request.input,
+ request.model,
+ request.prompt,
+ request.instructions,
+ request.previous_response_id,
+ request.conversation,
+ request.store,
+ request.stream,
+ request.temperature,
+ request.text,
+ request.tool_choice,
+ request.tools,
+ request.include,
+ request.max_infer_iters,
+ request.guardrails,
+ request.parallel_tool_calls,
+ request.max_tool_calls,
+ request.max_output_tokens,
+ request.reasoning,
+ request.metadata,
  )
- return result # type: ignore[no-any-return]
+ return result

  async def list_openai_responses(
  self,
- after: str | None = None,
- limit: int | None = 50,
- model: str | None = None,
- order: Order | None = Order.desc,
+ request: ListResponsesRequest,
  ) -> ListOpenAIResponseObject:
  assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
- return await self.openai_responses_impl.list_openai_responses(after, limit, model, order)
+ return await self.openai_responses_impl.list_openai_responses(
+ request.after, request.limit, request.model, request.order
+ )

  async def list_openai_response_input_items(
  self,
- response_id: str,
- after: str | None = None,
- before: str | None = None,
- include: list[str] | None = None,
- limit: int | None = 20,
- order: Order | None = Order.desc,
+ request: ListResponseInputItemsRequest,
  ) -> ListOpenAIResponseInputItem:
  assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
  return await self.openai_responses_impl.list_openai_response_input_items(
- response_id, after, before, include, limit, order
+ request.response_id,
+ request.after,
+ request.before,
+ request.include,
+ request.limit,
+ request.order,
  )

- async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
+ async def delete_openai_response(
+ self,
+ request: DeleteResponseRequest,
+ ) -> OpenAIDeleteResponseObject:
  assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
- return await self.openai_responses_impl.delete_openai_response(response_id)
+ return await self.openai_responses_impl.delete_openai_response(request.response_id)
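
Editor's note: the hunks above collapse the long parameter lists of the Agents responses API into typed request objects (CreateResponseRequest, ListResponsesRequest, ListResponseInputItemsRequest, DeleteResponseRequest, RetrieveResponseRequest). A minimal caller-side sketch follows, assuming only the field names visible in this diff; the constructor defaults, the model id, and the agents_impl object are hypothetical:

from llama_stack_api import CreateResponseRequest

async def demo(agents_impl):
    # Hypothetical usage; agents_impl stands in for an initialized
    # MetaReferenceAgentsImpl, and the field values are placeholders.
    request = CreateResponseRequest(
        model="my-model",
        input="Summarize the release notes",
        stream=False,
        max_output_tokens=512,  # field surfaced by this diff
    )
    return await agents_impl.create_openai_response(request)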

llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
@@ -4,7 +4,6 @@
  # This source code is licensed under the terms described in the LICENSE file in
  # the root directory of this source tree.

- import asyncio
  import re
  import time
  import uuid
@@ -19,11 +18,14 @@ from llama_stack.providers.utils.responses.responses_store import (
  )
  from llama_stack.providers.utils.tools.mcp import MCPSessionManager
  from llama_stack_api import (
+ AddItemsRequest,
+ Connectors,
  ConversationItem,
  Conversations,
  Files,
  Inference,
  InvalidConversationIdError,
+ ListItemsRequest,
  ListOpenAIResponseInputItem,
  ListOpenAIResponseObject,
  OpenAIChatCompletionContentPartParam,
@@ -39,6 +41,7 @@ from llama_stack_api import (
  OpenAIResponseObject,
  OpenAIResponseObjectStream,
  OpenAIResponsePrompt,
+ OpenAIResponseReasoning,
  OpenAIResponseText,
  OpenAIResponseTextFormat,
  OpenAISystemMessageParam,
@@ -83,6 +86,7 @@ class OpenAIResponsesImpl:
  conversations_api: Conversations,
  prompts_api: Prompts,
  files_api: Files,
+ connectors_api: Connectors,
  vector_stores_config=None,
  ):
  self.inference_api = inference_api
@@ -100,6 +104,7 @@ class OpenAIResponsesImpl:
  )
  self.prompts_api = prompts_api
  self.files_api = files_api
+ self.connectors_api = connectors_api

  async def _prepend_previous_response(
  self,
@@ -150,7 +155,9 @@ class OpenAIResponsesImpl:

  tool_context.recover_tools_from_previous_response(previous_response)
  elif conversation is not None:
- conversation_items = await self.conversations_api.list_items(conversation, order="asc")
+ conversation_items = await self.conversations_api.list_items(
+ ListItemsRequest(conversation_id=conversation, order="asc")
+ )

  # Use stored messages as source of truth (like previous_response.messages)
  stored_messages = await self.responses_store.get_conversation_messages(conversation)
@@ -462,6 +469,8 @@ class OpenAIResponsesImpl:
  guardrails: list[str | ResponseGuardrailSpec] | None = None,
  parallel_tool_calls: bool | None = None,
  max_tool_calls: int | None = None,
+ reasoning: OpenAIResponseReasoning | None = None,
+ max_output_tokens: int | None = None,
  metadata: dict[str, str] | None = None,
  ):
  stream = bool(stream)
@@ -499,9 +508,6 @@
  if not conversation.startswith("conv_"):
  raise InvalidConversationIdError(conversation)

- if max_tool_calls is not None and max_tool_calls < 1:
- raise ValueError(f"Invalid {max_tool_calls=}; should be >= 1")
-
  stream_gen = self._create_streaming_response(
  input=input,
  conversation=conversation,
@@ -518,6 +524,8 @@
  guardrail_ids=guardrail_ids,
  parallel_tool_calls=parallel_tool_calls,
  max_tool_calls=max_tool_calls,
+ reasoning=reasoning,
+ max_output_tokens=max_output_tokens,
  metadata=metadata,
  include=include,
  )
@@ -573,6 +581,8 @@
  guardrail_ids: list[str] | None = None,
  parallel_tool_calls: bool | None = True,
  max_tool_calls: int | None = None,
+ reasoning: OpenAIResponseReasoning | None = None,
+ max_output_tokens: int | None = None,
  metadata: dict[str, str] | None = None,
  include: list[ResponseItemInclude] | None = None,
  ) -> AsyncIterator[OpenAIResponseObjectStream]:
@@ -612,46 +622,44 @@

  # Create a per-request MCP session manager for session reuse (fix for #4452)
  # This avoids redundant tools/list calls when making multiple MCP tool invocations
- mcp_session_manager = MCPSessionManager()
-
- # Create a per-request ToolExecutor with the session manager
- request_tool_executor = ToolExecutor(
- tool_groups_api=self.tool_groups_api,
- tool_runtime_api=self.tool_runtime_api,
- vector_io_api=self.vector_io_api,
- vector_stores_config=self.tool_executor.vector_stores_config,
- mcp_session_manager=mcp_session_manager,
- )
+ async with MCPSessionManager() as mcp_session_manager:
+ request_tool_executor = ToolExecutor(
+ tool_groups_api=self.tool_groups_api,
+ tool_runtime_api=self.tool_runtime_api,
+ vector_io_api=self.vector_io_api,
+ vector_stores_config=self.tool_executor.vector_stores_config,
+ mcp_session_manager=mcp_session_manager,
+ )

- orchestrator = StreamingResponseOrchestrator(
- inference_api=self.inference_api,
- ctx=ctx,
- response_id=response_id,
- created_at=created_at,
- prompt=prompt,
- text=text,
- max_infer_iters=max_infer_iters,
- parallel_tool_calls=parallel_tool_calls,
- tool_executor=request_tool_executor,
- safety_api=self.safety_api,
- guardrail_ids=guardrail_ids,
- instructions=instructions,
- max_tool_calls=max_tool_calls,
- metadata=metadata,
- include=include,
- )
+ orchestrator = StreamingResponseOrchestrator(
+ inference_api=self.inference_api,
+ ctx=ctx,
+ response_id=response_id,
+ created_at=created_at,
+ prompt=prompt,
+ text=text,
+ max_infer_iters=max_infer_iters,
+ parallel_tool_calls=parallel_tool_calls,
+ tool_executor=request_tool_executor,
+ safety_api=self.safety_api,
+ connectors_api=self.connectors_api,
+ guardrail_ids=guardrail_ids,
+ instructions=instructions,
+ max_tool_calls=max_tool_calls,
+ reasoning=reasoning,
+ max_output_tokens=max_output_tokens,
+ metadata=metadata,
+ include=include,
+ store=store,
+ )

- # Stream the response
- final_response = None
- failed_response = None
+ final_response = None
+ failed_response = None

- # Type as ConversationItem to avoid list invariance issues
- output_items: list[ConversationItem] = []
+ output_items: list[ConversationItem] = []

- # Prepare input items for storage once (used by all persistence calls)
- input_items_for_storage = self._prepare_input_items_for_storage(all_input)
+ input_items_for_storage = self._prepare_input_items_for_storage(all_input)

- try:
  async for stream_chunk in orchestrator.create_response():
  match stream_chunk.type:
  case "response.completed" | "response.incomplete":
@@ -689,16 +697,6 @@
  await self.responses_store.store_conversation_messages(conversation, messages_to_store)

  yield stream_chunk
- finally:
- # Clean up MCP sessions at the end of the request (fix for #4452)
- # Use shield() to prevent cancellation from interrupting cleanup and leaking resources
- # Wrap in try/except as cleanup errors should not mask the original response
- try:
- await asyncio.shield(mcp_session_manager.close_all())
- except BaseException as e:
- # Debug level - cleanup errors are expected in streaming scenarios where
- # anyio cancel scopes may be in a different task context
- logger.debug(f"Error during MCP session cleanup: {e}")

  async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
  return await self.responses_store.delete_response_object(response_id)
@@ -721,4 +719,4 @@

  adapter = TypeAdapter(list[ConversationItem])
  validated_items = adapter.validate_python(conversation_items)
- await self.conversations_api.add_items(conversation_id, validated_items)
+ await self.conversations_api.add_items(conversation_id, AddItemsRequest(items=validated_items))
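
Editor's note: the most structural change in openai_responses.py above replaces manual MCP session cleanup (a finally block calling asyncio.shield(mcp_session_manager.close_all())) with an async with MCPSessionManager() block, so cleanup runs on every exit path, including cancellation. A generic sketch of that design choice, assuming nothing about the real MCPSessionManager internals, which this diff does not show:

class SessionPoolSketch:
    """Toy stand-in for an async-context-managed session pool."""

    def __init__(self):
        self._sessions: list = []  # hypothetical cache of open sessions

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc, tb):
        # Cleanup happens here instead of in a caller-side finally block,
        # so cancellation or errors in the request still release the sessions.
        for session in self._sessions:
            await session.close()
        return False  # never swallow the caller's exception

Usage then mirrors the diff: async with SessionPoolSketch() as pool, with no explicit close_all() call at the end of the request.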

llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@@ -4,6 +4,7 @@
  # This source code is licensed under the terms described in the LICENSE file in
  # the root directory of this source tree.

+ import time
  import uuid
  from collections.abc import AsyncIterator
  from typing import Any
@@ -16,6 +17,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import interleaved_con
  from llama_stack_api import (
  AllowedToolsFilter,
  ApprovalFilter,
+ Connectors,
  Inference,
  MCPListToolsTool,
  ModelNotFoundError,
@@ -30,6 +32,7 @@ from llama_stack_api import (
  OpenAIChatCompletionToolChoiceFunctionTool,
  OpenAIChoice,
  OpenAIChoiceLogprobs,
+ OpenAIFinishReason,
  OpenAIMessageParam,
  OpenAIResponseContentPartOutputText,
  OpenAIResponseContentPartReasoningText,
@@ -77,6 +80,7 @@ from llama_stack_api import (
  OpenAIResponseOutputMessageMCPListTools,
  OpenAIResponseOutputMessageWebSearchToolCall,
  OpenAIResponsePrompt,
+ OpenAIResponseReasoning,
  OpenAIResponseText,
  OpenAIResponseUsage,
  OpenAIResponseUsageInputTokensDetails,
@@ -133,11 +137,15 @@ class StreamingResponseOrchestrator:
  instructions: str | None,
  safety_api: Safety | None,
  guardrail_ids: list[str] | None = None,
+ connectors_api: Connectors | None = None,
  prompt: OpenAIResponsePrompt | None = None,
  parallel_tool_calls: bool | None = None,
  max_tool_calls: int | None = None,
+ reasoning: OpenAIResponseReasoning | None = None,
+ max_output_tokens: int | None = None,
  metadata: dict[str, str] | None = None,
  include: list[ResponseItemInclude] | None = None,
+ store: bool | None = True,
  ):
  self.inference_api = inference_api
  self.ctx = ctx
@@ -147,6 +155,7 @@ class StreamingResponseOrchestrator:
  self.max_infer_iters = max_infer_iters
  self.tool_executor = tool_executor
  self.safety_api = safety_api
+ self.connectors_api = connectors_api
  self.guardrail_ids = guardrail_ids or []
  self.prompt = prompt
  # System message that is inserted into the model's context
@@ -155,8 +164,13 @@
  self.parallel_tool_calls = parallel_tool_calls
  # Max number of total calls to built-in tools that can be processed in a response
  self.max_tool_calls = max_tool_calls
+ self.reasoning = reasoning
+ # An upper bound for the number of tokens that can be generated for a response
+ self.max_output_tokens = max_output_tokens
  self.metadata = metadata
+ self.store = store
  self.include = include
+ self.store = bool(store) if store is not None else True
  self.sequence_number = 0
  # Store MCP tool mapping that gets built during tool processing
  self.mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] = (
@@ -179,6 +193,8 @@
  self.violation_detected = False
  # Track total calls made to built-in tools
  self.accumulated_builtin_tool_calls = 0
+ # Track total output tokens generated across inference calls
+ self.accumulated_builtin_output_tokens = 0

  async def _create_refusal_response(self, violation_message: str) -> OpenAIResponseObjectStream:
  """Create a refusal response to replace streaming content."""
@@ -191,7 +207,9 @@
  model=self.ctx.model,
  status="completed",
  output=[OpenAIResponseMessage(role="assistant", content=[refusal_content], type="message")],
+ max_output_tokens=self.max_output_tokens,
  metadata=self.metadata,
+ store=self.store,
  )

  return OpenAIResponseObjectStreamResponseCompleted(response=refusal_response)
@@ -212,8 +230,10 @@
  *,
  error: OpenAIResponseError | None = None,
  ) -> OpenAIResponseObject:
+ completed_at = int(time.time()) if status == "completed" else None
  return OpenAIResponseObject(
  created_at=self.created_at,
+ completed_at=completed_at,
  id=self.response_id,
  model=self.ctx.model,
  object="response",
@@ -228,7 +248,10 @@
  prompt=self.prompt,
  parallel_tool_calls=self.parallel_tool_calls,
  max_tool_calls=self.max_tool_calls,
+ reasoning=self.reasoning,
+ max_output_tokens=self.max_output_tokens,
  metadata=self.metadata,
+ store=self.store,
  )

  async def create_response(self) -> AsyncIterator[OpenAIResponseObjectStream]:
@@ -292,6 +315,22 @@

  try:
  while True:
+ if (
+ self.max_output_tokens is not None
+ and self.accumulated_builtin_output_tokens >= self.max_output_tokens
+ ):
+ logger.info(
+ "Skipping inference call since max_output_tokens reached: "
+ f"{self.accumulated_builtin_output_tokens}/{self.max_output_tokens}"
+ )
+ final_status = "incomplete"
+ break
+
+ remaining_output_tokens = (
+ self.max_output_tokens - self.accumulated_builtin_output_tokens
+ if self.max_output_tokens is not None
+ else None
+ )
  # Text is the default response format for chat completion so don't need to pass it
  # (some providers don't support non-empty response_format when tools are present)
  response_format = (
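
Editor's note: the max_output_tokens budget introduced above is accumulated across inference iterations; the following hunks pass the remainder to each chat-completion call as max_completion_tokens and mark the response "incomplete" once the budget is spent. A small arithmetic sketch of that budget with hypothetical numbers, not the orchestrator's actual code:

# Illustration only: names and per-call token counts are made up.
max_output_tokens = 512
accumulated = 0
status = "completed"

for tokens_this_call in (200, 250, 300):  # completion_tokens per inference call
    if accumulated >= max_output_tokens:
        status = "incomplete"  # budget already spent, so skip further inference
        break
    remaining = max_output_tokens - accumulated
    # the orchestrator passes the remainder as max_completion_tokens, so a call
    # can never produce more than the leftover budget
    accumulated += min(tokens_this_call, remaining)

print(status, accumulated)  # -> completed 512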
@@ -311,6 +350,11 @@
  True if self.include and ResponseItemInclude.message_output_text_logprobs in self.include else None
  )

+ # In OpenAI, parallel_tool_calls is only allowed when 'tools' are specified.
+ effective_parallel_tool_calls = (
+ self.parallel_tool_calls if effective_tools is not None and len(effective_tools) > 0 else None
+ )
+
  params = OpenAIChatCompletionRequestWithExtraBody(
  model=self.ctx.model,
  messages=messages,
@@ -324,6 +368,9 @@
  "include_usage": True,
  },
  logprobs=logprobs,
+ parallel_tool_calls=effective_parallel_tool_calls,
+ reasoning_effort=self.reasoning.effort if self.reasoning else None,
+ max_completion_tokens=remaining_output_tokens,
  )
  completion_result = await self.inference_api.openai_chat_completion(params)

@@ -480,23 +527,24 @@
  if not chunk.usage:
  return

+ self.accumulated_builtin_output_tokens += chunk.usage.completion_tokens
+
  if self.accumulated_usage is None:
  # Convert from chat completion format to response format
  self.accumulated_usage = OpenAIResponseUsage(
  input_tokens=chunk.usage.prompt_tokens,
  output_tokens=chunk.usage.completion_tokens,
  total_tokens=chunk.usage.total_tokens,
- input_tokens_details=(
- OpenAIResponseUsageInputTokensDetails(cached_tokens=chunk.usage.prompt_tokens_details.cached_tokens)
- if chunk.usage.prompt_tokens_details
- else None
+ input_tokens_details=OpenAIResponseUsageInputTokensDetails(
+ cached_tokens=chunk.usage.prompt_tokens_details.cached_tokens
+ if chunk.usage.prompt_tokens_details and chunk.usage.prompt_tokens_details.cached_tokens is not None
+ else 0
  ),
- output_tokens_details=(
- OpenAIResponseUsageOutputTokensDetails(
- reasoning_tokens=chunk.usage.completion_tokens_details.reasoning_tokens
- )
+ output_tokens_details=OpenAIResponseUsageOutputTokensDetails(
+ reasoning_tokens=chunk.usage.completion_tokens_details.reasoning_tokens
  if chunk.usage.completion_tokens_details
- else None
+ and chunk.usage.completion_tokens_details.reasoning_tokens is not None
+ else 0
  ),
  )
  else:
@@ -506,17 +554,16 @@
  output_tokens=self.accumulated_usage.output_tokens + chunk.usage.completion_tokens,
  total_tokens=self.accumulated_usage.total_tokens + chunk.usage.total_tokens,
  # Use latest non-null details
- input_tokens_details=(
- OpenAIResponseUsageInputTokensDetails(cached_tokens=chunk.usage.prompt_tokens_details.cached_tokens)
- if chunk.usage.prompt_tokens_details
- else self.accumulated_usage.input_tokens_details
+ input_tokens_details=OpenAIResponseUsageInputTokensDetails(
+ cached_tokens=chunk.usage.prompt_tokens_details.cached_tokens
+ if chunk.usage.prompt_tokens_details and chunk.usage.prompt_tokens_details.cached_tokens is not None
+ else self.accumulated_usage.input_tokens_details.cached_tokens
  ),
- output_tokens_details=(
- OpenAIResponseUsageOutputTokensDetails(
- reasoning_tokens=chunk.usage.completion_tokens_details.reasoning_tokens
- )
+ output_tokens_details=OpenAIResponseUsageOutputTokensDetails(
+ reasoning_tokens=chunk.usage.completion_tokens_details.reasoning_tokens
  if chunk.usage.completion_tokens_details
- else self.accumulated_usage.output_tokens_details
+ and chunk.usage.completion_tokens_details.reasoning_tokens is not None
+ else self.accumulated_usage.output_tokens_details.reasoning_tokens
  ),
  )

@@ -652,7 +699,7 @@
  chat_response_tool_calls: dict[int, OpenAIChatCompletionToolCall] = {}
  chunk_created = 0
  chunk_model = ""
- chunk_finish_reason = ""
+ chunk_finish_reason: OpenAIFinishReason = "stop"
  chat_response_logprobs = []

  # Create a placeholder message item for delta events
@@ -744,9 +791,9 @@
  chunk_finish_reason = chunk_choice.finish_reason

  # Handle reasoning content if present (non-standard field for o1/o3 models)
- if hasattr(chunk_choice.delta, "reasoning_content") and chunk_choice.delta.reasoning_content:
+ if hasattr(chunk_choice.delta, "reasoning") and chunk_choice.delta.reasoning:
  async for event in self._handle_reasoning_content_chunk(
- reasoning_content=chunk_choice.delta.reasoning_content,
+ reasoning_content=chunk_choice.delta.reasoning,
  reasoning_part_emitted=reasoning_part_emitted,
  reasoning_content_index=reasoning_content_index,
  message_item_id=message_item_id,
@@ -758,7 +805,7 @@
  else:
  yield event
  reasoning_part_emitted = True
- reasoning_text_accumulated.append(chunk_choice.delta.reasoning_content)
+ reasoning_text_accumulated.append(chunk_choice.delta.reasoning)

  # Handle refusal content if present
  if chunk_choice.delta.refusal:
@@ -1175,6 +1222,9 @@
  """Process an MCP tool configuration and emit appropriate streaming events."""
  from llama_stack.providers.utils.tools.mcp import list_mcp_tools

+ # Resolve connector_id to server_url if provided
+ mcp_tool = await resolve_mcp_connector_id(mcp_tool, self.connectors_api)
+
  # Emit mcp_list_tools.in_progress
  self.sequence_number += 1
  yield OpenAIResponseObjectStreamResponseMcpListToolsInProgress(
@@ -1489,3 +1539,25 @@ async def _process_tool_choice(
  tools=tool_choice,
  mode="required",
  )
+
+
+ async def resolve_mcp_connector_id(
+ mcp_tool: OpenAIResponseInputToolMCP,
+ connectors_api: Connectors,
+ ) -> OpenAIResponseInputToolMCP:
+ """Resolve connector_id to server_url for an MCP tool.
+
+ If the mcp_tool has a connector_id but no server_url, this function
+ looks up the connector and populates the server_url from it.
+
+ Args:
+ mcp_tool: The MCP tool configuration to resolve
+ connectors_api: The connectors API for looking up connectors
+
+ Returns:
+ The mcp_tool with server_url populated (may be same instance if already set)
+ """
+ if mcp_tool.connector_id and not mcp_tool.server_url:
+ connector = await connectors_api.get_connector(mcp_tool.connector_id)
+ return mcp_tool.model_copy(update={"server_url": connector.url})
+ return mcp_tool
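
Editor's note: the new resolve_mcp_connector_id helper lets an MCP tool reference a connector registered via the new Connectors API by id instead of carrying a server_url. A hedged usage sketch; only connector_id, server_url, the Connectors.get_connector call, and the connector's url attribute come from this diff, while the remaining tool fields and the id value are assumptions:

async def attach_connector(connectors_api):
    # Hypothetical tool config: no server_url, only a connector_id.
    mcp_tool = OpenAIResponseInputToolMCP(
        type="mcp",                   # assumed discriminator value
        server_label="docs",          # assumed field
        connector_id="conn_example",  # assumed id format
    )
    resolved = await resolve_mcp_connector_id(mcp_tool, connectors_api)
    # server_url is now populated from the registered connector's url
    return resolved.server_url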