llama-stack 0.4.3__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (311)
  1. llama_stack/cli/stack/_list_deps.py +11 -7
  2. llama_stack/cli/stack/run.py +3 -25
  3. llama_stack/core/access_control/datatypes.py +78 -0
  4. llama_stack/core/configure.py +2 -2
  5. {llama_stack_api/internal → llama_stack/core/connectors}/__init__.py +2 -2
  6. llama_stack/core/connectors/connectors.py +162 -0
  7. llama_stack/core/conversations/conversations.py +61 -58
  8. llama_stack/core/datatypes.py +54 -8
  9. llama_stack/core/library_client.py +60 -13
  10. llama_stack/core/prompts/prompts.py +43 -42
  11. llama_stack/core/routers/datasets.py +20 -17
  12. llama_stack/core/routers/eval_scoring.py +143 -53
  13. llama_stack/core/routers/inference.py +20 -9
  14. llama_stack/core/routers/safety.py +30 -42
  15. llama_stack/core/routers/vector_io.py +15 -7
  16. llama_stack/core/routing_tables/models.py +42 -3
  17. llama_stack/core/routing_tables/scoring_functions.py +19 -19
  18. llama_stack/core/routing_tables/shields.py +20 -17
  19. llama_stack/core/routing_tables/vector_stores.py +8 -5
  20. llama_stack/core/server/auth.py +192 -17
  21. llama_stack/core/server/fastapi_router_registry.py +40 -5
  22. llama_stack/core/server/server.py +24 -5
  23. llama_stack/core/stack.py +54 -10
  24. llama_stack/core/storage/datatypes.py +9 -0
  25. llama_stack/core/store/registry.py +1 -1
  26. llama_stack/core/utils/exec.py +2 -2
  27. llama_stack/core/utils/type_inspection.py +16 -2
  28. llama_stack/distributions/dell/config.yaml +4 -1
  29. llama_stack/distributions/dell/doc_template.md +209 -0
  30. llama_stack/distributions/dell/run-with-safety.yaml +4 -1
  31. llama_stack/distributions/nvidia/config.yaml +4 -1
  32. llama_stack/distributions/nvidia/doc_template.md +170 -0
  33. llama_stack/distributions/nvidia/run-with-safety.yaml +4 -1
  34. llama_stack/distributions/oci/config.yaml +4 -1
  35. llama_stack/distributions/oci/doc_template.md +140 -0
  36. llama_stack/distributions/open-benchmark/config.yaml +9 -1
  37. llama_stack/distributions/postgres-demo/config.yaml +1 -1
  38. llama_stack/distributions/starter/build.yaml +62 -0
  39. llama_stack/distributions/starter/config.yaml +22 -3
  40. llama_stack/distributions/starter/run-with-postgres-store.yaml +22 -3
  41. llama_stack/distributions/starter/starter.py +13 -1
  42. llama_stack/distributions/starter-gpu/build.yaml +62 -0
  43. llama_stack/distributions/starter-gpu/config.yaml +22 -3
  44. llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +22 -3
  45. llama_stack/distributions/template.py +10 -2
  46. llama_stack/distributions/watsonx/config.yaml +4 -1
  47. llama_stack/log.py +1 -0
  48. llama_stack/models/llama/resources/dog.jpg +0 -0
  49. llama_stack/models/llama/resources/pasta.jpeg +0 -0
  50. llama_stack/models/llama/resources/small_dog.jpg +0 -0
  51. llama_stack/providers/inline/agents/meta_reference/__init__.py +1 -0
  52. llama_stack/providers/inline/agents/meta_reference/agents.py +58 -61
  53. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +187 -60
  54. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +99 -22
  55. llama_stack/providers/inline/agents/meta_reference/responses/types.py +2 -1
  56. llama_stack/providers/inline/agents/meta_reference/responses/utils.py +4 -1
  57. llama_stack/providers/inline/agents/meta_reference/safety.py +2 -2
  58. llama_stack/providers/inline/batches/reference/batches.py +2 -1
  59. llama_stack/providers/inline/eval/meta_reference/eval.py +40 -32
  60. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.h +9 -0
  61. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.swift +189 -0
  62. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/Parsing.swift +238 -0
  63. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/PromptTemplate.swift +12 -0
  64. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/SystemPrompts.swift +89 -0
  65. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.pbxproj +550 -0
  66. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
  67. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
  68. llama_stack/providers/inline/post_training/huggingface/post_training.py +33 -38
  69. llama_stack/providers/inline/post_training/huggingface/utils.py +2 -5
  70. llama_stack/providers/inline/post_training/torchtune/common/utils.py +5 -9
  71. llama_stack/providers/inline/post_training/torchtune/post_training.py +28 -33
  72. llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +2 -4
  73. llama_stack/providers/inline/safety/code_scanner/code_scanner.py +12 -15
  74. llama_stack/providers/inline/safety/llama_guard/llama_guard.py +20 -24
  75. llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +11 -17
  76. llama_stack/providers/inline/scoring/basic/scoring.py +13 -17
  77. llama_stack/providers/inline/scoring/braintrust/braintrust.py +15 -15
  78. llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +13 -17
  79. llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +1 -1
  80. llama_stack/providers/registry/agents.py +1 -0
  81. llama_stack/providers/registry/inference.py +1 -9
  82. llama_stack/providers/registry/vector_io.py +136 -16
  83. llama_stack/providers/remote/datasetio/nvidia/README.md +74 -0
  84. llama_stack/providers/remote/eval/nvidia/README.md +134 -0
  85. llama_stack/providers/remote/eval/nvidia/eval.py +22 -21
  86. llama_stack/providers/remote/files/s3/README.md +266 -0
  87. llama_stack/providers/remote/files/s3/config.py +5 -3
  88. llama_stack/providers/remote/files/s3/files.py +2 -2
  89. llama_stack/providers/remote/inference/gemini/gemini.py +4 -0
  90. llama_stack/providers/remote/inference/nvidia/NVIDIA.md +203 -0
  91. llama_stack/providers/remote/inference/openai/openai.py +2 -0
  92. llama_stack/providers/remote/inference/together/together.py +4 -0
  93. llama_stack/providers/remote/inference/vertexai/config.py +3 -3
  94. llama_stack/providers/remote/inference/vertexai/vertexai.py +5 -2
  95. llama_stack/providers/remote/inference/vllm/config.py +37 -18
  96. llama_stack/providers/remote/inference/vllm/vllm.py +0 -3
  97. llama_stack/providers/remote/inference/watsonx/watsonx.py +4 -0
  98. llama_stack/providers/remote/post_training/nvidia/README.md +151 -0
  99. llama_stack/providers/remote/post_training/nvidia/models.py +3 -11
  100. llama_stack/providers/remote/post_training/nvidia/post_training.py +31 -33
  101. llama_stack/providers/remote/safety/bedrock/bedrock.py +10 -27
  102. llama_stack/providers/remote/safety/nvidia/README.md +78 -0
  103. llama_stack/providers/remote/safety/nvidia/nvidia.py +9 -25
  104. llama_stack/providers/remote/safety/sambanova/sambanova.py +13 -11
  105. llama_stack/providers/remote/vector_io/elasticsearch/__init__.py +17 -0
  106. llama_stack/providers/remote/vector_io/elasticsearch/config.py +32 -0
  107. llama_stack/providers/remote/vector_io/elasticsearch/elasticsearch.py +463 -0
  108. llama_stack/providers/remote/vector_io/oci/__init__.py +22 -0
  109. llama_stack/providers/remote/vector_io/oci/config.py +41 -0
  110. llama_stack/providers/remote/vector_io/oci/oci26ai.py +595 -0
  111. llama_stack/providers/remote/vector_io/pgvector/config.py +69 -2
  112. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +255 -6
  113. llama_stack/providers/remote/vector_io/qdrant/qdrant.py +62 -38
  114. llama_stack/providers/utils/bedrock/client.py +3 -3
  115. llama_stack/providers/utils/bedrock/config.py +7 -7
  116. llama_stack/providers/utils/inference/__init__.py +0 -25
  117. llama_stack/providers/utils/inference/embedding_mixin.py +4 -0
  118. llama_stack/providers/utils/inference/http_client.py +239 -0
  119. llama_stack/providers/utils/inference/litellm_openai_mixin.py +6 -0
  120. llama_stack/providers/utils/inference/model_registry.py +148 -2
  121. llama_stack/providers/utils/inference/openai_compat.py +1 -158
  122. llama_stack/providers/utils/inference/openai_mixin.py +42 -2
  123. llama_stack/providers/utils/inference/prompt_adapter.py +0 -209
  124. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +92 -5
  125. llama_stack/providers/utils/memory/vector_store.py +46 -19
  126. llama_stack/providers/utils/responses/responses_store.py +40 -6
  127. llama_stack/providers/utils/safety.py +114 -0
  128. llama_stack/providers/utils/tools/mcp.py +44 -3
  129. llama_stack/testing/api_recorder.py +9 -3
  130. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0.dist-info}/METADATA +14 -2
  131. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0.dist-info}/RECORD +135 -279
  132. llama_stack-0.5.0.dist-info/top_level.txt +1 -0
  133. llama_stack/distributions/meta-reference-gpu/__init__.py +0 -7
  134. llama_stack/distributions/meta-reference-gpu/config.yaml +0 -140
  135. llama_stack/distributions/meta-reference-gpu/meta_reference.py +0 -163
  136. llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +0 -155
  137. llama_stack/models/llama/hadamard_utils.py +0 -88
  138. llama_stack/models/llama/llama3/args.py +0 -74
  139. llama_stack/models/llama/llama3/generation.py +0 -378
  140. llama_stack/models/llama/llama3/model.py +0 -304
  141. llama_stack/models/llama/llama3/multimodal/__init__.py +0 -12
  142. llama_stack/models/llama/llama3/multimodal/encoder_utils.py +0 -180
  143. llama_stack/models/llama/llama3/multimodal/image_transform.py +0 -409
  144. llama_stack/models/llama/llama3/multimodal/model.py +0 -1430
  145. llama_stack/models/llama/llama3/multimodal/utils.py +0 -26
  146. llama_stack/models/llama/llama3/quantization/__init__.py +0 -5
  147. llama_stack/models/llama/llama3/quantization/loader.py +0 -316
  148. llama_stack/models/llama/llama3_1/__init__.py +0 -12
  149. llama_stack/models/llama/llama3_1/prompt_format.md +0 -358
  150. llama_stack/models/llama/llama3_1/prompts.py +0 -258
  151. llama_stack/models/llama/llama3_2/__init__.py +0 -5
  152. llama_stack/models/llama/llama3_2/prompts_text.py +0 -229
  153. llama_stack/models/llama/llama3_2/prompts_vision.py +0 -126
  154. llama_stack/models/llama/llama3_2/text_prompt_format.md +0 -286
  155. llama_stack/models/llama/llama3_2/vision_prompt_format.md +0 -141
  156. llama_stack/models/llama/llama3_3/__init__.py +0 -5
  157. llama_stack/models/llama/llama3_3/prompts.py +0 -259
  158. llama_stack/models/llama/llama4/args.py +0 -107
  159. llama_stack/models/llama/llama4/ffn.py +0 -58
  160. llama_stack/models/llama/llama4/moe.py +0 -214
  161. llama_stack/models/llama/llama4/preprocess.py +0 -435
  162. llama_stack/models/llama/llama4/quantization/__init__.py +0 -5
  163. llama_stack/models/llama/llama4/quantization/loader.py +0 -226
  164. llama_stack/models/llama/llama4/vision/__init__.py +0 -5
  165. llama_stack/models/llama/llama4/vision/embedding.py +0 -210
  166. llama_stack/models/llama/llama4/vision/encoder.py +0 -412
  167. llama_stack/models/llama/quantize_impls.py +0 -316
  168. llama_stack/providers/inline/inference/meta_reference/__init__.py +0 -20
  169. llama_stack/providers/inline/inference/meta_reference/common.py +0 -24
  170. llama_stack/providers/inline/inference/meta_reference/config.py +0 -68
  171. llama_stack/providers/inline/inference/meta_reference/generators.py +0 -201
  172. llama_stack/providers/inline/inference/meta_reference/inference.py +0 -542
  173. llama_stack/providers/inline/inference/meta_reference/model_parallel.py +0 -77
  174. llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +0 -353
  175. llama_stack-0.4.3.dist-info/top_level.txt +0 -2
  176. llama_stack_api/__init__.py +0 -945
  177. llama_stack_api/admin/__init__.py +0 -45
  178. llama_stack_api/admin/api.py +0 -72
  179. llama_stack_api/admin/fastapi_routes.py +0 -117
  180. llama_stack_api/admin/models.py +0 -113
  181. llama_stack_api/agents.py +0 -173
  182. llama_stack_api/batches/__init__.py +0 -40
  183. llama_stack_api/batches/api.py +0 -53
  184. llama_stack_api/batches/fastapi_routes.py +0 -113
  185. llama_stack_api/batches/models.py +0 -78
  186. llama_stack_api/benchmarks/__init__.py +0 -43
  187. llama_stack_api/benchmarks/api.py +0 -39
  188. llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  189. llama_stack_api/benchmarks/models.py +0 -109
  190. llama_stack_api/common/__init__.py +0 -5
  191. llama_stack_api/common/content_types.py +0 -101
  192. llama_stack_api/common/errors.py +0 -95
  193. llama_stack_api/common/job_types.py +0 -38
  194. llama_stack_api/common/responses.py +0 -77
  195. llama_stack_api/common/training_types.py +0 -47
  196. llama_stack_api/common/type_system.py +0 -146
  197. llama_stack_api/connectors.py +0 -146
  198. llama_stack_api/conversations.py +0 -270
  199. llama_stack_api/datasetio.py +0 -55
  200. llama_stack_api/datasets/__init__.py +0 -61
  201. llama_stack_api/datasets/api.py +0 -35
  202. llama_stack_api/datasets/fastapi_routes.py +0 -104
  203. llama_stack_api/datasets/models.py +0 -152
  204. llama_stack_api/datatypes.py +0 -373
  205. llama_stack_api/eval.py +0 -137
  206. llama_stack_api/file_processors/__init__.py +0 -27
  207. llama_stack_api/file_processors/api.py +0 -64
  208. llama_stack_api/file_processors/fastapi_routes.py +0 -78
  209. llama_stack_api/file_processors/models.py +0 -42
  210. llama_stack_api/files/__init__.py +0 -35
  211. llama_stack_api/files/api.py +0 -51
  212. llama_stack_api/files/fastapi_routes.py +0 -124
  213. llama_stack_api/files/models.py +0 -107
  214. llama_stack_api/inference.py +0 -1169
  215. llama_stack_api/inspect_api/__init__.py +0 -37
  216. llama_stack_api/inspect_api/api.py +0 -25
  217. llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  218. llama_stack_api/inspect_api/models.py +0 -28
  219. llama_stack_api/internal/kvstore.py +0 -28
  220. llama_stack_api/internal/sqlstore.py +0 -81
  221. llama_stack_api/llama_stack_api/__init__.py +0 -945
  222. llama_stack_api/llama_stack_api/admin/__init__.py +0 -45
  223. llama_stack_api/llama_stack_api/admin/api.py +0 -72
  224. llama_stack_api/llama_stack_api/admin/fastapi_routes.py +0 -117
  225. llama_stack_api/llama_stack_api/admin/models.py +0 -113
  226. llama_stack_api/llama_stack_api/agents.py +0 -173
  227. llama_stack_api/llama_stack_api/batches/__init__.py +0 -40
  228. llama_stack_api/llama_stack_api/batches/api.py +0 -53
  229. llama_stack_api/llama_stack_api/batches/fastapi_routes.py +0 -113
  230. llama_stack_api/llama_stack_api/batches/models.py +0 -78
  231. llama_stack_api/llama_stack_api/benchmarks/__init__.py +0 -43
  232. llama_stack_api/llama_stack_api/benchmarks/api.py +0 -39
  233. llama_stack_api/llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  234. llama_stack_api/llama_stack_api/benchmarks/models.py +0 -109
  235. llama_stack_api/llama_stack_api/common/__init__.py +0 -5
  236. llama_stack_api/llama_stack_api/common/content_types.py +0 -101
  237. llama_stack_api/llama_stack_api/common/errors.py +0 -95
  238. llama_stack_api/llama_stack_api/common/job_types.py +0 -38
  239. llama_stack_api/llama_stack_api/common/responses.py +0 -77
  240. llama_stack_api/llama_stack_api/common/training_types.py +0 -47
  241. llama_stack_api/llama_stack_api/common/type_system.py +0 -146
  242. llama_stack_api/llama_stack_api/connectors.py +0 -146
  243. llama_stack_api/llama_stack_api/conversations.py +0 -270
  244. llama_stack_api/llama_stack_api/datasetio.py +0 -55
  245. llama_stack_api/llama_stack_api/datasets/__init__.py +0 -61
  246. llama_stack_api/llama_stack_api/datasets/api.py +0 -35
  247. llama_stack_api/llama_stack_api/datasets/fastapi_routes.py +0 -104
  248. llama_stack_api/llama_stack_api/datasets/models.py +0 -152
  249. llama_stack_api/llama_stack_api/datatypes.py +0 -373
  250. llama_stack_api/llama_stack_api/eval.py +0 -137
  251. llama_stack_api/llama_stack_api/file_processors/__init__.py +0 -27
  252. llama_stack_api/llama_stack_api/file_processors/api.py +0 -64
  253. llama_stack_api/llama_stack_api/file_processors/fastapi_routes.py +0 -78
  254. llama_stack_api/llama_stack_api/file_processors/models.py +0 -42
  255. llama_stack_api/llama_stack_api/files/__init__.py +0 -35
  256. llama_stack_api/llama_stack_api/files/api.py +0 -51
  257. llama_stack_api/llama_stack_api/files/fastapi_routes.py +0 -124
  258. llama_stack_api/llama_stack_api/files/models.py +0 -107
  259. llama_stack_api/llama_stack_api/inference.py +0 -1169
  260. llama_stack_api/llama_stack_api/inspect_api/__init__.py +0 -37
  261. llama_stack_api/llama_stack_api/inspect_api/api.py +0 -25
  262. llama_stack_api/llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  263. llama_stack_api/llama_stack_api/inspect_api/models.py +0 -28
  264. llama_stack_api/llama_stack_api/internal/__init__.py +0 -9
  265. llama_stack_api/llama_stack_api/internal/kvstore.py +0 -28
  266. llama_stack_api/llama_stack_api/internal/sqlstore.py +0 -81
  267. llama_stack_api/llama_stack_api/models.py +0 -171
  268. llama_stack_api/llama_stack_api/openai_responses.py +0 -1468
  269. llama_stack_api/llama_stack_api/post_training.py +0 -370
  270. llama_stack_api/llama_stack_api/prompts.py +0 -203
  271. llama_stack_api/llama_stack_api/providers/__init__.py +0 -33
  272. llama_stack_api/llama_stack_api/providers/api.py +0 -16
  273. llama_stack_api/llama_stack_api/providers/fastapi_routes.py +0 -57
  274. llama_stack_api/llama_stack_api/providers/models.py +0 -24
  275. llama_stack_api/llama_stack_api/py.typed +0 -0
  276. llama_stack_api/llama_stack_api/rag_tool.py +0 -168
  277. llama_stack_api/llama_stack_api/resource.py +0 -37
  278. llama_stack_api/llama_stack_api/router_utils.py +0 -160
  279. llama_stack_api/llama_stack_api/safety.py +0 -132
  280. llama_stack_api/llama_stack_api/schema_utils.py +0 -208
  281. llama_stack_api/llama_stack_api/scoring.py +0 -93
  282. llama_stack_api/llama_stack_api/scoring_functions.py +0 -211
  283. llama_stack_api/llama_stack_api/shields.py +0 -93
  284. llama_stack_api/llama_stack_api/tools.py +0 -226
  285. llama_stack_api/llama_stack_api/vector_io.py +0 -941
  286. llama_stack_api/llama_stack_api/vector_stores.py +0 -53
  287. llama_stack_api/llama_stack_api/version.py +0 -9
  288. llama_stack_api/models.py +0 -171
  289. llama_stack_api/openai_responses.py +0 -1468
  290. llama_stack_api/post_training.py +0 -370
  291. llama_stack_api/prompts.py +0 -203
  292. llama_stack_api/providers/__init__.py +0 -33
  293. llama_stack_api/providers/api.py +0 -16
  294. llama_stack_api/providers/fastapi_routes.py +0 -57
  295. llama_stack_api/providers/models.py +0 -24
  296. llama_stack_api/py.typed +0 -0
  297. llama_stack_api/rag_tool.py +0 -168
  298. llama_stack_api/resource.py +0 -37
  299. llama_stack_api/router_utils.py +0 -160
  300. llama_stack_api/safety.py +0 -132
  301. llama_stack_api/schema_utils.py +0 -208
  302. llama_stack_api/scoring.py +0 -93
  303. llama_stack_api/scoring_functions.py +0 -211
  304. llama_stack_api/shields.py +0 -93
  305. llama_stack_api/tools.py +0 -226
  306. llama_stack_api/vector_io.py +0 -941
  307. llama_stack_api/vector_stores.py +0 -53
  308. llama_stack_api/version.py +0 -9
  309. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0.dist-info}/WHEEL +0 -0
  310. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0.dist-info}/entry_points.txt +0 -0
  311. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0.dist-info}/licenses/LICENSE +0 -0
llama_stack/providers/inline/agents/meta_reference/agents.py
@@ -4,6 +4,7 @@
  # This source code is licensed under the terms described in the LICENSE file in
  # the root directory of this source tree.

+ from collections.abc import AsyncIterator

  from llama_stack.core.datatypes import AccessRule
  from llama_stack.core.storage.kvstore import InmemoryKVStoreImpl, kvstore_impl
@@ -11,21 +12,21 @@ from llama_stack.log import get_logger
  from llama_stack.providers.utils.responses.responses_store import ResponsesStore
  from llama_stack_api import (
  Agents,
+ Connectors,
  Conversations,
+ CreateResponseRequest,
+ DeleteResponseRequest,
  Files,
  Inference,
  ListOpenAIResponseInputItem,
  ListOpenAIResponseObject,
+ ListResponseInputItemsRequest,
+ ListResponsesRequest,
  OpenAIDeleteResponseObject,
- OpenAIResponseInput,
- OpenAIResponseInputTool,
- OpenAIResponseInputToolChoice,
  OpenAIResponseObject,
- OpenAIResponsePrompt,
- OpenAIResponseText,
- Order,
+ OpenAIResponseObjectStream,
  Prompts,
- ResponseGuardrail,
+ RetrieveResponseRequest,
  Safety,
  ToolGroups,
  ToolRuntime,
@@ -50,6 +51,7 @@ class MetaReferenceAgentsImpl(Agents):
  conversations_api: Conversations,
  prompts_api: Prompts,
  files_api: Files,
+ connectors_api: Connectors,
  policy: list[AccessRule],
  ):
  self.config = config
@@ -64,6 +66,7 @@ class MetaReferenceAgentsImpl(Agents):
  self.in_memory_store = InmemoryKVStoreImpl()
  self.openai_responses_impl: OpenAIResponsesImpl | None = None
  self.policy = policy
+ self.connectors_api = connectors_api

  async def initialize(self) -> None:
  self.persistence_store = await kvstore_impl(self.config.persistence.agent_state)
@@ -80,6 +83,7 @@ class MetaReferenceAgentsImpl(Agents):
  prompts_api=self.prompts_api,
  files_api=self.files_api,
  vector_stores_config=self.config.vector_stores_config,
+ connectors_api=self.connectors_api,
  )

  async def shutdown(self) -> None:
@@ -88,79 +92,72 @@ class MetaReferenceAgentsImpl(Agents):
  # OpenAI responses
  async def get_openai_response(
  self,
- response_id: str,
+ request: RetrieveResponseRequest,
  ) -> OpenAIResponseObject:
  assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
- return await self.openai_responses_impl.get_openai_response(response_id)
+ return await self.openai_responses_impl.get_openai_response(request.response_id)

  async def create_openai_response(
  self,
- input: str | list[OpenAIResponseInput],
- model: str,
- prompt: OpenAIResponsePrompt | None = None,
- instructions: str | None = None,
- parallel_tool_calls: bool | None = True,
- previous_response_id: str | None = None,
- conversation: str | None = None,
- store: bool | None = True,
- stream: bool | None = False,
- temperature: float | None = None,
- text: OpenAIResponseText | None = None,
- tool_choice: OpenAIResponseInputToolChoice | None = None,
- tools: list[OpenAIResponseInputTool] | None = None,
- include: list[str] | None = None,
- max_infer_iters: int | None = 10,
- guardrails: list[ResponseGuardrail] | None = None,
- max_tool_calls: int | None = None,
- metadata: dict[str, str] | None = None,
- ) -> OpenAIResponseObject:
+ request: CreateResponseRequest,
+ ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
+ """Create an OpenAI response.
+
+ Returns either a single response object (non-streaming) or an async iterator
+ yielding response stream events (streaming).
+ """
  assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
  result = await self.openai_responses_impl.create_openai_response(
- input,
- model,
- prompt,
- instructions,
- previous_response_id,
- conversation,
- store,
- stream,
- temperature,
- text,
- tool_choice,
- tools,
- include,
- max_infer_iters,
- guardrails,
- parallel_tool_calls,
- max_tool_calls,
- metadata,
+ request.input,
+ request.model,
+ request.prompt,
+ request.instructions,
+ request.previous_response_id,
+ request.conversation,
+ request.store,
+ request.stream,
+ request.temperature,
+ request.text,
+ request.tool_choice,
+ request.tools,
+ request.include,
+ request.max_infer_iters,
+ request.guardrails,
+ request.parallel_tool_calls,
+ request.max_tool_calls,
+ request.max_output_tokens,
+ request.reasoning,
+ request.safety_identifier,
+ request.metadata,
  )
- return result # type: ignore[no-any-return]
+ return result

  async def list_openai_responses(
  self,
- after: str | None = None,
- limit: int | None = 50,
- model: str | None = None,
- order: Order | None = Order.desc,
+ request: ListResponsesRequest,
  ) -> ListOpenAIResponseObject:
  assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
- return await self.openai_responses_impl.list_openai_responses(after, limit, model, order)
+ return await self.openai_responses_impl.list_openai_responses(
+ request.after, request.limit, request.model, request.order
+ )

  async def list_openai_response_input_items(
  self,
- response_id: str,
- after: str | None = None,
- before: str | None = None,
- include: list[str] | None = None,
- limit: int | None = 20,
- order: Order | None = Order.desc,
+ request: ListResponseInputItemsRequest,
  ) -> ListOpenAIResponseInputItem:
  assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
  return await self.openai_responses_impl.list_openai_response_input_items(
- response_id, after, before, include, limit, order
+ request.response_id,
+ request.after,
+ request.before,
+ request.include,
+ request.limit,
+ request.order,
  )

- async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
+ async def delete_openai_response(
+ self,
+ request: DeleteResponseRequest,
+ ) -> OpenAIDeleteResponseObject:
  assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
- return await self.openai_responses_impl.delete_openai_response(response_id)
+ return await self.openai_responses_impl.delete_openai_response(request.response_id)
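Note: the hunks above move the Agents surface from flat keyword arguments to typed request objects (CreateResponseRequest, RetrieveResponseRequest, ListResponsesRequest, ListResponseInputItemsRequest, DeleteResponseRequest). The sketch below is not part of the diff; it only illustrates how a caller might drive the new shape, and it assumes an already-initialized MetaReferenceAgentsImpl instance plus the request fields referenced via request.* in this hunk.

from llama_stack_api import CreateResponseRequest, RetrieveResponseRequest


async def run_turn(agents_impl, model_id: str, prompt_text: str):
    # Non-streaming create: a single OpenAIResponseObject is returned
    # (with stream=True the same call would yield OpenAIResponseObjectStream events).
    response = await agents_impl.create_openai_response(
        CreateResponseRequest(
            input=prompt_text,  # field names taken from the request.* usage above
            model=model_id,
            stream=False,
            store=True,
        )
    )

    # Retrieval also takes a request object now, not a bare response_id.
    # response.id assumed to follow the OpenAI Responses object shape.
    return await agents_impl.get_openai_response(
        RetrieveResponseRequest(response_id=response.id)
    )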
llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
@@ -4,7 +4,6 @@
  # This source code is licensed under the terms described in the LICENSE file in
  # the root directory of this source tree.

- import asyncio
  import re
  import time
  import uuid
@@ -19,11 +18,14 @@ from llama_stack.providers.utils.responses.responses_store import (
  )
  from llama_stack.providers.utils.tools.mcp import MCPSessionManager
  from llama_stack_api import (
+ AddItemsRequest,
+ Connectors,
  ConversationItem,
  Conversations,
  Files,
  Inference,
  InvalidConversationIdError,
+ ListItemsRequest,
  ListOpenAIResponseInputItem,
  ListOpenAIResponseObject,
  OpenAIChatCompletionContentPartParam,
@@ -39,6 +41,7 @@ from llama_stack_api import (
  OpenAIResponseObject,
  OpenAIResponseObjectStream,
  OpenAIResponsePrompt,
+ OpenAIResponseReasoning,
  OpenAIResponseText,
  OpenAIResponseTextFormat,
  OpenAISystemMessageParam,
@@ -83,6 +86,7 @@ class OpenAIResponsesImpl:
  conversations_api: Conversations,
  prompts_api: Prompts,
  files_api: Files,
+ connectors_api: Connectors,
  vector_stores_config=None,
  ):
  self.inference_api = inference_api
@@ -100,6 +104,7 @@ class OpenAIResponsesImpl:
  )
  self.prompts_api = prompts_api
  self.files_api = files_api
+ self.connectors_api = connectors_api

  async def _prepend_previous_response(
  self,
@@ -150,7 +155,9 @@ class OpenAIResponsesImpl:

  tool_context.recover_tools_from_previous_response(previous_response)
  elif conversation is not None:
- conversation_items = await self.conversations_api.list_items(conversation, order="asc")
+ conversation_items = await self.conversations_api.list_items(
+ ListItemsRequest(conversation_id=conversation, order="asc")
+ )

  # Use stored messages as source of truth (like previous_response.messages)
  stored_messages = await self.responses_store.get_conversation_messages(conversation)
@@ -324,6 +331,125 @@ class OpenAIResponsesImpl:
  messages=messages,
  )

+ def _prepare_input_items_for_storage(
+ self,
+ input: str | list[OpenAIResponseInput],
+ ) -> list[OpenAIResponseInput]:
+ """Prepare input items for storage, adding IDs where needed.
+
+ This method is called once at the start of streaming to prepare input items
+ that will be reused across multiple persistence calls during streaming.
+ """
+ new_input_id = f"msg_{uuid.uuid4()}"
+ input_items_data: list[OpenAIResponseInput] = []
+
+ if isinstance(input, str):
+ input_content = OpenAIResponseInputMessageContentText(text=input)
+ input_content_item = OpenAIResponseMessage(
+ role="user",
+ content=[input_content],
+ id=new_input_id,
+ )
+ input_items_data = [input_content_item]
+ else:
+ for input_item in input:
+ if isinstance(input_item, OpenAIResponseMessage):
+ input_item_dict = input_item.model_dump()
+ if "id" not in input_item_dict:
+ input_item_dict["id"] = new_input_id
+ input_items_data.append(OpenAIResponseMessage(**input_item_dict))
+ else:
+ input_items_data.append(input_item)
+
+ return input_items_data
+
+ async def _persist_streaming_state(
+ self,
+ stream_chunk: OpenAIResponseObjectStream,
+ orchestrator,
+ input_items: list[OpenAIResponseInput],
+ output_items: list,
+ ) -> None:
+ """Persist response state at significant streaming events.
+
+ This enables clients to poll GET /v1/responses/{response_id} during streaming
+ to see in-progress turn state instead of empty results.
+
+ Persistence occurs at:
+ - response.in_progress: Initial INSERT with empty output
+ - response.output_item.done: UPDATE with accumulated output items
+ - response.completed/response.incomplete: Final UPDATE with complete state
+ - response.failed: UPDATE with error state
+
+ :param stream_chunk: The current streaming event.
+ :param orchestrator: The streaming orchestrator (for snapshotting response).
+ :param input_items: Pre-prepared input items for storage.
+ :param output_items: Accumulated output items so far.
+ """
+ try:
+ match stream_chunk.type:
+ case "response.in_progress":
+ # Initial persistence when response starts
+ in_progress_response = stream_chunk.response
+ await self.responses_store.upsert_response_object(
+ response_object=in_progress_response,
+ input=input_items,
+ messages=[],
+ )
+
+ case "response.output_item.done":
+ # Incremental update when an output item completes (tool call, message)
+ current_snapshot = orchestrator._snapshot_response(
+ status="in_progress",
+ outputs=output_items,
+ )
+ # Get current messages (filter out system messages)
+ messages_to_store = list(
+ filter(
+ lambda x: not isinstance(x, OpenAISystemMessageParam),
+ orchestrator.final_messages or orchestrator.ctx.messages,
+ )
+ )
+ await self.responses_store.upsert_response_object(
+ response_object=current_snapshot,
+ input=input_items,
+ messages=messages_to_store,
+ )
+
+ case "response.completed" | "response.incomplete":
+ # Final persistence when response finishes
+ final_response = stream_chunk.response
+ messages_to_store = list(
+ filter(
+ lambda x: not isinstance(x, OpenAISystemMessageParam),
+ orchestrator.final_messages,
+ )
+ )
+ await self.responses_store.upsert_response_object(
+ response_object=final_response,
+ input=input_items,
+ messages=messages_to_store,
+ )
+
+ case "response.failed":
+ # Persist failed state so GET shows error
+ failed_response = stream_chunk.response
+ # Preserve any accumulated non-system messages for failed responses
+ messages_to_store = list(
+ filter(
+ lambda x: not isinstance(x, OpenAISystemMessageParam),
+ orchestrator.final_messages or orchestrator.ctx.messages,
+ )
+ )
+ await self.responses_store.upsert_response_object(
+ response_object=failed_response,
+ input=input_items,
+ messages=messages_to_store,
+ )
+ except Exception as e:
+ # Best-effort persistence: log error but don't fail the stream
+ logger.warning(f"Failed to persist streaming state for {stream_chunk.type}: {e}")
+

  async def create_openai_response(
  self,
@@ -343,6 +469,9 @@ class OpenAIResponsesImpl:
  guardrails: list[str | ResponseGuardrailSpec] | None = None,
  parallel_tool_calls: bool | None = None,
  max_tool_calls: int | None = None,
+ reasoning: OpenAIResponseReasoning | None = None,
+ max_output_tokens: int | None = None,
+ safety_identifier: str | None = None,
  metadata: dict[str, str] | None = None,
  ):
  stream = bool(stream)
@@ -380,9 +509,6 @@ class OpenAIResponsesImpl:
  if not conversation.startswith("conv_"):
  raise InvalidConversationIdError(conversation)

- if max_tool_calls is not None and max_tool_calls < 1:
- raise ValueError(f"Invalid {max_tool_calls=}; should be >= 1")
-
  stream_gen = self._create_streaming_response(
  input=input,
  conversation=conversation,
@@ -399,6 +525,9 @@ class OpenAIResponsesImpl:
  guardrail_ids=guardrail_ids,
  parallel_tool_calls=parallel_tool_calls,
  max_tool_calls=max_tool_calls,
+ reasoning=reasoning,
+ max_output_tokens=max_output_tokens,
+ safety_identifier=safety_identifier,
  metadata=metadata,
  include=include,
  )
@@ -454,6 +583,9 @@ class OpenAIResponsesImpl:
  guardrail_ids: list[str] | None = None,
  parallel_tool_calls: bool | None = True,
  max_tool_calls: int | None = None,
+ reasoning: OpenAIResponseReasoning | None = None,
+ max_output_tokens: int | None = None,
+ safety_identifier: str | None = None,
  metadata: dict[str, str] | None = None,
  include: list[ResponseItemInclude] | None = None,
  ) -> AsyncIterator[OpenAIResponseObjectStream]:
@@ -493,42 +625,45 @@

  # Create a per-request MCP session manager for session reuse (fix for #4452)
  # This avoids redundant tools/list calls when making multiple MCP tool invocations
- mcp_session_manager = MCPSessionManager()
-
- # Create a per-request ToolExecutor with the session manager
- request_tool_executor = ToolExecutor(
- tool_groups_api=self.tool_groups_api,
- tool_runtime_api=self.tool_runtime_api,
- vector_io_api=self.vector_io_api,
- vector_stores_config=self.tool_executor.vector_stores_config,
- mcp_session_manager=mcp_session_manager,
- )
+ async with MCPSessionManager() as mcp_session_manager:
+ request_tool_executor = ToolExecutor(
+ tool_groups_api=self.tool_groups_api,
+ tool_runtime_api=self.tool_runtime_api,
+ vector_io_api=self.vector_io_api,
+ vector_stores_config=self.tool_executor.vector_stores_config,
+ mcp_session_manager=mcp_session_manager,
+ )

- orchestrator = StreamingResponseOrchestrator(
- inference_api=self.inference_api,
- ctx=ctx,
- response_id=response_id,
- created_at=created_at,
- prompt=prompt,
- text=text,
- max_infer_iters=max_infer_iters,
- parallel_tool_calls=parallel_tool_calls,
- tool_executor=request_tool_executor,
- safety_api=self.safety_api,
- guardrail_ids=guardrail_ids,
- instructions=instructions,
- max_tool_calls=max_tool_calls,
- metadata=metadata,
- include=include,
- )
+ orchestrator = StreamingResponseOrchestrator(
+ inference_api=self.inference_api,
+ ctx=ctx,
+ response_id=response_id,
+ created_at=created_at,
+ prompt=prompt,
+ text=text,
+ max_infer_iters=max_infer_iters,
+ parallel_tool_calls=parallel_tool_calls,
+ tool_executor=request_tool_executor,
+ safety_api=self.safety_api,
+ connectors_api=self.connectors_api,
+ guardrail_ids=guardrail_ids,
+ instructions=instructions,
+ max_tool_calls=max_tool_calls,
+ reasoning=reasoning,
+ max_output_tokens=max_output_tokens,
+ safety_identifier=safety_identifier,
+ metadata=metadata,
+ include=include,
+ store=store,
+ )

- # Stream the response
- final_response = None
- failed_response = None
+ final_response = None
+ failed_response = None
+
+ output_items: list[ConversationItem] = []
+
+ input_items_for_storage = self._prepare_input_items_for_storage(all_input)

- # Type as ConversationItem to avoid list invariance issues
- output_items: list[ConversationItem] = []
- try:
  async for stream_chunk in orchestrator.create_response():
  match stream_chunk.type:
  case "response.completed" | "response.incomplete":
@@ -541,6 +676,16 @@
  case _:
  pass # Other event types

+ # Incremental persistence: persist on significant state changes
+ # This enables clients to poll GET /v1/responses/{response_id} during streaming
+ if store:
+ await self._persist_streaming_state(
+ stream_chunk=stream_chunk,
+ orchestrator=orchestrator,
+ input_items=input_items_for_storage,
+ output_items=output_items,
+ )
+
  # Store and sync before yielding terminal events
  # This ensures the storage/syncing happens even if the consumer breaks after receiving the event
  if (
@@ -548,32 +693,14 @@
  and final_response
  and failed_response is None
  ):
- messages_to_store = list(
- filter(lambda x: not isinstance(x, OpenAISystemMessageParam), orchestrator.final_messages)
- )
- if store:
- # TODO: we really should work off of output_items instead of "final_messages"
- await self._store_response(
- response=final_response,
- input=all_input,
- messages=messages_to_store,
- )
-
  if conversation:
+ messages_to_store = list(
+ filter(lambda x: not isinstance(x, OpenAISystemMessageParam), orchestrator.final_messages)
+ )
  await self._sync_response_to_conversation(conversation, input, output_items)
  await self.responses_store.store_conversation_messages(conversation, messages_to_store)

  yield stream_chunk
- finally:
- # Clean up MCP sessions at the end of the request (fix for #4452)
- # Use shield() to prevent cancellation from interrupting cleanup and leaking resources
- # Wrap in try/except as cleanup errors should not mask the original response
- try:
- await asyncio.shield(mcp_session_manager.close_all())
- except BaseException as e:
- # Debug level - cleanup errors are expected in streaming scenarios where
- # anyio cancel scopes may be in a different task context
- logger.debug(f"Error during MCP session cleanup: {e}")

  async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
  return await self.responses_store.delete_response_object(response_id)
@@ -596,4 +723,4 @@

  adapter = TypeAdapter(list[ConversationItem])
  validated_items = adapter.validate_python(conversation_items)
- await self.conversations_api.add_items(conversation_id, validated_items)
+ await self.conversations_api.add_items(conversation_id, AddItemsRequest(items=validated_items))
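Note: the new _persist_streaming_state() hook above upserts the stored response at response.in_progress, response.output_item.done, and the terminal events, so a streaming turn created with store=True becomes visible mid-flight via GET /v1/responses/{response_id}; the same file also swaps the asyncio.shield(...close_all()) finally-block for an async with MCPSessionManager() scope, so MCP session cleanup is handled by the context manager. The sketch below is illustrative only, not part of the diff: it polls the retrieval endpoint while a streaming turn runs elsewhere, and it assumes a local server at http://localhost:8321 and OpenAI-Responses-style "status"/"output" fields in the returned JSON.

import asyncio

import httpx


async def poll_response(response_id: str, base_url: str = "http://localhost:8321") -> None:
    # Poll the stored snapshot that _persist_streaming_state() keeps up to date.
    async with httpx.AsyncClient(base_url=base_url) as client:
        while True:
            r = await client.get(f"/v1/responses/{response_id}")
            r.raise_for_status()
            body = r.json()
            # While streaming, status stays "in_progress" and the output list
            # grows each time a response.output_item.done event is persisted.
            print(body["status"], len(body.get("output", [])))
            if body["status"] in ("completed", "incomplete", "failed"):
                return
            await asyncio.sleep(1.0)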