llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (738)
  1. llama_stack/__init__.py +5 -0
  2. llama_stack/apis/agents/__init__.py +1 -1
  3. llama_stack/apis/agents/agents.py +700 -281
  4. llama_stack/apis/agents/openai_responses.py +1311 -0
  5. llama_stack/{providers/adapters/memory/sample/config.py → apis/batches/__init__.py} +2 -5
  6. llama_stack/apis/batches/batches.py +100 -0
  7. llama_stack/apis/benchmarks/__init__.py +7 -0
  8. llama_stack/apis/benchmarks/benchmarks.py +108 -0
  9. llama_stack/apis/common/content_types.py +143 -0
  10. llama_stack/apis/common/errors.py +103 -0
  11. llama_stack/apis/common/job_types.py +38 -0
  12. llama_stack/apis/common/responses.py +36 -0
  13. llama_stack/apis/common/training_types.py +36 -5
  14. llama_stack/apis/common/type_system.py +158 -0
  15. llama_stack/apis/conversations/__init__.py +31 -0
  16. llama_stack/apis/conversations/conversations.py +286 -0
  17. llama_stack/apis/datasetio/__init__.py +7 -0
  18. llama_stack/apis/datasetio/datasetio.py +59 -0
  19. llama_stack/apis/datasets/__init__.py +7 -0
  20. llama_stack/apis/datasets/datasets.py +251 -0
  21. llama_stack/apis/datatypes.py +160 -0
  22. llama_stack/apis/eval/__init__.py +7 -0
  23. llama_stack/apis/eval/eval.py +169 -0
  24. llama_stack/apis/files/__init__.py +7 -0
  25. llama_stack/apis/files/files.py +199 -0
  26. llama_stack/apis/inference/__init__.py +1 -1
  27. llama_stack/apis/inference/inference.py +1169 -113
  28. llama_stack/apis/inspect/__init__.py +1 -1
  29. llama_stack/apis/inspect/inspect.py +69 -16
  30. llama_stack/apis/models/__init__.py +1 -1
  31. llama_stack/apis/models/models.py +148 -21
  32. llama_stack/apis/post_training/__init__.py +1 -1
  33. llama_stack/apis/post_training/post_training.py +265 -120
  34. llama_stack/{providers/adapters/agents/sample/config.py → apis/prompts/__init__.py} +2 -5
  35. llama_stack/apis/prompts/prompts.py +204 -0
  36. llama_stack/apis/providers/__init__.py +7 -0
  37. llama_stack/apis/providers/providers.py +69 -0
  38. llama_stack/apis/resource.py +37 -0
  39. llama_stack/apis/safety/__init__.py +1 -1
  40. llama_stack/apis/safety/safety.py +95 -12
  41. llama_stack/apis/scoring/__init__.py +7 -0
  42. llama_stack/apis/scoring/scoring.py +93 -0
  43. llama_stack/apis/scoring_functions/__init__.py +7 -0
  44. llama_stack/apis/scoring_functions/scoring_functions.py +208 -0
  45. llama_stack/apis/shields/__init__.py +1 -1
  46. llama_stack/apis/shields/shields.py +76 -33
  47. llama_stack/apis/synthetic_data_generation/__init__.py +1 -1
  48. llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py +40 -17
  49. llama_stack/apis/telemetry/__init__.py +1 -1
  50. llama_stack/apis/telemetry/telemetry.py +322 -31
  51. llama_stack/apis/{dataset → tools}/__init__.py +2 -1
  52. llama_stack/apis/tools/rag_tool.py +218 -0
  53. llama_stack/apis/tools/tools.py +221 -0
  54. llama_stack/apis/vector_io/__init__.py +7 -0
  55. llama_stack/apis/vector_io/vector_io.py +960 -0
  56. llama_stack/apis/vector_stores/__init__.py +7 -0
  57. llama_stack/apis/vector_stores/vector_stores.py +51 -0
  58. llama_stack/apis/version.py +9 -0
  59. llama_stack/cli/llama.py +13 -5
  60. llama_stack/cli/stack/_list_deps.py +182 -0
  61. llama_stack/cli/stack/list_apis.py +1 -1
  62. llama_stack/cli/stack/list_deps.py +55 -0
  63. llama_stack/cli/stack/list_providers.py +24 -10
  64. llama_stack/cli/stack/list_stacks.py +56 -0
  65. llama_stack/cli/stack/remove.py +115 -0
  66. llama_stack/cli/stack/run.py +169 -56
  67. llama_stack/cli/stack/stack.py +18 -4
  68. llama_stack/cli/stack/utils.py +151 -0
  69. llama_stack/cli/table.py +23 -61
  70. llama_stack/cli/utils.py +29 -0
  71. llama_stack/core/access_control/access_control.py +131 -0
  72. llama_stack/core/access_control/conditions.py +129 -0
  73. llama_stack/core/access_control/datatypes.py +107 -0
  74. llama_stack/core/build.py +164 -0
  75. llama_stack/core/client.py +205 -0
  76. llama_stack/core/common.sh +37 -0
  77. llama_stack/{distribution → core}/configure.py +74 -55
  78. llama_stack/core/conversations/conversations.py +309 -0
  79. llama_stack/core/datatypes.py +625 -0
  80. llama_stack/core/distribution.py +276 -0
  81. llama_stack/core/external.py +54 -0
  82. llama_stack/core/id_generation.py +42 -0
  83. llama_stack/core/inspect.py +86 -0
  84. llama_stack/core/library_client.py +539 -0
  85. llama_stack/core/prompts/prompts.py +234 -0
  86. llama_stack/core/providers.py +137 -0
  87. llama_stack/core/request_headers.py +115 -0
  88. llama_stack/core/resolver.py +506 -0
  89. llama_stack/core/routers/__init__.py +101 -0
  90. llama_stack/core/routers/datasets.py +73 -0
  91. llama_stack/core/routers/eval_scoring.py +155 -0
  92. llama_stack/core/routers/inference.py +645 -0
  93. llama_stack/core/routers/safety.py +85 -0
  94. llama_stack/core/routers/tool_runtime.py +91 -0
  95. llama_stack/core/routers/vector_io.py +442 -0
  96. llama_stack/core/routing_tables/benchmarks.py +62 -0
  97. llama_stack/core/routing_tables/common.py +254 -0
  98. llama_stack/core/routing_tables/datasets.py +91 -0
  99. llama_stack/core/routing_tables/models.py +163 -0
  100. llama_stack/core/routing_tables/scoring_functions.py +66 -0
  101. llama_stack/core/routing_tables/shields.py +61 -0
  102. llama_stack/core/routing_tables/toolgroups.py +129 -0
  103. llama_stack/core/routing_tables/vector_stores.py +292 -0
  104. llama_stack/core/server/auth.py +187 -0
  105. llama_stack/core/server/auth_providers.py +494 -0
  106. llama_stack/core/server/quota.py +110 -0
  107. llama_stack/core/server/routes.py +141 -0
  108. llama_stack/core/server/server.py +542 -0
  109. llama_stack/core/server/tracing.py +80 -0
  110. llama_stack/core/stack.py +546 -0
  111. llama_stack/core/start_stack.sh +117 -0
  112. llama_stack/core/storage/datatypes.py +283 -0
  113. llama_stack/{cli/model → core/store}/__init__.py +1 -1
  114. llama_stack/core/store/registry.py +199 -0
  115. llama_stack/core/testing_context.py +49 -0
  116. llama_stack/core/ui/app.py +55 -0
  117. llama_stack/core/ui/modules/api.py +32 -0
  118. llama_stack/core/ui/modules/utils.py +42 -0
  119. llama_stack/core/ui/page/distribution/datasets.py +18 -0
  120. llama_stack/core/ui/page/distribution/eval_tasks.py +20 -0
  121. llama_stack/core/ui/page/distribution/models.py +18 -0
  122. llama_stack/core/ui/page/distribution/providers.py +27 -0
  123. llama_stack/core/ui/page/distribution/resources.py +48 -0
  124. llama_stack/core/ui/page/distribution/scoring_functions.py +18 -0
  125. llama_stack/core/ui/page/distribution/shields.py +19 -0
  126. llama_stack/core/ui/page/evaluations/app_eval.py +143 -0
  127. llama_stack/core/ui/page/evaluations/native_eval.py +253 -0
  128. llama_stack/core/ui/page/playground/chat.py +130 -0
  129. llama_stack/core/ui/page/playground/tools.py +352 -0
  130. llama_stack/core/utils/config.py +30 -0
  131. llama_stack/{distribution → core}/utils/config_dirs.py +3 -6
  132. llama_stack/core/utils/config_resolution.py +125 -0
  133. llama_stack/core/utils/context.py +84 -0
  134. llama_stack/core/utils/exec.py +96 -0
  135. llama_stack/{providers/impls/meta_reference/codeshield/config.py → core/utils/image_types.py} +4 -3
  136. llama_stack/{distribution → core}/utils/model_utils.py +2 -2
  137. llama_stack/{distribution → core}/utils/prompt_for_config.py +30 -63
  138. llama_stack/{apis/batch_inference → distributions/dell}/__init__.py +1 -1
  139. llama_stack/distributions/dell/build.yaml +33 -0
  140. llama_stack/distributions/dell/dell.py +158 -0
  141. llama_stack/distributions/dell/run-with-safety.yaml +141 -0
  142. llama_stack/distributions/dell/run.yaml +132 -0
  143. llama_stack/distributions/meta-reference-gpu/__init__.py +7 -0
  144. llama_stack/distributions/meta-reference-gpu/build.yaml +32 -0
  145. llama_stack/distributions/meta-reference-gpu/meta_reference.py +163 -0
  146. llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +154 -0
  147. llama_stack/distributions/meta-reference-gpu/run.yaml +139 -0
  148. llama_stack/{apis/evals → distributions/nvidia}/__init__.py +1 -1
  149. llama_stack/distributions/nvidia/build.yaml +29 -0
  150. llama_stack/distributions/nvidia/nvidia.py +154 -0
  151. llama_stack/distributions/nvidia/run-with-safety.yaml +137 -0
  152. llama_stack/distributions/nvidia/run.yaml +116 -0
  153. llama_stack/distributions/open-benchmark/__init__.py +7 -0
  154. llama_stack/distributions/open-benchmark/build.yaml +36 -0
  155. llama_stack/distributions/open-benchmark/open_benchmark.py +303 -0
  156. llama_stack/distributions/open-benchmark/run.yaml +252 -0
  157. llama_stack/distributions/postgres-demo/__init__.py +7 -0
  158. llama_stack/distributions/postgres-demo/build.yaml +23 -0
  159. llama_stack/distributions/postgres-demo/postgres_demo.py +125 -0
  160. llama_stack/distributions/postgres-demo/run.yaml +115 -0
  161. llama_stack/{apis/memory → distributions/starter}/__init__.py +1 -1
  162. llama_stack/distributions/starter/build.yaml +61 -0
  163. llama_stack/distributions/starter/run-with-postgres-store.yaml +285 -0
  164. llama_stack/distributions/starter/run.yaml +276 -0
  165. llama_stack/distributions/starter/starter.py +345 -0
  166. llama_stack/distributions/starter-gpu/__init__.py +7 -0
  167. llama_stack/distributions/starter-gpu/build.yaml +61 -0
  168. llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +288 -0
  169. llama_stack/distributions/starter-gpu/run.yaml +279 -0
  170. llama_stack/distributions/starter-gpu/starter_gpu.py +20 -0
  171. llama_stack/distributions/template.py +456 -0
  172. llama_stack/distributions/watsonx/__init__.py +7 -0
  173. llama_stack/distributions/watsonx/build.yaml +33 -0
  174. llama_stack/distributions/watsonx/run.yaml +133 -0
  175. llama_stack/distributions/watsonx/watsonx.py +95 -0
  176. llama_stack/env.py +24 -0
  177. llama_stack/log.py +314 -0
  178. llama_stack/models/llama/checkpoint.py +164 -0
  179. llama_stack/models/llama/datatypes.py +164 -0
  180. llama_stack/models/llama/hadamard_utils.py +86 -0
  181. llama_stack/models/llama/llama3/args.py +74 -0
  182. llama_stack/models/llama/llama3/chat_format.py +286 -0
  183. llama_stack/models/llama/llama3/generation.py +376 -0
  184. llama_stack/models/llama/llama3/interface.py +255 -0
  185. llama_stack/models/llama/llama3/model.py +304 -0
  186. llama_stack/models/llama/llama3/multimodal/__init__.py +12 -0
  187. llama_stack/models/llama/llama3/multimodal/encoder_utils.py +180 -0
  188. llama_stack/models/llama/llama3/multimodal/image_transform.py +409 -0
  189. llama_stack/models/llama/llama3/multimodal/model.py +1430 -0
  190. llama_stack/models/llama/llama3/multimodal/utils.py +26 -0
  191. llama_stack/models/llama/llama3/prompt_templates/__init__.py +22 -0
  192. llama_stack/models/llama/llama3/prompt_templates/base.py +39 -0
  193. llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +319 -0
  194. llama_stack/models/llama/llama3/prompt_templates/tool_response.py +62 -0
  195. llama_stack/models/llama/llama3/quantization/loader.py +316 -0
  196. llama_stack/models/llama/llama3/template_data.py +116 -0
  197. llama_stack/models/llama/llama3/tokenizer.model +128000 -0
  198. llama_stack/models/llama/llama3/tokenizer.py +198 -0
  199. llama_stack/models/llama/llama3/tool_utils.py +266 -0
  200. llama_stack/models/llama/llama3_1/__init__.py +12 -0
  201. llama_stack/models/llama/llama3_1/prompt_format.md +358 -0
  202. llama_stack/models/llama/llama3_1/prompts.py +258 -0
  203. llama_stack/models/llama/llama3_2/prompts_text.py +229 -0
  204. llama_stack/models/llama/llama3_2/prompts_vision.py +126 -0
  205. llama_stack/models/llama/llama3_2/text_prompt_format.md +286 -0
  206. llama_stack/models/llama/llama3_2/vision_prompt_format.md +141 -0
  207. llama_stack/models/llama/llama3_3/prompts.py +259 -0
  208. llama_stack/models/llama/llama4/args.py +107 -0
  209. llama_stack/models/llama/llama4/chat_format.py +317 -0
  210. llama_stack/models/llama/llama4/datatypes.py +56 -0
  211. llama_stack/models/llama/llama4/ffn.py +58 -0
  212. llama_stack/models/llama/llama4/generation.py +313 -0
  213. llama_stack/models/llama/llama4/model.py +437 -0
  214. llama_stack/models/llama/llama4/moe.py +214 -0
  215. llama_stack/models/llama/llama4/preprocess.py +435 -0
  216. llama_stack/models/llama/llama4/prompt_format.md +304 -0
  217. llama_stack/models/llama/llama4/prompt_templates/system_prompts.py +136 -0
  218. llama_stack/models/llama/llama4/prompts.py +279 -0
  219. llama_stack/models/llama/llama4/quantization/__init__.py +5 -0
  220. llama_stack/models/llama/llama4/quantization/loader.py +226 -0
  221. llama_stack/models/llama/llama4/tokenizer.model +200000 -0
  222. llama_stack/models/llama/llama4/tokenizer.py +263 -0
  223. llama_stack/models/llama/llama4/vision/__init__.py +5 -0
  224. llama_stack/models/llama/llama4/vision/embedding.py +210 -0
  225. llama_stack/models/llama/llama4/vision/encoder.py +412 -0
  226. llama_stack/models/llama/prompt_format.py +191 -0
  227. llama_stack/models/llama/quantize_impls.py +316 -0
  228. llama_stack/models/llama/sku_list.py +1029 -0
  229. llama_stack/models/llama/sku_types.py +233 -0
  230. llama_stack/models/llama/tokenizer_utils.py +40 -0
  231. llama_stack/providers/datatypes.py +136 -107
  232. llama_stack/providers/inline/__init__.py +5 -0
  233. llama_stack/providers/inline/agents/__init__.py +5 -0
  234. llama_stack/providers/{impls/meta_reference/agents → inline/agents/meta_reference}/__init__.py +12 -5
  235. llama_stack/providers/inline/agents/meta_reference/agent_instance.py +1024 -0
  236. llama_stack/providers/inline/agents/meta_reference/agents.py +383 -0
  237. llama_stack/providers/inline/agents/meta_reference/config.py +37 -0
  238. llama_stack/providers/inline/agents/meta_reference/persistence.py +228 -0
  239. llama_stack/providers/inline/agents/meta_reference/responses/__init__.py +5 -0
  240. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +423 -0
  241. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +1226 -0
  242. llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +449 -0
  243. llama_stack/providers/inline/agents/meta_reference/responses/types.py +194 -0
  244. llama_stack/providers/inline/agents/meta_reference/responses/utils.py +365 -0
  245. llama_stack/providers/inline/agents/meta_reference/safety.py +52 -0
  246. llama_stack/providers/inline/batches/__init__.py +5 -0
  247. llama_stack/providers/inline/batches/reference/__init__.py +36 -0
  248. llama_stack/providers/inline/batches/reference/batches.py +679 -0
  249. llama_stack/providers/inline/batches/reference/config.py +40 -0
  250. llama_stack/providers/inline/datasetio/__init__.py +5 -0
  251. llama_stack/providers/inline/datasetio/localfs/__init__.py +20 -0
  252. llama_stack/providers/inline/datasetio/localfs/config.py +23 -0
  253. llama_stack/providers/inline/datasetio/localfs/datasetio.py +113 -0
  254. llama_stack/providers/inline/eval/__init__.py +5 -0
  255. llama_stack/providers/inline/eval/meta_reference/__init__.py +28 -0
  256. llama_stack/providers/inline/eval/meta_reference/config.py +23 -0
  257. llama_stack/providers/inline/eval/meta_reference/eval.py +259 -0
  258. llama_stack/providers/inline/files/localfs/__init__.py +20 -0
  259. llama_stack/providers/inline/files/localfs/config.py +31 -0
  260. llama_stack/providers/inline/files/localfs/files.py +219 -0
  261. llama_stack/providers/inline/inference/__init__.py +5 -0
  262. llama_stack/providers/{impls/meta_reference/inference → inline/inference/meta_reference}/__init__.py +4 -4
  263. llama_stack/providers/inline/inference/meta_reference/common.py +24 -0
  264. llama_stack/providers/inline/inference/meta_reference/config.py +68 -0
  265. llama_stack/providers/inline/inference/meta_reference/generators.py +211 -0
  266. llama_stack/providers/inline/inference/meta_reference/inference.py +158 -0
  267. llama_stack/providers/inline/inference/meta_reference/model_parallel.py +96 -0
  268. llama_stack/providers/{impls/meta_reference/inference → inline/inference/meta_reference}/parallel_utils.py +56 -73
  269. llama_stack/providers/inline/inference/sentence_transformers/__init__.py +22 -0
  270. llama_stack/providers/{impls/meta_reference/agents → inline/inference/sentence_transformers}/config.py +6 -4
  271. llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +83 -0
  272. llama_stack/providers/inline/post_training/__init__.py +5 -0
  273. llama_stack/providers/inline/post_training/common/__init__.py +5 -0
  274. llama_stack/providers/inline/post_training/common/utils.py +35 -0
  275. llama_stack/providers/inline/post_training/common/validator.py +36 -0
  276. llama_stack/providers/inline/post_training/huggingface/__init__.py +27 -0
  277. llama_stack/providers/inline/post_training/huggingface/config.py +83 -0
  278. llama_stack/providers/inline/post_training/huggingface/post_training.py +208 -0
  279. llama_stack/providers/inline/post_training/huggingface/recipes/__init__.py +5 -0
  280. llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py +519 -0
  281. llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device_dpo.py +485 -0
  282. llama_stack/providers/inline/post_training/huggingface/utils.py +269 -0
  283. llama_stack/providers/inline/post_training/torchtune/__init__.py +27 -0
  284. llama_stack/providers/inline/post_training/torchtune/common/__init__.py +5 -0
  285. llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py +240 -0
  286. llama_stack/providers/inline/post_training/torchtune/common/utils.py +99 -0
  287. llama_stack/providers/inline/post_training/torchtune/config.py +20 -0
  288. llama_stack/providers/inline/post_training/torchtune/datasets/__init__.py +5 -0
  289. llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py +57 -0
  290. llama_stack/providers/inline/post_training/torchtune/datasets/sft.py +78 -0
  291. llama_stack/providers/inline/post_training/torchtune/post_training.py +178 -0
  292. llama_stack/providers/inline/post_training/torchtune/recipes/__init__.py +5 -0
  293. llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +588 -0
  294. llama_stack/providers/inline/safety/__init__.py +5 -0
  295. llama_stack/providers/{impls/meta_reference/codeshield → inline/safety/code_scanner}/__init__.py +4 -2
  296. llama_stack/providers/inline/safety/code_scanner/code_scanner.py +128 -0
  297. llama_stack/providers/{impls/meta_reference/memory → inline/safety/code_scanner}/config.py +5 -3
  298. llama_stack/providers/inline/safety/llama_guard/__init__.py +19 -0
  299. llama_stack/providers/inline/safety/llama_guard/config.py +19 -0
  300. llama_stack/providers/inline/safety/llama_guard/llama_guard.py +489 -0
  301. llama_stack/providers/{adapters/memory/sample → inline/safety/prompt_guard}/__init__.py +4 -4
  302. llama_stack/providers/inline/safety/prompt_guard/config.py +32 -0
  303. llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +131 -0
  304. llama_stack/providers/inline/scoring/__init__.py +5 -0
  305. llama_stack/providers/inline/scoring/basic/__init__.py +25 -0
  306. llama_stack/providers/{adapters/memory/weaviate → inline/scoring/basic}/config.py +5 -7
  307. llama_stack/providers/inline/scoring/basic/scoring.py +126 -0
  308. llama_stack/providers/inline/scoring/basic/scoring_fn/__init__.py +5 -0
  309. llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py +240 -0
  310. llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py +41 -0
  311. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/__init__.py +5 -0
  312. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py +21 -0
  313. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py +21 -0
  314. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py +23 -0
  315. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py +27 -0
  316. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py +71 -0
  317. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py +21 -0
  318. llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py +80 -0
  319. llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py +66 -0
  320. llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py +58 -0
  321. llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py +38 -0
  322. llama_stack/providers/inline/scoring/basic/utils/__init__.py +5 -0
  323. llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py +3319 -0
  324. llama_stack/providers/inline/scoring/basic/utils/math_utils.py +330 -0
  325. llama_stack/providers/inline/scoring/braintrust/__init__.py +27 -0
  326. llama_stack/providers/inline/scoring/braintrust/braintrust.py +230 -0
  327. llama_stack/providers/inline/scoring/braintrust/config.py +21 -0
  328. llama_stack/providers/inline/scoring/braintrust/scoring_fn/__init__.py +5 -0
  329. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/__init__.py +5 -0
  330. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py +24 -0
  331. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py +24 -0
  332. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py +24 -0
  333. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py +24 -0
  334. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py +24 -0
  335. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py +24 -0
  336. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py +23 -0
  337. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py +24 -0
  338. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py +24 -0
  339. llama_stack/providers/inline/scoring/llm_as_judge/__init__.py +21 -0
  340. llama_stack/providers/inline/scoring/llm_as_judge/config.py +14 -0
  341. llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +113 -0
  342. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/__init__.py +5 -0
  343. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/__init__.py +5 -0
  344. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py +96 -0
  345. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py +20 -0
  346. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py +81 -0
  347. llama_stack/providers/inline/telemetry/__init__.py +5 -0
  348. llama_stack/providers/inline/telemetry/meta_reference/__init__.py +21 -0
  349. llama_stack/providers/inline/telemetry/meta_reference/config.py +47 -0
  350. llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +252 -0
  351. llama_stack/providers/inline/tool_runtime/__init__.py +5 -0
  352. llama_stack/providers/inline/tool_runtime/rag/__init__.py +19 -0
  353. llama_stack/providers/{impls/meta_reference/telemetry → inline/tool_runtime/rag}/config.py +5 -3
  354. llama_stack/providers/inline/tool_runtime/rag/context_retriever.py +77 -0
  355. llama_stack/providers/inline/tool_runtime/rag/memory.py +332 -0
  356. llama_stack/providers/inline/vector_io/__init__.py +5 -0
  357. llama_stack/providers/inline/vector_io/chroma/__init__.py +19 -0
  358. llama_stack/providers/inline/vector_io/chroma/config.py +30 -0
  359. llama_stack/providers/inline/vector_io/faiss/__init__.py +21 -0
  360. llama_stack/providers/inline/vector_io/faiss/config.py +26 -0
  361. llama_stack/providers/inline/vector_io/faiss/faiss.py +293 -0
  362. llama_stack/providers/inline/vector_io/milvus/__init__.py +19 -0
  363. llama_stack/providers/inline/vector_io/milvus/config.py +29 -0
  364. llama_stack/providers/inline/vector_io/qdrant/__init__.py +20 -0
  365. llama_stack/providers/inline/vector_io/qdrant/config.py +29 -0
  366. llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py +20 -0
  367. llama_stack/providers/inline/vector_io/sqlite_vec/config.py +26 -0
  368. llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +483 -0
  369. llama_stack/providers/registry/agents.py +16 -18
  370. llama_stack/providers/registry/batches.py +26 -0
  371. llama_stack/providers/registry/datasetio.py +49 -0
  372. llama_stack/providers/registry/eval.py +46 -0
  373. llama_stack/providers/registry/files.py +31 -0
  374. llama_stack/providers/registry/inference.py +273 -118
  375. llama_stack/providers/registry/post_training.py +69 -0
  376. llama_stack/providers/registry/safety.py +46 -41
  377. llama_stack/providers/registry/scoring.py +51 -0
  378. llama_stack/providers/registry/tool_runtime.py +87 -0
  379. llama_stack/providers/registry/vector_io.py +828 -0
  380. llama_stack/providers/remote/__init__.py +5 -0
  381. llama_stack/providers/remote/agents/__init__.py +5 -0
  382. llama_stack/providers/remote/datasetio/__init__.py +5 -0
  383. llama_stack/providers/{adapters/memory/chroma → remote/datasetio/huggingface}/__init__.py +7 -4
  384. llama_stack/providers/remote/datasetio/huggingface/config.py +23 -0
  385. llama_stack/providers/remote/datasetio/huggingface/huggingface.py +99 -0
  386. llama_stack/providers/remote/datasetio/nvidia/__init__.py +23 -0
  387. llama_stack/providers/remote/datasetio/nvidia/config.py +61 -0
  388. llama_stack/providers/remote/datasetio/nvidia/datasetio.py +116 -0
  389. llama_stack/providers/remote/eval/__init__.py +5 -0
  390. llama_stack/providers/remote/eval/nvidia/__init__.py +31 -0
  391. llama_stack/providers/remote/eval/nvidia/config.py +29 -0
  392. llama_stack/providers/remote/eval/nvidia/eval.py +162 -0
  393. llama_stack/providers/remote/files/s3/__init__.py +19 -0
  394. llama_stack/providers/remote/files/s3/config.py +42 -0
  395. llama_stack/providers/remote/files/s3/files.py +313 -0
  396. llama_stack/providers/remote/inference/__init__.py +5 -0
  397. llama_stack/providers/{adapters/safety/sample → remote/inference/anthropic}/__init__.py +4 -6
  398. llama_stack/providers/remote/inference/anthropic/anthropic.py +36 -0
  399. llama_stack/providers/remote/inference/anthropic/config.py +28 -0
  400. llama_stack/providers/{impls/meta_reference/telemetry → remote/inference/azure}/__init__.py +4 -4
  401. llama_stack/providers/remote/inference/azure/azure.py +25 -0
  402. llama_stack/providers/remote/inference/azure/config.py +61 -0
  403. llama_stack/providers/{adapters → remote}/inference/bedrock/__init__.py +18 -17
  404. llama_stack/providers/remote/inference/bedrock/bedrock.py +142 -0
  405. llama_stack/providers/{adapters/inference/sample → remote/inference/bedrock}/config.py +3 -4
  406. llama_stack/providers/remote/inference/bedrock/models.py +29 -0
  407. llama_stack/providers/remote/inference/cerebras/__init__.py +19 -0
  408. llama_stack/providers/remote/inference/cerebras/cerebras.py +28 -0
  409. llama_stack/providers/remote/inference/cerebras/config.py +30 -0
  410. llama_stack/providers/{adapters → remote}/inference/databricks/__init__.py +4 -5
  411. llama_stack/providers/remote/inference/databricks/config.py +37 -0
  412. llama_stack/providers/remote/inference/databricks/databricks.py +44 -0
  413. llama_stack/providers/{adapters → remote}/inference/fireworks/__init__.py +8 -4
  414. llama_stack/providers/remote/inference/fireworks/config.py +27 -0
  415. llama_stack/providers/remote/inference/fireworks/fireworks.py +27 -0
  416. llama_stack/providers/{adapters/memory/pgvector → remote/inference/gemini}/__init__.py +4 -4
  417. llama_stack/providers/remote/inference/gemini/config.py +28 -0
  418. llama_stack/providers/remote/inference/gemini/gemini.py +82 -0
  419. llama_stack/providers/remote/inference/groq/__init__.py +15 -0
  420. llama_stack/providers/remote/inference/groq/config.py +34 -0
  421. llama_stack/providers/remote/inference/groq/groq.py +18 -0
  422. llama_stack/providers/remote/inference/llama_openai_compat/__init__.py +15 -0
  423. llama_stack/providers/remote/inference/llama_openai_compat/config.py +34 -0
  424. llama_stack/providers/remote/inference/llama_openai_compat/llama.py +46 -0
  425. llama_stack/providers/remote/inference/nvidia/__init__.py +23 -0
  426. llama_stack/providers/remote/inference/nvidia/config.py +64 -0
  427. llama_stack/providers/remote/inference/nvidia/nvidia.py +61 -0
  428. llama_stack/providers/{adapters/safety/sample/config.py → remote/inference/nvidia/utils.py} +3 -4
  429. llama_stack/providers/{impls/vllm → remote/inference/ollama}/__init__.py +4 -6
  430. llama_stack/providers/remote/inference/ollama/config.py +25 -0
  431. llama_stack/providers/remote/inference/ollama/ollama.py +102 -0
  432. llama_stack/providers/{adapters/telemetry/opentelemetry → remote/inference/openai}/__init__.py +4 -4
  433. llama_stack/providers/remote/inference/openai/config.py +39 -0
  434. llama_stack/providers/remote/inference/openai/openai.py +38 -0
  435. llama_stack/providers/remote/inference/passthrough/__init__.py +23 -0
  436. llama_stack/providers/remote/inference/passthrough/config.py +34 -0
  437. llama_stack/providers/remote/inference/passthrough/passthrough.py +122 -0
  438. llama_stack/providers/remote/inference/runpod/__init__.py +16 -0
  439. llama_stack/providers/remote/inference/runpod/config.py +32 -0
  440. llama_stack/providers/remote/inference/runpod/runpod.py +42 -0
  441. llama_stack/providers/remote/inference/sambanova/__init__.py +16 -0
  442. llama_stack/providers/remote/inference/sambanova/config.py +34 -0
  443. llama_stack/providers/remote/inference/sambanova/sambanova.py +28 -0
  444. llama_stack/providers/{adapters → remote}/inference/tgi/__init__.py +3 -4
  445. llama_stack/providers/remote/inference/tgi/config.py +76 -0
  446. llama_stack/providers/remote/inference/tgi/tgi.py +85 -0
  447. llama_stack/providers/{adapters → remote}/inference/together/__init__.py +8 -4
  448. llama_stack/providers/remote/inference/together/config.py +27 -0
  449. llama_stack/providers/remote/inference/together/together.py +102 -0
  450. llama_stack/providers/remote/inference/vertexai/__init__.py +15 -0
  451. llama_stack/providers/remote/inference/vertexai/config.py +48 -0
  452. llama_stack/providers/remote/inference/vertexai/vertexai.py +54 -0
  453. llama_stack/providers/remote/inference/vllm/__init__.py +22 -0
  454. llama_stack/providers/remote/inference/vllm/config.py +59 -0
  455. llama_stack/providers/remote/inference/vllm/vllm.py +111 -0
  456. llama_stack/providers/remote/inference/watsonx/__init__.py +15 -0
  457. llama_stack/providers/remote/inference/watsonx/config.py +45 -0
  458. llama_stack/providers/remote/inference/watsonx/watsonx.py +336 -0
  459. llama_stack/providers/remote/post_training/__init__.py +5 -0
  460. llama_stack/providers/remote/post_training/nvidia/__init__.py +23 -0
  461. llama_stack/providers/remote/post_training/nvidia/config.py +113 -0
  462. llama_stack/providers/remote/post_training/nvidia/models.py +27 -0
  463. llama_stack/providers/remote/post_training/nvidia/post_training.py +430 -0
  464. llama_stack/providers/remote/post_training/nvidia/utils.py +63 -0
  465. llama_stack/providers/remote/safety/__init__.py +5 -0
  466. llama_stack/providers/remote/safety/bedrock/bedrock.py +111 -0
  467. llama_stack/providers/remote/safety/bedrock/config.py +14 -0
  468. llama_stack/providers/{adapters/inference/sample → remote/safety/nvidia}/__init__.py +5 -4
  469. llama_stack/providers/remote/safety/nvidia/config.py +40 -0
  470. llama_stack/providers/remote/safety/nvidia/nvidia.py +161 -0
  471. llama_stack/providers/{adapters/agents/sample → remote/safety/sambanova}/__init__.py +5 -4
  472. llama_stack/providers/remote/safety/sambanova/config.py +37 -0
  473. llama_stack/providers/remote/safety/sambanova/sambanova.py +98 -0
  474. llama_stack/providers/remote/tool_runtime/__init__.py +5 -0
  475. llama_stack/providers/remote/tool_runtime/bing_search/__init__.py +21 -0
  476. llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py +112 -0
  477. llama_stack/providers/remote/tool_runtime/bing_search/config.py +22 -0
  478. llama_stack/providers/remote/tool_runtime/brave_search/__init__.py +20 -0
  479. llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py +148 -0
  480. llama_stack/providers/remote/tool_runtime/brave_search/config.py +27 -0
  481. llama_stack/providers/remote/tool_runtime/model_context_protocol/__init__.py +15 -0
  482. llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py +20 -0
  483. llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py +73 -0
  484. llama_stack/providers/remote/tool_runtime/tavily_search/__init__.py +20 -0
  485. llama_stack/providers/remote/tool_runtime/tavily_search/config.py +27 -0
  486. llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py +84 -0
  487. llama_stack/providers/remote/tool_runtime/wolfram_alpha/__init__.py +22 -0
  488. llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py +21 -0
  489. llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py +140 -0
  490. llama_stack/providers/remote/vector_io/__init__.py +5 -0
  491. llama_stack/providers/remote/vector_io/chroma/__init__.py +17 -0
  492. llama_stack/providers/remote/vector_io/chroma/chroma.py +215 -0
  493. llama_stack/providers/remote/vector_io/chroma/config.py +28 -0
  494. llama_stack/providers/remote/vector_io/milvus/__init__.py +18 -0
  495. llama_stack/providers/remote/vector_io/milvus/config.py +35 -0
  496. llama_stack/providers/remote/vector_io/milvus/milvus.py +375 -0
  497. llama_stack/providers/remote/vector_io/pgvector/__init__.py +17 -0
  498. llama_stack/providers/remote/vector_io/pgvector/config.py +47 -0
  499. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +460 -0
  500. llama_stack/providers/remote/vector_io/qdrant/__init__.py +17 -0
  501. llama_stack/providers/remote/vector_io/qdrant/config.py +37 -0
  502. llama_stack/providers/remote/vector_io/qdrant/qdrant.py +265 -0
  503. llama_stack/providers/remote/vector_io/weaviate/__init__.py +17 -0
  504. llama_stack/providers/remote/vector_io/weaviate/config.py +32 -0
  505. llama_stack/providers/remote/vector_io/weaviate/weaviate.py +393 -0
  506. llama_stack/providers/utils/bedrock/__init__.py +5 -0
  507. llama_stack/providers/utils/bedrock/client.py +74 -0
  508. llama_stack/providers/utils/bedrock/config.py +64 -0
  509. llama_stack/providers/utils/bedrock/refreshable_boto_session.py +112 -0
  510. llama_stack/providers/utils/common/__init__.py +5 -0
  511. llama_stack/providers/utils/common/data_schema_validator.py +103 -0
  512. llama_stack/providers/utils/datasetio/__init__.py +5 -0
  513. llama_stack/providers/utils/datasetio/url_utils.py +47 -0
  514. llama_stack/providers/utils/files/__init__.py +5 -0
  515. llama_stack/providers/utils/files/form_data.py +69 -0
  516. llama_stack/providers/utils/inference/__init__.py +8 -7
  517. llama_stack/providers/utils/inference/embedding_mixin.py +101 -0
  518. llama_stack/providers/utils/inference/inference_store.py +264 -0
  519. llama_stack/providers/utils/inference/litellm_openai_mixin.py +336 -0
  520. llama_stack/providers/utils/inference/model_registry.py +173 -23
  521. llama_stack/providers/utils/inference/openai_compat.py +1261 -49
  522. llama_stack/providers/utils/inference/openai_mixin.py +506 -0
  523. llama_stack/providers/utils/inference/prompt_adapter.py +365 -67
  524. llama_stack/providers/utils/kvstore/api.py +6 -6
  525. llama_stack/providers/utils/kvstore/config.py +28 -48
  526. llama_stack/providers/utils/kvstore/kvstore.py +61 -15
  527. llama_stack/providers/utils/kvstore/mongodb/__init__.py +9 -0
  528. llama_stack/providers/utils/kvstore/mongodb/mongodb.py +82 -0
  529. llama_stack/providers/utils/kvstore/postgres/__init__.py +7 -0
  530. llama_stack/providers/utils/kvstore/postgres/postgres.py +114 -0
  531. llama_stack/providers/utils/kvstore/redis/redis.py +33 -9
  532. llama_stack/providers/utils/kvstore/sqlite/config.py +2 -1
  533. llama_stack/providers/utils/kvstore/sqlite/sqlite.py +123 -22
  534. llama_stack/providers/utils/memory/file_utils.py +1 -1
  535. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +1304 -0
  536. llama_stack/providers/utils/memory/vector_store.py +220 -82
  537. llama_stack/providers/utils/pagination.py +43 -0
  538. llama_stack/providers/utils/responses/__init__.py +5 -0
  539. llama_stack/providers/utils/responses/responses_store.py +292 -0
  540. llama_stack/providers/utils/scheduler.py +270 -0
  541. llama_stack/providers/utils/scoring/__init__.py +5 -0
  542. llama_stack/providers/utils/scoring/aggregation_utils.py +75 -0
  543. llama_stack/providers/utils/scoring/base_scoring_fn.py +114 -0
  544. llama_stack/providers/utils/scoring/basic_scoring_utils.py +26 -0
  545. llama_stack/providers/utils/sqlstore/__init__.py +5 -0
  546. llama_stack/providers/utils/sqlstore/api.py +128 -0
  547. llama_stack/providers/utils/sqlstore/authorized_sqlstore.py +319 -0
  548. llama_stack/providers/utils/sqlstore/sqlalchemy_sqlstore.py +343 -0
  549. llama_stack/providers/utils/sqlstore/sqlstore.py +70 -0
  550. llama_stack/providers/utils/telemetry/trace_protocol.py +142 -0
  551. llama_stack/providers/utils/telemetry/tracing.py +192 -53
  552. llama_stack/providers/utils/tools/__init__.py +5 -0
  553. llama_stack/providers/utils/tools/mcp.py +148 -0
  554. llama_stack/providers/utils/tools/ttl_dict.py +70 -0
  555. llama_stack/providers/utils/vector_io/__init__.py +5 -0
  556. llama_stack/providers/utils/vector_io/vector_utils.py +156 -0
  557. llama_stack/schema_utils.py +118 -0
  558. llama_stack/strong_typing/__init__.py +19 -0
  559. llama_stack/strong_typing/auxiliary.py +228 -0
  560. llama_stack/strong_typing/classdef.py +440 -0
  561. llama_stack/strong_typing/core.py +46 -0
  562. llama_stack/strong_typing/deserializer.py +877 -0
  563. llama_stack/strong_typing/docstring.py +409 -0
  564. llama_stack/strong_typing/exception.py +23 -0
  565. llama_stack/strong_typing/inspection.py +1085 -0
  566. llama_stack/strong_typing/mapping.py +40 -0
  567. llama_stack/strong_typing/name.py +182 -0
  568. llama_stack/strong_typing/py.typed +0 -0
  569. llama_stack/strong_typing/schema.py +792 -0
  570. llama_stack/strong_typing/serialization.py +97 -0
  571. llama_stack/strong_typing/serializer.py +500 -0
  572. llama_stack/strong_typing/slots.py +27 -0
  573. llama_stack/strong_typing/topological.py +89 -0
  574. llama_stack/testing/__init__.py +5 -0
  575. llama_stack/testing/api_recorder.py +956 -0
  576. llama_stack/ui/node_modules/flatted/python/flatted.py +149 -0
  577. llama_stack-0.3.4.dist-info/METADATA +261 -0
  578. llama_stack-0.3.4.dist-info/RECORD +625 -0
  579. {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/WHEEL +1 -1
  580. llama_stack/apis/agents/client.py +0 -292
  581. llama_stack/apis/agents/event_logger.py +0 -184
  582. llama_stack/apis/batch_inference/batch_inference.py +0 -72
  583. llama_stack/apis/common/deployment_types.py +0 -31
  584. llama_stack/apis/dataset/dataset.py +0 -63
  585. llama_stack/apis/evals/evals.py +0 -122
  586. llama_stack/apis/inference/client.py +0 -197
  587. llama_stack/apis/inspect/client.py +0 -82
  588. llama_stack/apis/memory/client.py +0 -155
  589. llama_stack/apis/memory/memory.py +0 -65
  590. llama_stack/apis/memory_banks/__init__.py +0 -7
  591. llama_stack/apis/memory_banks/client.py +0 -101
  592. llama_stack/apis/memory_banks/memory_banks.py +0 -78
  593. llama_stack/apis/models/client.py +0 -83
  594. llama_stack/apis/reward_scoring/__init__.py +0 -7
  595. llama_stack/apis/reward_scoring/reward_scoring.py +0 -55
  596. llama_stack/apis/safety/client.py +0 -105
  597. llama_stack/apis/shields/client.py +0 -79
  598. llama_stack/cli/download.py +0 -340
  599. llama_stack/cli/model/describe.py +0 -82
  600. llama_stack/cli/model/download.py +0 -24
  601. llama_stack/cli/model/list.py +0 -62
  602. llama_stack/cli/model/model.py +0 -34
  603. llama_stack/cli/model/prompt_format.py +0 -112
  604. llama_stack/cli/model/safety_models.py +0 -52
  605. llama_stack/cli/stack/build.py +0 -299
  606. llama_stack/cli/stack/configure.py +0 -178
  607. llama_stack/distribution/build.py +0 -123
  608. llama_stack/distribution/build_conda_env.sh +0 -136
  609. llama_stack/distribution/build_container.sh +0 -142
  610. llama_stack/distribution/common.sh +0 -40
  611. llama_stack/distribution/configure_container.sh +0 -47
  612. llama_stack/distribution/datatypes.py +0 -139
  613. llama_stack/distribution/distribution.py +0 -58
  614. llama_stack/distribution/inspect.py +0 -67
  615. llama_stack/distribution/request_headers.py +0 -57
  616. llama_stack/distribution/resolver.py +0 -323
  617. llama_stack/distribution/routers/__init__.py +0 -48
  618. llama_stack/distribution/routers/routers.py +0 -158
  619. llama_stack/distribution/routers/routing_tables.py +0 -173
  620. llama_stack/distribution/server/endpoints.py +0 -48
  621. llama_stack/distribution/server/server.py +0 -343
  622. llama_stack/distribution/start_conda_env.sh +0 -42
  623. llama_stack/distribution/start_container.sh +0 -64
  624. llama_stack/distribution/templates/local-bedrock-conda-example-build.yaml +0 -10
  625. llama_stack/distribution/templates/local-build.yaml +0 -10
  626. llama_stack/distribution/templates/local-databricks-build.yaml +0 -10
  627. llama_stack/distribution/templates/local-fireworks-build.yaml +0 -10
  628. llama_stack/distribution/templates/local-hf-endpoint-build.yaml +0 -10
  629. llama_stack/distribution/templates/local-hf-serverless-build.yaml +0 -10
  630. llama_stack/distribution/templates/local-ollama-build.yaml +0 -10
  631. llama_stack/distribution/templates/local-tgi-build.yaml +0 -10
  632. llama_stack/distribution/templates/local-together-build.yaml +0 -10
  633. llama_stack/distribution/templates/local-vllm-build.yaml +0 -10
  634. llama_stack/distribution/utils/exec.py +0 -105
  635. llama_stack/providers/adapters/agents/sample/sample.py +0 -18
  636. llama_stack/providers/adapters/inference/bedrock/bedrock.py +0 -451
  637. llama_stack/providers/adapters/inference/bedrock/config.py +0 -55
  638. llama_stack/providers/adapters/inference/databricks/config.py +0 -21
  639. llama_stack/providers/adapters/inference/databricks/databricks.py +0 -125
  640. llama_stack/providers/adapters/inference/fireworks/config.py +0 -20
  641. llama_stack/providers/adapters/inference/fireworks/fireworks.py +0 -130
  642. llama_stack/providers/adapters/inference/ollama/__init__.py +0 -19
  643. llama_stack/providers/adapters/inference/ollama/ollama.py +0 -175
  644. llama_stack/providers/adapters/inference/sample/sample.py +0 -23
  645. llama_stack/providers/adapters/inference/tgi/config.py +0 -43
  646. llama_stack/providers/adapters/inference/tgi/tgi.py +0 -200
  647. llama_stack/providers/adapters/inference/together/config.py +0 -22
  648. llama_stack/providers/adapters/inference/together/together.py +0 -143
  649. llama_stack/providers/adapters/memory/chroma/chroma.py +0 -157
  650. llama_stack/providers/adapters/memory/pgvector/config.py +0 -17
  651. llama_stack/providers/adapters/memory/pgvector/pgvector.py +0 -211
  652. llama_stack/providers/adapters/memory/sample/sample.py +0 -23
  653. llama_stack/providers/adapters/memory/weaviate/__init__.py +0 -15
  654. llama_stack/providers/adapters/memory/weaviate/weaviate.py +0 -190
  655. llama_stack/providers/adapters/safety/bedrock/bedrock.py +0 -113
  656. llama_stack/providers/adapters/safety/bedrock/config.py +0 -16
  657. llama_stack/providers/adapters/safety/sample/sample.py +0 -23
  658. llama_stack/providers/adapters/safety/together/__init__.py +0 -18
  659. llama_stack/providers/adapters/safety/together/config.py +0 -26
  660. llama_stack/providers/adapters/safety/together/together.py +0 -101
  661. llama_stack/providers/adapters/telemetry/opentelemetry/config.py +0 -12
  662. llama_stack/providers/adapters/telemetry/opentelemetry/opentelemetry.py +0 -201
  663. llama_stack/providers/adapters/telemetry/sample/__init__.py +0 -17
  664. llama_stack/providers/adapters/telemetry/sample/config.py +0 -12
  665. llama_stack/providers/adapters/telemetry/sample/sample.py +0 -18
  666. llama_stack/providers/impls/meta_reference/agents/agent_instance.py +0 -844
  667. llama_stack/providers/impls/meta_reference/agents/agents.py +0 -161
  668. llama_stack/providers/impls/meta_reference/agents/persistence.py +0 -84
  669. llama_stack/providers/impls/meta_reference/agents/rag/context_retriever.py +0 -74
  670. llama_stack/providers/impls/meta_reference/agents/safety.py +0 -57
  671. llama_stack/providers/impls/meta_reference/agents/tests/code_execution.py +0 -93
  672. llama_stack/providers/impls/meta_reference/agents/tests/test_chat_agent.py +0 -305
  673. llama_stack/providers/impls/meta_reference/agents/tools/base.py +0 -20
  674. llama_stack/providers/impls/meta_reference/agents/tools/builtin.py +0 -375
  675. llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/code_env_prefix.py +0 -133
  676. llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/code_execution.py +0 -256
  677. llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/matplotlib_custom_backend.py +0 -87
  678. llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/utils.py +0 -21
  679. llama_stack/providers/impls/meta_reference/agents/tools/safety.py +0 -43
  680. llama_stack/providers/impls/meta_reference/codeshield/code_scanner.py +0 -58
  681. llama_stack/providers/impls/meta_reference/inference/config.py +0 -45
  682. llama_stack/providers/impls/meta_reference/inference/generation.py +0 -376
  683. llama_stack/providers/impls/meta_reference/inference/inference.py +0 -280
  684. llama_stack/providers/impls/meta_reference/inference/model_parallel.py +0 -99
  685. llama_stack/providers/impls/meta_reference/inference/quantization/fp8_impls.py +0 -184
  686. llama_stack/providers/impls/meta_reference/inference/quantization/fp8_txest_disabled.py +0 -76
  687. llama_stack/providers/impls/meta_reference/inference/quantization/loader.py +0 -97
  688. llama_stack/providers/impls/meta_reference/inference/quantization/scripts/quantize_checkpoint.py +0 -161
  689. llama_stack/providers/impls/meta_reference/memory/__init__.py +0 -19
  690. llama_stack/providers/impls/meta_reference/memory/faiss.py +0 -113
  691. llama_stack/providers/impls/meta_reference/safety/__init__.py +0 -17
  692. llama_stack/providers/impls/meta_reference/safety/base.py +0 -57
  693. llama_stack/providers/impls/meta_reference/safety/config.py +0 -48
  694. llama_stack/providers/impls/meta_reference/safety/llama_guard.py +0 -268
  695. llama_stack/providers/impls/meta_reference/safety/prompt_guard.py +0 -145
  696. llama_stack/providers/impls/meta_reference/safety/safety.py +0 -112
  697. llama_stack/providers/impls/meta_reference/telemetry/console.py +0 -89
  698. llama_stack/providers/impls/vllm/config.py +0 -35
  699. llama_stack/providers/impls/vllm/vllm.py +0 -241
  700. llama_stack/providers/registry/memory.py +0 -78
  701. llama_stack/providers/registry/telemetry.py +0 -44
  702. llama_stack/providers/tests/agents/test_agents.py +0 -210
  703. llama_stack/providers/tests/inference/test_inference.py +0 -257
  704. llama_stack/providers/tests/inference/test_prompt_adapter.py +0 -126
  705. llama_stack/providers/tests/memory/test_memory.py +0 -136
  706. llama_stack/providers/tests/resolver.py +0 -100
  707. llama_stack/providers/tests/safety/test_safety.py +0 -77
  708. llama_stack-0.0.42.dist-info/METADATA +0 -137
  709. llama_stack-0.0.42.dist-info/RECORD +0 -256
  710. /llama_stack/{distribution → core}/__init__.py +0 -0
  711. /llama_stack/{distribution/server → core/access_control}/__init__.py +0 -0
  712. /llama_stack/{distribution/utils → core/conversations}/__init__.py +0 -0
  713. /llama_stack/{providers/adapters → core/prompts}/__init__.py +0 -0
  714. /llama_stack/{providers/adapters/agents → core/routing_tables}/__init__.py +0 -0
  715. /llama_stack/{providers/adapters/inference → core/server}/__init__.py +0 -0
  716. /llama_stack/{providers/adapters/memory → core/storage}/__init__.py +0 -0
  717. /llama_stack/{providers/adapters/safety → core/ui}/__init__.py +0 -0
  718. /llama_stack/{providers/adapters/telemetry → core/ui/modules}/__init__.py +0 -0
  719. /llama_stack/{providers/impls → core/ui/page}/__init__.py +0 -0
  720. /llama_stack/{providers/impls/meta_reference → core/ui/page/distribution}/__init__.py +0 -0
  721. /llama_stack/{providers/impls/meta_reference/agents/rag → core/ui/page/evaluations}/__init__.py +0 -0
  722. /llama_stack/{providers/impls/meta_reference/agents/tests → core/ui/page/playground}/__init__.py +0 -0
  723. /llama_stack/{providers/impls/meta_reference/agents/tools → core/utils}/__init__.py +0 -0
  724. /llama_stack/{distribution → core}/utils/dynamic.py +0 -0
  725. /llama_stack/{distribution → core}/utils/serialize.py +0 -0
  726. /llama_stack/{providers/impls/meta_reference/agents/tools/ipython_tool → distributions}/__init__.py +0 -0
  727. /llama_stack/{providers/impls/meta_reference/inference/quantization → models}/__init__.py +0 -0
  728. /llama_stack/{providers/impls/meta_reference/inference/quantization/scripts → models/llama}/__init__.py +0 -0
  729. /llama_stack/{providers/tests → models/llama/llama3}/__init__.py +0 -0
  730. /llama_stack/{providers/tests/agents → models/llama/llama3/quantization}/__init__.py +0 -0
  731. /llama_stack/{providers/tests/inference → models/llama/llama3_2}/__init__.py +0 -0
  732. /llama_stack/{providers/tests/memory → models/llama/llama3_3}/__init__.py +0 -0
  733. /llama_stack/{providers/tests/safety → models/llama/llama4}/__init__.py +0 -0
  734. /llama_stack/{scripts → models/llama/llama4/prompt_templates}/__init__.py +0 -0
  735. /llama_stack/providers/{adapters → remote}/safety/bedrock/__init__.py +0 -0
  736. {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/entry_points.txt +0 -0
  737. {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info/licenses}/LICENSE +0 -0
  738. {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/top_level.txt +0 -0
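Note: most of the renames above fall into three top-level moves: llama_stack/distribution/* → llama_stack/core/*, llama_stack/providers/adapters/* → llama_stack/providers/remote/*, and llama_stack/providers/impls/* → llama_stack/providers/inline/* (provider subpackages were also reordered, e.g. impls/meta_reference/inference → inline/inference/meta_reference). The sketch below is illustrative only and not part of the package; TOP_LEVEL_MOVES and rough_new_path are hypothetical helpers showing roughly how downstream import paths shift under these moves.

# Illustrative only: top-level package moves visible in the rename entries above.
# Deeper provider paths were also reshuffled, so a prefix rewrite is only a first pass.
TOP_LEVEL_MOVES = {
    "llama_stack.distribution": "llama_stack.core",
    "llama_stack.providers.adapters": "llama_stack.providers.remote",
    "llama_stack.providers.impls": "llama_stack.providers.inline",
}

def rough_new_path(old_module: str) -> str:
    """Best-effort translation of a 0.0.42 module path to its 0.3.4 top-level location."""
    for old_prefix, new_prefix in TOP_LEVEL_MOVES.items():
        if old_module == old_prefix or old_module.startswith(old_prefix + "."):
            return new_prefix + old_module[len(old_prefix):]
    return old_module

# Example (matches the config_dirs.py rename in the list above):
print(rough_new_path("llama_stack.distribution.utils.config_dirs"))
# -> llama_stack.core.utils.config_dirs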
@@ -0,0 +1,1226 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the terms described in the LICENSE file in
5
+ # the root directory of this source tree.
6
+
7
+ import uuid
8
+ from collections.abc import AsyncIterator
9
+ from typing import Any
10
+
11
+ from llama_stack.apis.agents.openai_responses import (
12
+ AllowedToolsFilter,
13
+ ApprovalFilter,
14
+ MCPListToolsTool,
15
+ OpenAIResponseContentPartOutputText,
16
+ OpenAIResponseContentPartReasoningText,
17
+ OpenAIResponseContentPartRefusal,
18
+ OpenAIResponseError,
19
+ OpenAIResponseInputTool,
20
+ OpenAIResponseInputToolMCP,
21
+ OpenAIResponseMCPApprovalRequest,
22
+ OpenAIResponseMessage,
23
+ OpenAIResponseObject,
24
+ OpenAIResponseObjectStream,
25
+ OpenAIResponseObjectStreamResponseCompleted,
26
+ OpenAIResponseObjectStreamResponseContentPartAdded,
27
+ OpenAIResponseObjectStreamResponseContentPartDone,
28
+ OpenAIResponseObjectStreamResponseCreated,
29
+ OpenAIResponseObjectStreamResponseFailed,
30
+ OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta,
31
+ OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone,
32
+ OpenAIResponseObjectStreamResponseIncomplete,
33
+ OpenAIResponseObjectStreamResponseInProgress,
34
+ OpenAIResponseObjectStreamResponseMcpCallArgumentsDelta,
35
+ OpenAIResponseObjectStreamResponseMcpCallArgumentsDone,
36
+ OpenAIResponseObjectStreamResponseMcpListToolsCompleted,
37
+ OpenAIResponseObjectStreamResponseMcpListToolsInProgress,
38
+ OpenAIResponseObjectStreamResponseOutputItemAdded,
39
+ OpenAIResponseObjectStreamResponseOutputItemDone,
40
+ OpenAIResponseObjectStreamResponseOutputTextDelta,
41
+ OpenAIResponseObjectStreamResponseReasoningTextDelta,
42
+ OpenAIResponseObjectStreamResponseReasoningTextDone,
43
+ OpenAIResponseObjectStreamResponseRefusalDelta,
44
+ OpenAIResponseObjectStreamResponseRefusalDone,
45
+ OpenAIResponseOutput,
46
+ OpenAIResponseOutputMessageContentOutputText,
47
+ OpenAIResponseOutputMessageFileSearchToolCall,
48
+ OpenAIResponseOutputMessageFunctionToolCall,
49
+ OpenAIResponseOutputMessageMCPCall,
50
+ OpenAIResponseOutputMessageMCPListTools,
51
+ OpenAIResponseOutputMessageWebSearchToolCall,
52
+ OpenAIResponseText,
53
+ OpenAIResponseUsage,
54
+ OpenAIResponseUsageInputTokensDetails,
55
+ OpenAIResponseUsageOutputTokensDetails,
56
+ WebSearchToolTypes,
57
+ )
58
+ from llama_stack.apis.inference import (
59
+ Inference,
60
+ OpenAIAssistantMessageParam,
61
+ OpenAIChatCompletion,
62
+ OpenAIChatCompletionChunk,
63
+ OpenAIChatCompletionRequestWithExtraBody,
64
+ OpenAIChatCompletionToolCall,
65
+ OpenAIChoice,
66
+ OpenAIMessageParam,
67
+ )
68
+ from llama_stack.log import get_logger
69
+ from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
70
+ from llama_stack.providers.utils.telemetry import tracing
71
+
72
+ from .types import ChatCompletionContext, ChatCompletionResult
73
+ from .utils import (
74
+ convert_chat_choice_to_response_message,
75
+ is_function_tool_call,
76
+ run_guardrails,
77
+ )
78
+
79
+ logger = get_logger(name=__name__, category="agents::meta_reference")
80
+
81
+
82
+ def convert_tooldef_to_chat_tool(tool_def):
83
+ """Convert a ToolDef to OpenAI ChatCompletionToolParam format.
84
+
85
+ Args:
86
+ tool_def: ToolDef from the tools API
87
+
88
+ Returns:
89
+ ChatCompletionToolParam suitable for OpenAI chat completion
90
+ """
91
+
92
+ from llama_stack.models.llama.datatypes import ToolDefinition
93
+ from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
94
+
95
+ internal_tool_def = ToolDefinition(
96
+ tool_name=tool_def.name,
97
+ description=tool_def.description,
98
+ input_schema=tool_def.input_schema,
99
+ )
100
+ return convert_tooldef_to_openai_tool(internal_tool_def)
101
+
102
+
103
+ class StreamingResponseOrchestrator:
104
+ def __init__(
105
+ self,
106
+ inference_api: Inference,
107
+ ctx: ChatCompletionContext,
108
+ response_id: str,
109
+ created_at: int,
110
+ text: OpenAIResponseText,
111
+ max_infer_iters: int,
112
+ tool_executor, # Will be the tool execution logic from the main class
113
+ instructions: str,
114
+ safety_api,
115
+ guardrail_ids: list[str] | None = None,
116
+ ):
117
+ self.inference_api = inference_api
118
+ self.ctx = ctx
119
+ self.response_id = response_id
120
+ self.created_at = created_at
121
+ self.text = text
122
+ self.max_infer_iters = max_infer_iters
123
+ self.tool_executor = tool_executor
124
+ self.safety_api = safety_api
125
+ self.guardrail_ids = guardrail_ids or []
126
+ self.sequence_number = 0
127
+ # Store MCP tool mapping that gets built during tool processing
128
+ self.mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] = ctx.tool_context.previous_tools or {}
129
+ # Track final messages after all tool executions
130
+ self.final_messages: list[OpenAIMessageParam] = []
131
+ # mapping for annotations
132
+ self.citation_files: dict[str, str] = {}
133
+ # Track accumulated usage across all inference calls
134
+ self.accumulated_usage: OpenAIResponseUsage | None = None
135
+ # Track if we've sent a refusal response
136
+ self.violation_detected = False
137
+ # system message that is inserted into the model's context
138
+ self.instructions = instructions
139
+
140
+ async def _create_refusal_response(self, violation_message: str) -> OpenAIResponseObjectStream:
141
+ """Create a refusal response to replace streaming content."""
142
+ refusal_content = OpenAIResponseContentPartRefusal(refusal=violation_message)
143
+
144
+ # Create a completed refusal response
145
+ refusal_response = OpenAIResponseObject(
146
+ id=self.response_id,
147
+ created_at=self.created_at,
148
+ model=self.ctx.model,
149
+ status="completed",
150
+ output=[OpenAIResponseMessage(role="assistant", content=[refusal_content], type="message")],
151
+ )
152
+
153
+ return OpenAIResponseObjectStreamResponseCompleted(response=refusal_response)
154
+
155
+ def _clone_outputs(self, outputs: list[OpenAIResponseOutput]) -> list[OpenAIResponseOutput]:
156
+ cloned: list[OpenAIResponseOutput] = []
157
+ for item in outputs:
158
+ if hasattr(item, "model_copy"):
159
+ cloned.append(item.model_copy(deep=True))
160
+ else:
161
+ cloned.append(item)
162
+ return cloned
163
+
164
+ def _snapshot_response(
165
+ self,
166
+ status: str,
167
+ outputs: list[OpenAIResponseOutput],
168
+ *,
169
+ error: OpenAIResponseError | None = None,
170
+ ) -> OpenAIResponseObject:
171
+ return OpenAIResponseObject(
172
+ created_at=self.created_at,
173
+ id=self.response_id,
174
+ model=self.ctx.model,
175
+ object="response",
176
+ status=status,
177
+ output=self._clone_outputs(outputs),
178
+ text=self.text,
179
+ tools=self.ctx.available_tools(),
180
+ error=error,
181
+ usage=self.accumulated_usage,
182
+ instructions=self.instructions,
183
+ )
184
+
185
+ async def create_response(self) -> AsyncIterator[OpenAIResponseObjectStream]:
186
+ output_messages: list[OpenAIResponseOutput] = []
187
+
188
+ # Emit response.created followed by response.in_progress to align with OpenAI streaming
189
+ yield OpenAIResponseObjectStreamResponseCreated(
190
+ response=self._snapshot_response("in_progress", output_messages)
191
+ )
192
+
193
+ self.sequence_number += 1
194
+ yield OpenAIResponseObjectStreamResponseInProgress(
195
+ response=self._snapshot_response("in_progress", output_messages),
196
+ sequence_number=self.sequence_number,
197
+ )
198
+
199
+ # Input safety validation - check messages before processing
200
+ if self.guardrail_ids:
201
+ combined_text = interleaved_content_as_str([msg.content for msg in self.ctx.messages])
202
+ input_violation_message = await run_guardrails(self.safety_api, combined_text, self.guardrail_ids)
203
+ if input_violation_message:
204
+ logger.info(f"Input guardrail violation: {input_violation_message}")
205
+ yield await self._create_refusal_response(input_violation_message)
206
+ return
207
+
208
+ async for stream_event in self._process_tools(output_messages):
209
+ yield stream_event
210
+
211
+ n_iter = 0
212
+ messages = self.ctx.messages.copy()
213
+ final_status = "completed"
214
+ last_completion_result: ChatCompletionResult | None = None
215
+
216
+ try:
217
+ while True:
218
+ # Text is the default response format for chat completion, so we don't need to pass it
219
+ # (some providers don't support non-empty response_format when tools are present)
220
+ response_format = None if self.ctx.response_format.type == "text" else self.ctx.response_format
221
+ logger.debug(f"calling openai_chat_completion with tools: {self.ctx.chat_tools}")
222
+
223
+ params = OpenAIChatCompletionRequestWithExtraBody(
224
+ model=self.ctx.model,
225
+ messages=messages,
226
+ tools=self.ctx.chat_tools,
227
+ stream=True,
228
+ temperature=self.ctx.temperature,
229
+ response_format=response_format,
230
+ stream_options={
231
+ "include_usage": True,
232
+ },
233
+ )
234
+ completion_result = await self.inference_api.openai_chat_completion(params)
235
+
236
+ # Process streaming chunks and build complete response
237
+ completion_result_data = None
238
+ async for stream_event_or_result in self._process_streaming_chunks(completion_result, output_messages):
239
+ if isinstance(stream_event_or_result, ChatCompletionResult):
240
+ completion_result_data = stream_event_or_result
241
+ else:
242
+ yield stream_event_or_result
243
+
244
+ # If violation detected, skip the rest of processing since we already sent refusal
245
+ if self.violation_detected:
246
+ return
247
+
248
+ if not completion_result_data:
249
+ raise ValueError("Streaming chunk processor failed to return completion data")
250
+ last_completion_result = completion_result_data
251
+ current_response = self._build_chat_completion(completion_result_data)
252
+
253
+ (
254
+ function_tool_calls,
255
+ non_function_tool_calls,
256
+ approvals,
257
+ next_turn_messages,
258
+ ) = self._separate_tool_calls(current_response, messages)
259
+
260
+ # Add any approval requests that are required
261
+ for tool_call in approvals:
262
+ async for evt in self._add_mcp_approval_request(
263
+ tool_call.function.name, tool_call.function.arguments, output_messages
264
+ ):
265
+ yield evt
266
+
267
+ # Handle choices with no tool calls
268
+ for choice in current_response.choices:
269
+ if not (choice.message.tool_calls and self.ctx.response_tools):
270
+ output_messages.append(
271
+ await convert_chat_choice_to_response_message(
272
+ choice,
273
+ self.citation_files,
274
+ message_id=completion_result_data.message_item_id,
275
+ )
276
+ )
277
+
278
+ # Execute tool calls and coordinate results
279
+ async for stream_event in self._coordinate_tool_execution(
280
+ function_tool_calls,
281
+ non_function_tool_calls,
282
+ completion_result_data,
283
+ output_messages,
284
+ next_turn_messages,
285
+ ):
286
+ yield stream_event
287
+
288
+ messages = next_turn_messages
289
+
290
+ if not function_tool_calls and not non_function_tool_calls:
291
+ break
292
+
293
+ if function_tool_calls:
294
+ logger.info("Exiting inference loop since there is a function (client-side) tool call")
295
+ break
296
+
297
+ n_iter += 1
298
+ if n_iter >= self.max_infer_iters:
299
+ logger.info(
300
+ f"Exiting inference loop since iteration count({n_iter}) exceeds {self.max_infer_iters=}"
301
+ )
302
+ final_status = "incomplete"
303
+ break
304
+
305
+ if last_completion_result and last_completion_result.finish_reason == "length":
306
+ final_status = "incomplete"
307
+
308
+ except Exception as exc: # noqa: BLE001
309
+ self.final_messages = messages.copy()
310
+ self.sequence_number += 1
311
+ error = OpenAIResponseError(code="internal_error", message=str(exc))
312
+ failure_response = self._snapshot_response("failed", output_messages, error=error)
313
+ yield OpenAIResponseObjectStreamResponseFailed(
314
+ response=failure_response,
315
+ sequence_number=self.sequence_number,
316
+ )
317
+ return
318
+
319
+ self.final_messages = messages.copy()
320
+
321
+ if final_status == "incomplete":
322
+ self.sequence_number += 1
323
+ final_response = self._snapshot_response("incomplete", output_messages)
324
+ yield OpenAIResponseObjectStreamResponseIncomplete(
325
+ response=final_response,
326
+ sequence_number=self.sequence_number,
327
+ )
328
+ else:
329
+ final_response = self._snapshot_response("completed", output_messages)
330
+ yield OpenAIResponseObjectStreamResponseCompleted(response=final_response)
331
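+ # Event ordering note for create_response(): response.created, then
+ # response.in_progress, then the per-turn output_item/content_part/delta events,
+ # and finally exactly one terminal event: response.completed,
+ # response.incomplete (iteration limit or length finish), or response.failed
+ # (exception path).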
+
332
+ def _separate_tool_calls(self, current_response, messages) -> tuple[list, list, list, list]:
333
+ """Separate tool calls into function and non-function categories."""
334
+ function_tool_calls = []
335
+ non_function_tool_calls = []
336
+ approvals = []
337
+ next_turn_messages = messages.copy()
338
+
339
+ for choice in current_response.choices:
340
+ next_turn_messages.append(choice.message)
341
+ logger.debug(f"Choice message content: {choice.message.content}")
342
+ logger.debug(f"Choice message tool_calls: {choice.message.tool_calls}")
343
+
344
+ if choice.message.tool_calls and self.ctx.response_tools:
345
+ for tool_call in choice.message.tool_calls:
346
+ if is_function_tool_call(tool_call, self.ctx.response_tools):
347
+ function_tool_calls.append(tool_call)
348
+ else:
349
+ if self._approval_required(tool_call.function.name):
350
+ approval_response = self.ctx.approval_response(
351
+ tool_call.function.name, tool_call.function.arguments
352
+ )
353
+ if approval_response:
354
+ if approval_response.approve:
355
+ logger.info(f"Approval granted for {tool_call.id} on {tool_call.function.name}")
356
+ non_function_tool_calls.append(tool_call)
357
+ else:
358
+ logger.info(f"Approval denied for {tool_call.id} on {tool_call.function.name}")
359
+ next_turn_messages.pop()
360
+ else:
361
+ logger.info(f"Requesting approval for {tool_call.id} on {tool_call.function.name}")
362
+ approvals.append(tool_call)
363
+ next_turn_messages.pop()
364
+ else:
365
+ non_function_tool_calls.append(tool_call)
366
+
367
+ return function_tool_calls, non_function_tool_calls, approvals, next_turn_messages
368
+
369
+ def _accumulate_chunk_usage(self, chunk: OpenAIChatCompletionChunk) -> None:
370
+ """Accumulate usage from a streaming chunk into the response usage format."""
371
+ if not chunk.usage:
372
+ return
373
+
374
+ if self.accumulated_usage is None:
375
+ # Convert from chat completion format to response format
376
+ self.accumulated_usage = OpenAIResponseUsage(
377
+ input_tokens=chunk.usage.prompt_tokens,
378
+ output_tokens=chunk.usage.completion_tokens,
379
+ total_tokens=chunk.usage.total_tokens,
380
+ input_tokens_details=(
381
+ OpenAIResponseUsageInputTokensDetails(cached_tokens=chunk.usage.prompt_tokens_details.cached_tokens)
382
+ if chunk.usage.prompt_tokens_details
383
+ else None
384
+ ),
385
+ output_tokens_details=(
386
+ OpenAIResponseUsageOutputTokensDetails(
387
+ reasoning_tokens=chunk.usage.completion_tokens_details.reasoning_tokens
388
+ )
389
+ if chunk.usage.completion_tokens_details
390
+ else None
391
+ ),
392
+ )
393
+ else:
394
+ # Accumulate across multiple inference calls
395
+ self.accumulated_usage = OpenAIResponseUsage(
396
+ input_tokens=self.accumulated_usage.input_tokens + chunk.usage.prompt_tokens,
397
+ output_tokens=self.accumulated_usage.output_tokens + chunk.usage.completion_tokens,
398
+ total_tokens=self.accumulated_usage.total_tokens + chunk.usage.total_tokens,
399
+ # Use latest non-null details
400
+ input_tokens_details=(
401
+ OpenAIResponseUsageInputTokensDetails(cached_tokens=chunk.usage.prompt_tokens_details.cached_tokens)
402
+ if chunk.usage.prompt_tokens_details
403
+ else self.accumulated_usage.input_tokens_details
404
+ ),
405
+ output_tokens_details=(
406
+ OpenAIResponseUsageOutputTokensDetails(
407
+ reasoning_tokens=chunk.usage.completion_tokens_details.reasoning_tokens
408
+ )
409
+ if chunk.usage.completion_tokens_details
410
+ else self.accumulated_usage.output_tokens_details
411
+ ),
412
+ )
413
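+ # Worked example of the accumulation above: a first inference call reporting
+ # prompt_tokens=100 / completion_tokens=20 / total_tokens=120, followed by a
+ # tool-call turn reporting 130 / 15 / 145, accumulates to input_tokens=230,
+ # output_tokens=35, total_tokens=265; the *_tokens_details fields keep the
+ # latest non-null values seen.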
+
414
+ async def _handle_reasoning_content_chunk(
415
+ self,
416
+ reasoning_content: str,
417
+ reasoning_part_emitted: bool,
418
+ reasoning_content_index: int,
419
+ message_item_id: str,
420
+ message_output_index: int,
421
+ ) -> AsyncIterator[OpenAIResponseObjectStream]:
422
+ # Emit content_part.added event for first reasoning chunk
423
+ if not reasoning_part_emitted:
424
+ self.sequence_number += 1
425
+ yield OpenAIResponseObjectStreamResponseContentPartAdded(
426
+ content_index=reasoning_content_index,
427
+ response_id=self.response_id,
428
+ item_id=message_item_id,
429
+ output_index=message_output_index,
430
+ part=OpenAIResponseContentPartReasoningText(
431
+ text="", # Will be filled incrementally via reasoning deltas
432
+ ),
433
+ sequence_number=self.sequence_number,
434
+ )
435
+ # Emit reasoning_text.delta event
436
+ self.sequence_number += 1
437
+ yield OpenAIResponseObjectStreamResponseReasoningTextDelta(
438
+ content_index=reasoning_content_index,
439
+ delta=reasoning_content,
440
+ item_id=message_item_id,
441
+ output_index=message_output_index,
442
+ sequence_number=self.sequence_number,
443
+ )
444
+
445
+ async def _handle_refusal_content_chunk(
446
+ self,
447
+ refusal_content: str,
448
+ refusal_part_emitted: bool,
449
+ refusal_content_index: int,
450
+ message_item_id: str,
451
+ message_output_index: int,
452
+ ) -> AsyncIterator[OpenAIResponseObjectStream]:
453
+ # Emit content_part.added event for first refusal chunk
454
+ if not refusal_part_emitted:
455
+ self.sequence_number += 1
456
+ yield OpenAIResponseObjectStreamResponseContentPartAdded(
457
+ content_index=refusal_content_index,
458
+ response_id=self.response_id,
459
+ item_id=message_item_id,
460
+ output_index=message_output_index,
461
+ part=OpenAIResponseContentPartRefusal(
462
+ refusal="", # Will be filled incrementally via refusal deltas
463
+ ),
464
+ sequence_number=self.sequence_number,
465
+ )
466
+ # Emit refusal.delta event
467
+ self.sequence_number += 1
468
+ yield OpenAIResponseObjectStreamResponseRefusalDelta(
469
+ content_index=refusal_content_index,
470
+ delta=refusal_content,
471
+ item_id=message_item_id,
472
+ output_index=message_output_index,
473
+ sequence_number=self.sequence_number,
474
+ )
475
+
476
+ async def _emit_reasoning_done_events(
477
+ self,
478
+ reasoning_text_accumulated: list[str],
479
+ reasoning_content_index: int,
480
+ message_item_id: str,
481
+ message_output_index: int,
482
+ ) -> AsyncIterator[OpenAIResponseObjectStream]:
483
+ final_reasoning_text = "".join(reasoning_text_accumulated)
484
+ # Emit reasoning_text.done event
485
+ self.sequence_number += 1
486
+ yield OpenAIResponseObjectStreamResponseReasoningTextDone(
487
+ content_index=reasoning_content_index,
488
+ text=final_reasoning_text,
489
+ item_id=message_item_id,
490
+ output_index=message_output_index,
491
+ sequence_number=self.sequence_number,
492
+ )
493
+ # Emit content_part.done for reasoning
494
+ self.sequence_number += 1
495
+ yield OpenAIResponseObjectStreamResponseContentPartDone(
496
+ content_index=reasoning_content_index,
497
+ response_id=self.response_id,
498
+ item_id=message_item_id,
499
+ output_index=message_output_index,
500
+ part=OpenAIResponseContentPartReasoningText(
501
+ text=final_reasoning_text,
502
+ ),
503
+ sequence_number=self.sequence_number,
504
+ )
505
+
506
+ async def _emit_refusal_done_events(
507
+ self,
508
+ refusal_text_accumulated: list[str],
509
+ refusal_content_index: int,
510
+ message_item_id: str,
511
+ message_output_index: int,
512
+ ) -> AsyncIterator[OpenAIResponseObjectStream]:
513
+ final_refusal_text = "".join(refusal_text_accumulated)
514
+ # Emit refusal.done event
515
+ self.sequence_number += 1
516
+ yield OpenAIResponseObjectStreamResponseRefusalDone(
517
+ content_index=refusal_content_index,
518
+ refusal=final_refusal_text,
519
+ item_id=message_item_id,
520
+ output_index=message_output_index,
521
+ sequence_number=self.sequence_number,
522
+ )
523
+ # Emit content_part.done for refusal
524
+ self.sequence_number += 1
525
+ yield OpenAIResponseObjectStreamResponseContentPartDone(
526
+ content_index=refusal_content_index,
527
+ response_id=self.response_id,
528
+ item_id=message_item_id,
529
+ output_index=message_output_index,
530
+ part=OpenAIResponseContentPartRefusal(
531
+ refusal=final_refusal_text,
532
+ ),
533
+ sequence_number=self.sequence_number,
534
+ )
535
+
536
+ async def _process_streaming_chunks(
537
+ self, completion_result, output_messages: list[OpenAIResponseOutput]
538
+ ) -> AsyncIterator[OpenAIResponseObjectStream | ChatCompletionResult]:
539
+ """Process streaming chunks and emit events, returning completion data."""
540
+ # Initialize result tracking
541
+ chat_response_id = ""
542
+ chat_response_content = []
543
+ chat_response_tool_calls: dict[int, OpenAIChatCompletionToolCall] = {}
544
+ chunk_created = 0
545
+ chunk_model = ""
546
+ chunk_finish_reason = ""
547
+
548
+ # Create a placeholder message item for delta events
549
+ message_item_id = f"msg_{uuid.uuid4()}"
550
+ # Track tool call items for streaming events
551
+ tool_call_item_ids: dict[int, str] = {}
552
+ # Track content parts for streaming events
553
+ message_item_added_emitted = False
554
+ content_part_emitted = False
555
+ reasoning_part_emitted = False
556
+ refusal_part_emitted = False
557
+ content_index = 0
558
+ reasoning_content_index = 1 # reasoning is a separate content part
559
+ refusal_content_index = 2 # refusal is a separate content part
560
+ message_output_index = len(output_messages)
561
+ reasoning_text_accumulated = []
562
+ refusal_text_accumulated = []
563
+
564
+ async for chunk in completion_result:
565
+ chat_response_id = chunk.id
566
+ chunk_created = chunk.created
567
+ chunk_model = chunk.model
568
+
569
+ # Accumulate usage from chunks (typically in final chunk with stream_options)
570
+ self._accumulate_chunk_usage(chunk)
571
+
572
+ # Track deltas for this specific chunk for guardrail validation
573
+ chunk_events: list[OpenAIResponseObjectStream] = []
574
+
575
+ for chunk_choice in chunk.choices:
576
+ # Emit incremental text content as delta events
577
+ if chunk_choice.delta.content:
578
+ # Emit output_item.added for the message on first content
579
+ if not message_item_added_emitted:
580
+ message_item_added_emitted = True
581
+ self.sequence_number += 1
582
+ message_item = OpenAIResponseMessage(
583
+ id=message_item_id,
584
+ content=[],
585
+ role="assistant",
586
+ status="in_progress",
587
+ )
588
+ yield OpenAIResponseObjectStreamResponseOutputItemAdded(
589
+ response_id=self.response_id,
590
+ item=message_item,
591
+ output_index=message_output_index,
592
+ sequence_number=self.sequence_number,
593
+ )
594
+
595
+ # Emit content_part.added event for first text chunk
596
+ if not content_part_emitted:
597
+ content_part_emitted = True
598
+ self.sequence_number += 1
599
+ yield OpenAIResponseObjectStreamResponseContentPartAdded(
600
+ content_index=content_index,
601
+ response_id=self.response_id,
602
+ item_id=message_item_id,
603
+ output_index=message_output_index,
604
+ part=OpenAIResponseContentPartOutputText(
605
+ text="", # Will be filled incrementally via text deltas
606
+ ),
607
+ sequence_number=self.sequence_number,
608
+ )
609
+ self.sequence_number += 1
610
+
611
+ text_delta_event = OpenAIResponseObjectStreamResponseOutputTextDelta(
612
+ content_index=content_index,
613
+ delta=chunk_choice.delta.content,
614
+ item_id=message_item_id,
615
+ output_index=message_output_index,
616
+ sequence_number=self.sequence_number,
617
+ )
618
+ # Buffer text delta events for guardrail check
619
+ if self.guardrail_ids:
620
+ chunk_events.append(text_delta_event)
621
+ else:
622
+ yield text_delta_event
623
+
624
+ # Collect content for final response
625
+ chat_response_content.append(chunk_choice.delta.content or "")
626
+ if chunk_choice.finish_reason:
627
+ chunk_finish_reason = chunk_choice.finish_reason
628
+
629
+ # Handle reasoning content if present (non-standard field for o1/o3 models)
630
+ if hasattr(chunk_choice.delta, "reasoning_content") and chunk_choice.delta.reasoning_content:
631
+ async for event in self._handle_reasoning_content_chunk(
632
+ reasoning_content=chunk_choice.delta.reasoning_content,
633
+ reasoning_part_emitted=reasoning_part_emitted,
634
+ reasoning_content_index=reasoning_content_index,
635
+ message_item_id=message_item_id,
636
+ message_output_index=message_output_index,
637
+ ):
638
+ # Buffer reasoning events for guardrail check
639
+ if self.guardrail_ids:
640
+ chunk_events.append(event)
641
+ else:
642
+ yield event
643
+ reasoning_part_emitted = True
644
+ reasoning_text_accumulated.append(chunk_choice.delta.reasoning_content)
645
+
646
+ # Handle refusal content if present
647
+ if chunk_choice.delta.refusal:
648
+ async for event in self._handle_refusal_content_chunk(
649
+ refusal_content=chunk_choice.delta.refusal,
650
+ refusal_part_emitted=refusal_part_emitted,
651
+ refusal_content_index=refusal_content_index,
652
+ message_item_id=message_item_id,
653
+ message_output_index=message_output_index,
654
+ ):
655
+ yield event
656
+ refusal_part_emitted = True
657
+ refusal_text_accumulated.append(chunk_choice.delta.refusal)
658
+
659
+ # Aggregate tool call arguments across chunks
660
+ if chunk_choice.delta.tool_calls:
661
+ for tool_call in chunk_choice.delta.tool_calls:
662
+ response_tool_call = chat_response_tool_calls.get(tool_call.index, None)
663
+ # Create new tool call entry if this is the first chunk for this index
664
+ is_new_tool_call = response_tool_call is None
665
+ if is_new_tool_call:
666
+ tool_call_dict: dict[str, Any] = tool_call.model_dump()
667
+ tool_call_dict.pop("type", None)
668
+ response_tool_call = OpenAIChatCompletionToolCall(**tool_call_dict)
669
+ chat_response_tool_calls[tool_call.index] = response_tool_call
670
+
671
+ # Create item ID for this tool call for streaming events
672
+ tool_call_item_id = f"fc_{uuid.uuid4()}"
673
+ tool_call_item_ids[tool_call.index] = tool_call_item_id
674
+
675
+ # Emit output_item.added event for the new function call
676
+ self.sequence_number += 1
677
+ is_mcp_tool = tool_call.function.name and tool_call.function.name in self.mcp_tool_to_server
678
+ if not is_mcp_tool and tool_call.function.name not in ["web_search", "knowledge_search"]:
679
+ # for MCP tools (and even other non-function tools) we emit an output message item later
680
+ function_call_item = OpenAIResponseOutputMessageFunctionToolCall(
681
+ arguments="", # Will be filled incrementally via delta events
682
+ call_id=tool_call.id or "",
683
+ name=tool_call.function.name if tool_call.function else "",
684
+ id=tool_call_item_id,
685
+ status="in_progress",
686
+ )
687
+ yield OpenAIResponseObjectStreamResponseOutputItemAdded(
688
+ response_id=self.response_id,
689
+ item=function_call_item,
690
+ output_index=len(output_messages),
691
+ sequence_number=self.sequence_number,
692
+ )
693
+
694
+ # Stream tool call arguments as they arrive (differentiate between MCP and function calls)
695
+ if tool_call.function and tool_call.function.arguments:
696
+ tool_call_item_id = tool_call_item_ids[tool_call.index]
697
+ self.sequence_number += 1
698
+
699
+ # Check if this is an MCP tool call
700
+ is_mcp_tool = tool_call.function.name and tool_call.function.name in self.mcp_tool_to_server
701
+ if is_mcp_tool:
702
+ # Emit MCP-specific argument delta event
703
+ yield OpenAIResponseObjectStreamResponseMcpCallArgumentsDelta(
704
+ delta=tool_call.function.arguments,
705
+ item_id=tool_call_item_id,
706
+ output_index=len(output_messages),
707
+ sequence_number=self.sequence_number,
708
+ )
709
+ else:
710
+ # Emit function call argument delta event
711
+ yield OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta(
712
+ delta=tool_call.function.arguments,
713
+ item_id=tool_call_item_id,
714
+ output_index=len(output_messages),
715
+ sequence_number=self.sequence_number,
716
+ )
717
+
718
+ # Accumulate arguments for final response (only for subsequent chunks)
719
+ if not is_new_tool_call:
720
+ response_tool_call.function.arguments = (
721
+ response_tool_call.function.arguments or ""
722
+ ) + tool_call.function.arguments
723
+
724
+ # Output safety validation for this chunk
725
+ if self.guardrail_ids:
726
+ # Check guardrails on accumulated text so far
727
+ accumulated_text = "".join(chat_response_content)
728
+ violation_message = await run_guardrails(self.safety_api, accumulated_text, self.guardrail_ids)
729
+ if violation_message:
730
+ logger.info(f"Output guardrail violation: {violation_message}")
731
+ chunk_events.clear()
732
+ yield await self._create_refusal_response(violation_message)
733
+ self.violation_detected = True
734
+ return
735
+ else:
736
+ # No violation detected, emit all content events for this chunk
737
+ for event in chunk_events:
738
+ yield event
739
+
740
+ # Emit arguments.done events for completed tool calls (differentiate between MCP and function calls)
741
+ for tool_call_index in sorted(chat_response_tool_calls.keys()):
742
+ tool_call = chat_response_tool_calls[tool_call_index]
743
+ # Ensure that arguments, if sent back to the inference provider, are not None
744
+ tool_call.function.arguments = tool_call.function.arguments or "{}"
745
+ tool_call_item_id = tool_call_item_ids[tool_call_index]
746
+ final_arguments = tool_call.function.arguments
747
+ tool_call_name = chat_response_tool_calls[tool_call_index].function.name
748
+
749
+ # Check if this is an MCP tool call
750
+ is_mcp_tool = tool_call_name and tool_call_name in self.mcp_tool_to_server
751
+ self.sequence_number += 1
752
+ done_event_cls = (
753
+ OpenAIResponseObjectStreamResponseMcpCallArgumentsDone
754
+ if is_mcp_tool
755
+ else OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone
756
+ )
757
+ yield done_event_cls(
758
+ arguments=final_arguments,
759
+ item_id=tool_call_item_id,
760
+ output_index=len(output_messages),
761
+ sequence_number=self.sequence_number,
762
+ )
763
+
764
+ # Emit content_part.done event if text content was streamed (before content gets cleared)
765
+ if content_part_emitted:
766
+ final_text = "".join(chat_response_content)
767
+ self.sequence_number += 1
768
+ yield OpenAIResponseObjectStreamResponseContentPartDone(
769
+ content_index=content_index,
770
+ response_id=self.response_id,
771
+ item_id=message_item_id,
772
+ output_index=message_output_index,
773
+ part=OpenAIResponseContentPartOutputText(
774
+ text=final_text,
775
+ ),
776
+ sequence_number=self.sequence_number,
777
+ )
778
+
779
+ # Emit reasoning done events if reasoning content was streamed
780
+ if reasoning_part_emitted:
781
+ async for event in self._emit_reasoning_done_events(
782
+ reasoning_text_accumulated=reasoning_text_accumulated,
783
+ reasoning_content_index=reasoning_content_index,
784
+ message_item_id=message_item_id,
785
+ message_output_index=message_output_index,
786
+ ):
787
+ yield event
788
+
789
+ # Emit refusal done events if refusal content was streamed
790
+ if refusal_part_emitted:
791
+ async for event in self._emit_refusal_done_events(
792
+ refusal_text_accumulated=refusal_text_accumulated,
793
+ refusal_content_index=refusal_content_index,
794
+ message_item_id=message_item_id,
795
+ message_output_index=message_output_index,
796
+ ):
797
+ yield event
798
+
799
+ # Clear content when there are tool calls (OpenAI spec behavior)
800
+ if chat_response_tool_calls:
801
+ chat_response_content = []
802
+
803
+ # Emit output_item.done for message when we have content and no tool calls
804
+ if message_item_added_emitted and not chat_response_tool_calls:
805
+ content_parts = []
806
+ if content_part_emitted:
807
+ final_text = "".join(chat_response_content)
808
+ content_parts.append(
809
+ OpenAIResponseOutputMessageContentOutputText(
810
+ text=final_text,
811
+ annotations=[],
812
+ )
813
+ )
814
+
815
+ self.sequence_number += 1
816
+ message_item = OpenAIResponseMessage(
817
+ id=message_item_id,
818
+ content=content_parts,
819
+ role="assistant",
820
+ status="completed",
821
+ )
822
+ yield OpenAIResponseObjectStreamResponseOutputItemDone(
823
+ response_id=self.response_id,
824
+ item=message_item,
825
+ output_index=message_output_index,
826
+ sequence_number=self.sequence_number,
827
+ )
828
+
829
+ yield ChatCompletionResult(
830
+ response_id=chat_response_id,
831
+ content=chat_response_content,
832
+ tool_calls=chat_response_tool_calls,
833
+ created=chunk_created,
834
+ model=chunk_model,
835
+ finish_reason=chunk_finish_reason,
836
+ message_item_id=message_item_id,
837
+ tool_call_item_ids=tool_call_item_ids,
838
+ content_part_emitted=content_part_emitted,
839
+ )
840
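+ # _process_streaming_chunks yields two kinds of values: streaming events, which
+ # the caller forwards to the client as-is, and a single trailing
+ # ChatCompletionResult that the caller picks out via isinstance() to build the
+ # final chat completion for tool-call handling.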
+
841
+ def _build_chat_completion(self, result: ChatCompletionResult) -> OpenAIChatCompletion:
842
+ """Build OpenAIChatCompletion from ChatCompletionResult."""
843
+ # Convert collected chunks to complete response
844
+ if result.tool_calls:
845
+ tool_calls = [result.tool_calls[i] for i in sorted(result.tool_calls.keys())]
846
+ else:
847
+ tool_calls = None
848
+
849
+ assistant_message = OpenAIAssistantMessageParam(
850
+ content=result.content_text,
851
+ tool_calls=tool_calls,
852
+ )
853
+ return OpenAIChatCompletion(
854
+ id=result.response_id,
855
+ choices=[
856
+ OpenAIChoice(
857
+ message=assistant_message,
858
+ finish_reason=result.finish_reason,
859
+ index=0,
860
+ )
861
+ ],
862
+ created=result.created,
863
+ model=result.model,
864
+ )
865
+
866
+ async def _coordinate_tool_execution(
867
+ self,
868
+ function_tool_calls: list,
869
+ non_function_tool_calls: list,
870
+ completion_result_data: ChatCompletionResult,
871
+ output_messages: list[OpenAIResponseOutput],
872
+ next_turn_messages: list,
873
+ ) -> AsyncIterator[OpenAIResponseObjectStream]:
874
+ """Coordinate execution of both function and non-function tool calls."""
875
+ # Execute non-function tool calls
876
+ for tool_call in non_function_tool_calls:
877
+ # Find the item_id for this tool call
878
+ matching_item_id = None
879
+ for index, item_id in completion_result_data.tool_call_item_ids.items():
880
+ response_tool_call = completion_result_data.tool_calls.get(index)
881
+ if response_tool_call and response_tool_call.id == tool_call.id:
882
+ matching_item_id = item_id
883
+ break
884
+
885
+ # Use a fallback item_id if not found
886
+ if not matching_item_id:
887
+ matching_item_id = f"tc_{uuid.uuid4()}"
888
+
889
+ self.sequence_number += 1
890
+ if tool_call.function.name and tool_call.function.name in self.mcp_tool_to_server:
891
+ item = OpenAIResponseOutputMessageMCPCall(
892
+ arguments="",
893
+ name=tool_call.function.name,
894
+ id=matching_item_id,
895
+ server_label=self.mcp_tool_to_server[tool_call.function.name].server_label,
896
+ status="in_progress",
897
+ )
898
+ elif tool_call.function.name == "web_search":
899
+ item = OpenAIResponseOutputMessageWebSearchToolCall(
900
+ id=matching_item_id,
901
+ status="in_progress",
902
+ )
903
+ elif tool_call.function.name == "knowledge_search":
904
+ item = OpenAIResponseOutputMessageFileSearchToolCall(
905
+ id=matching_item_id,
906
+ status="in_progress",
907
+ queries=[tool_call.function.arguments or ""],
908
+ )
909
+ else:
910
+ raise ValueError(f"Unsupported tool call: {tool_call.function.name}")
911
+
912
+ yield OpenAIResponseObjectStreamResponseOutputItemAdded(
913
+ response_id=self.response_id,
914
+ item=item,
915
+ output_index=len(output_messages),
916
+ sequence_number=self.sequence_number,
917
+ )
918
+
919
+ # Execute tool call with streaming
920
+ tool_call_log = None
921
+ tool_response_message = None
922
+ async for result in self.tool_executor.execute_tool_call(
923
+ tool_call,
924
+ self.ctx,
925
+ self.sequence_number,
926
+ len(output_messages),
927
+ matching_item_id,
928
+ self.mcp_tool_to_server,
929
+ ):
930
+ if result.stream_event:
931
+ # Forward streaming events
932
+ self.sequence_number = result.sequence_number
933
+ yield result.stream_event
934
+
935
+ if result.final_output_message is not None:
936
+ tool_call_log = result.final_output_message
937
+ tool_response_message = result.final_input_message
938
+ self.sequence_number = result.sequence_number
939
+ if result.citation_files:
940
+ self.citation_files.update(result.citation_files)
941
+
942
+ if tool_call_log:
943
+ output_messages.append(tool_call_log)
944
+
945
+ # Emit output_item.done event for completed non-function tool call
946
+ if matching_item_id:
947
+ self.sequence_number += 1
948
+ yield OpenAIResponseObjectStreamResponseOutputItemDone(
949
+ response_id=self.response_id,
950
+ item=tool_call_log,
951
+ output_index=len(output_messages) - 1,
952
+ sequence_number=self.sequence_number,
953
+ )
954
+
955
+ if tool_response_message:
956
+ next_turn_messages.append(tool_response_message)
957
+
958
+ # Execute function tool calls (client-side)
959
+ for tool_call in function_tool_calls:
960
+ # Find the item_id for this tool call from our tracking dictionary
961
+ matching_item_id = None
962
+ for index, item_id in completion_result_data.tool_call_item_ids.items():
963
+ response_tool_call = completion_result_data.tool_calls.get(index)
964
+ if response_tool_call and response_tool_call.id == tool_call.id:
965
+ matching_item_id = item_id
966
+ break
967
+
968
+ # Use existing item_id or create new one if not found
969
+ final_item_id = matching_item_id or f"fc_{uuid.uuid4()}"
970
+
971
+ function_call_item = OpenAIResponseOutputMessageFunctionToolCall(
972
+ arguments=tool_call.function.arguments or "",
973
+ call_id=tool_call.id,
974
+ name=tool_call.function.name or "",
975
+ id=final_item_id,
976
+ status="completed",
977
+ )
978
+ output_messages.append(function_call_item)
979
+
980
+ # Emit output_item.done event for completed function call
981
+ self.sequence_number += 1
982
+ yield OpenAIResponseObjectStreamResponseOutputItemDone(
983
+ response_id=self.response_id,
984
+ item=function_call_item,
985
+ output_index=len(output_messages) - 1,
986
+ sequence_number=self.sequence_number,
987
+ )
988
+
989
+ async def _process_new_tools(
990
+ self, tools: list[OpenAIResponseInputTool], output_messages: list[OpenAIResponseOutput]
991
+ ) -> AsyncIterator[OpenAIResponseObjectStream]:
992
+ """Process all tools and emit appropriate streaming events."""
993
+ from openai.types.chat import ChatCompletionToolParam
994
+
995
+ from llama_stack.apis.tools import ToolDef
996
+ from llama_stack.models.llama.datatypes import ToolDefinition
997
+ from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
998
+
999
+ def make_openai_tool(tool_name: str, tool: ToolDef) -> ChatCompletionToolParam:
1000
+ tool_def = ToolDefinition(
1001
+ tool_name=tool_name,
1002
+ description=tool.description,
1003
+ input_schema=tool.input_schema,
1004
+ )
1005
+ return convert_tooldef_to_openai_tool(tool_def)
1006
+
1007
+ # Initialize chat_tools if not already set
1008
+ if self.ctx.chat_tools is None:
1009
+ self.ctx.chat_tools = []
1010
+
1011
+ for input_tool in tools:
1012
+ if input_tool.type == "function":
1013
+ self.ctx.chat_tools.append(ChatCompletionToolParam(type="function", function=input_tool.model_dump()))
1014
+ elif input_tool.type in WebSearchToolTypes:
1015
+ tool_name = "web_search"
1016
+ # Need to access tool_groups_api from tool_executor
1017
+ tool = await self.tool_executor.tool_groups_api.get_tool(tool_name)
1018
+ if not tool:
1019
+ raise ValueError(f"Tool {tool_name} not found")
1020
+ self.ctx.chat_tools.append(make_openai_tool(tool_name, tool))
1021
+ elif input_tool.type == "file_search":
1022
+ tool_name = "knowledge_search"
1023
+ tool = await self.tool_executor.tool_groups_api.get_tool(tool_name)
1024
+ if not tool:
1025
+ raise ValueError(f"Tool {tool_name} not found")
1026
+ self.ctx.chat_tools.append(make_openai_tool(tool_name, tool))
1027
+ elif input_tool.type == "mcp":
1028
+ async for stream_event in self._process_mcp_tool(input_tool, output_messages):
1029
+ yield stream_event
1030
+ else:
1031
+ raise ValueError(f"Llama Stack OpenAI Responses does not yet support tool type: {input_tool.type}")
1032
+
1033
+ async def _process_mcp_tool(
1034
+ self, mcp_tool: OpenAIResponseInputToolMCP, output_messages: list[OpenAIResponseOutput]
1035
+ ) -> AsyncIterator[OpenAIResponseObjectStream]:
1036
+ """Process an MCP tool configuration and emit appropriate streaming events."""
1037
+ from llama_stack.providers.utils.tools.mcp import list_mcp_tools
1038
+
1039
+ # Emit mcp_list_tools.in_progress
1040
+ self.sequence_number += 1
1041
+ yield OpenAIResponseObjectStreamResponseMcpListToolsInProgress(
1042
+ sequence_number=self.sequence_number,
1043
+ )
1044
+ try:
1045
+ # Parse allowed/never allowed tools
1046
+ always_allowed = None
1047
+ never_allowed = None
1048
+ if mcp_tool.allowed_tools:
1049
+ if isinstance(mcp_tool.allowed_tools, list):
1050
+ always_allowed = mcp_tool.allowed_tools
1051
+ elif isinstance(mcp_tool.allowed_tools, AllowedToolsFilter):
1052
+ always_allowed = mcp_tool.allowed_tools.always
1053
+ never_allowed = mcp_tool.allowed_tools.never
1054
+
1055
+ # Call list_mcp_tools
1056
+ tool_defs = None
1057
+ list_id = f"mcp_list_{uuid.uuid4()}"
1058
+ attributes = {
1059
+ "server_label": mcp_tool.server_label,
1060
+ "server_url": mcp_tool.server_url,
1061
+ "mcp_list_tools_id": list_id,
1062
+ }
1063
+ async with tracing.span("list_mcp_tools", attributes):
1064
+ tool_defs = await list_mcp_tools(
1065
+ endpoint=mcp_tool.server_url,
1066
+ headers=mcp_tool.headers or {},
1067
+ )
1068
+
1069
+ # Create the MCP list tools message
1070
+ mcp_list_message = OpenAIResponseOutputMessageMCPListTools(
1071
+ id=list_id,
1072
+ server_label=mcp_tool.server_label,
1073
+ tools=[],
1074
+ )
1075
+
1076
+ # Process tools and update context
1077
+ for t in tool_defs.data:
1078
+ if never_allowed and t.name in never_allowed:
1079
+ continue
1080
+ if not always_allowed or t.name in always_allowed:
1081
+ # Add to chat tools for inference
1082
+ openai_tool = convert_tooldef_to_chat_tool(t)
1083
+ if self.ctx.chat_tools is None:
1084
+ self.ctx.chat_tools = []
1085
+ self.ctx.chat_tools.append(openai_tool)
1086
+
1087
+ # Add to MCP tool mapping
1088
+ if t.name in self.mcp_tool_to_server:
1089
+ raise ValueError(f"Duplicate tool name {t.name} found for server {mcp_tool.server_label}")
1090
+ self.mcp_tool_to_server[t.name] = mcp_tool
1091
+
1092
+ # Add to MCP list message
1093
+ mcp_list_message.tools.append(
1094
+ MCPListToolsTool(
1095
+ name=t.name,
1096
+ description=t.description,
1097
+ input_schema=t.input_schema
1098
+ or {
1099
+ "type": "object",
1100
+ "properties": {},
1101
+ "required": [],
1102
+ },
1103
+ )
1104
+ )
1105
+ async for stream_event in self._add_mcp_list_tools(mcp_list_message, output_messages):
1106
+ yield stream_event
1107
+
1108
+ except Exception as e:
1109
+ # TODO: Emit mcp_list_tools.failed event if needed
1110
+ logger.exception(f"Failed to list MCP tools from {mcp_tool.server_url}: {e}")
1111
+ raise
1112
+
1113
+ async def _process_tools(
1114
+ self, output_messages: list[OpenAIResponseOutput]
1115
+ ) -> AsyncIterator[OpenAIResponseObjectStream]:
1116
+ # Handle all mcp tool lists from previous response that are still valid:
1117
+ for tool in self.ctx.tool_context.previous_tool_listings:
1118
+ async for evt in self._reuse_mcp_list_tools(tool, output_messages):
1119
+ yield evt
1120
+ # Process all remaining tools (including MCP tools) and emit streaming events
1121
+ if self.ctx.tool_context.tools_to_process:
1122
+ async for stream_event in self._process_new_tools(self.ctx.tool_context.tools_to_process, output_messages):
1123
+ yield stream_event
1124
+
1125
+ def _approval_required(self, tool_name: str) -> bool:
1126
+ if tool_name not in self.mcp_tool_to_server:
1127
+ return False
1128
+ mcp_server = self.mcp_tool_to_server[tool_name]
1129
+ if mcp_server.require_approval == "always":
1130
+ return True
1131
+ if mcp_server.require_approval == "never":
1132
+ return False
1133
+ if isinstance(mcp_server.require_approval, ApprovalFilter):
1134
+ if tool_name in mcp_server.require_approval.always:
1135
+ return True
1136
+ if tool_name in mcp_server.require_approval.never:
1137
+ return False
1138
+ return True
1139
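+ # Example of the rules above: require_approval="always" always requires approval
+ # and "never" never does; with a hypothetical
+ # ApprovalFilter(always=["deploy"], never=["list_files"]), "deploy" requires
+ # approval, "list_files" does not, and any other tool on that server falls
+ # through to the default of requiring approval.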
+
1140
+ async def _add_mcp_approval_request(
1141
+ self, tool_name: str, arguments: str, output_messages: list[OpenAIResponseOutput]
1142
+ ) -> AsyncIterator[OpenAIResponseObjectStream]:
1143
+ mcp_server = self.mcp_tool_to_server[tool_name]
1144
+ mcp_approval_request = OpenAIResponseMCPApprovalRequest(
1145
+ arguments=arguments,
1146
+ id=f"approval_{uuid.uuid4()}",
1147
+ name=tool_name,
1148
+ server_label=mcp_server.server_label,
1149
+ )
1150
+ output_messages.append(mcp_approval_request)
1151
+
1152
+ self.sequence_number += 1
1153
+ yield OpenAIResponseObjectStreamResponseOutputItemAdded(
1154
+ response_id=self.response_id,
1155
+ item=mcp_approval_request,
1156
+ output_index=len(output_messages) - 1,
1157
+ sequence_number=self.sequence_number,
1158
+ )
1159
+ self.sequence_number += 1
1160
+ yield OpenAIResponseObjectStreamResponseOutputItemDone(
1161
+ response_id=self.response_id,
1162
+ item=mcp_approval_request,
1163
+ output_index=len(output_messages) - 1,
1164
+ sequence_number=self.sequence_number,
1165
+ )
1166
+
1167
+ async def _add_mcp_list_tools(
1168
+ self, mcp_list_message: OpenAIResponseOutputMessageMCPListTools, output_messages: list[OpenAIResponseOutput]
1169
+ ) -> AsyncIterator[OpenAIResponseObjectStream]:
1170
+ # Add the MCP list message to output
1171
+ output_messages.append(mcp_list_message)
1172
+
1173
+ # Emit output_item.added for the MCP list tools message
1174
+ self.sequence_number += 1
1175
+ yield OpenAIResponseObjectStreamResponseOutputItemAdded(
1176
+ response_id=self.response_id,
1177
+ item=OpenAIResponseOutputMessageMCPListTools(
1178
+ id=mcp_list_message.id,
1179
+ server_label=mcp_list_message.server_label,
1180
+ tools=[],
1181
+ ),
1182
+ output_index=len(output_messages) - 1,
1183
+ sequence_number=self.sequence_number,
1184
+ )
1185
+ # Emit mcp_list_tools.completed
1186
+ self.sequence_number += 1
1187
+ yield OpenAIResponseObjectStreamResponseMcpListToolsCompleted(
1188
+ sequence_number=self.sequence_number,
1189
+ )
1190
+
1191
+ # Emit output_item.done for the MCP list tools message
1192
+ self.sequence_number += 1
1193
+ yield OpenAIResponseObjectStreamResponseOutputItemDone(
1194
+ response_id=self.response_id,
1195
+ item=mcp_list_message,
1196
+ output_index=len(output_messages) - 1,
1197
+ sequence_number=self.sequence_number,
1198
+ )
1199
+
1200
+ async def _reuse_mcp_list_tools(
1201
+ self, original: OpenAIResponseOutputMessageMCPListTools, output_messages: list[OpenAIResponseOutput]
1202
+ ) -> AsyncIterator[OpenAIResponseObjectStream]:
1203
+ for t in original.tools:
1204
+ from llama_stack.models.llama.datatypes import ToolDefinition
1205
+ from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
1206
+
1207
+ # wrap the tool's name, description and input_schema in an internal ToolDefinition...
1208
+ tool_def = ToolDefinition(
1209
+ tool_name=t.name,
1210
+ description=t.description,
1211
+ input_schema=t.input_schema,
1212
+ )
1213
+ # ...then convert that to an OpenAI chat-completions tool
1214
+ openai_tool = convert_tooldef_to_openai_tool(tool_def)
1215
+ if self.ctx.chat_tools is None:
1216
+ self.ctx.chat_tools = []
1217
+ self.ctx.chat_tools.append(openai_tool)
1218
+
1219
+ mcp_list_message = OpenAIResponseOutputMessageMCPListTools(
1220
+ id=f"mcp_list_{uuid.uuid4()}",
1221
+ server_label=original.server_label,
1222
+ tools=original.tools,
1223
+ )
1224
+
1225
+ async for stream_event in self._add_mcp_list_tools(mcp_list_message, output_messages):
1226
+ yield stream_event