llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (738)
  1. llama_stack/__init__.py +5 -0
  2. llama_stack/apis/agents/__init__.py +1 -1
  3. llama_stack/apis/agents/agents.py +700 -281
  4. llama_stack/apis/agents/openai_responses.py +1311 -0
  5. llama_stack/{providers/adapters/memory/sample/config.py → apis/batches/__init__.py} +2 -5
  6. llama_stack/apis/batches/batches.py +100 -0
  7. llama_stack/apis/benchmarks/__init__.py +7 -0
  8. llama_stack/apis/benchmarks/benchmarks.py +108 -0
  9. llama_stack/apis/common/content_types.py +143 -0
  10. llama_stack/apis/common/errors.py +103 -0
  11. llama_stack/apis/common/job_types.py +38 -0
  12. llama_stack/apis/common/responses.py +36 -0
  13. llama_stack/apis/common/training_types.py +36 -5
  14. llama_stack/apis/common/type_system.py +158 -0
  15. llama_stack/apis/conversations/__init__.py +31 -0
  16. llama_stack/apis/conversations/conversations.py +286 -0
  17. llama_stack/apis/datasetio/__init__.py +7 -0
  18. llama_stack/apis/datasetio/datasetio.py +59 -0
  19. llama_stack/apis/datasets/__init__.py +7 -0
  20. llama_stack/apis/datasets/datasets.py +251 -0
  21. llama_stack/apis/datatypes.py +160 -0
  22. llama_stack/apis/eval/__init__.py +7 -0
  23. llama_stack/apis/eval/eval.py +169 -0
  24. llama_stack/apis/files/__init__.py +7 -0
  25. llama_stack/apis/files/files.py +199 -0
  26. llama_stack/apis/inference/__init__.py +1 -1
  27. llama_stack/apis/inference/inference.py +1169 -113
  28. llama_stack/apis/inspect/__init__.py +1 -1
  29. llama_stack/apis/inspect/inspect.py +69 -16
  30. llama_stack/apis/models/__init__.py +1 -1
  31. llama_stack/apis/models/models.py +148 -21
  32. llama_stack/apis/post_training/__init__.py +1 -1
  33. llama_stack/apis/post_training/post_training.py +265 -120
  34. llama_stack/{providers/adapters/agents/sample/config.py → apis/prompts/__init__.py} +2 -5
  35. llama_stack/apis/prompts/prompts.py +204 -0
  36. llama_stack/apis/providers/__init__.py +7 -0
  37. llama_stack/apis/providers/providers.py +69 -0
  38. llama_stack/apis/resource.py +37 -0
  39. llama_stack/apis/safety/__init__.py +1 -1
  40. llama_stack/apis/safety/safety.py +95 -12
  41. llama_stack/apis/scoring/__init__.py +7 -0
  42. llama_stack/apis/scoring/scoring.py +93 -0
  43. llama_stack/apis/scoring_functions/__init__.py +7 -0
  44. llama_stack/apis/scoring_functions/scoring_functions.py +208 -0
  45. llama_stack/apis/shields/__init__.py +1 -1
  46. llama_stack/apis/shields/shields.py +76 -33
  47. llama_stack/apis/synthetic_data_generation/__init__.py +1 -1
  48. llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py +40 -17
  49. llama_stack/apis/telemetry/__init__.py +1 -1
  50. llama_stack/apis/telemetry/telemetry.py +322 -31
  51. llama_stack/apis/{dataset → tools}/__init__.py +2 -1
  52. llama_stack/apis/tools/rag_tool.py +218 -0
  53. llama_stack/apis/tools/tools.py +221 -0
  54. llama_stack/apis/vector_io/__init__.py +7 -0
  55. llama_stack/apis/vector_io/vector_io.py +960 -0
  56. llama_stack/apis/vector_stores/__init__.py +7 -0
  57. llama_stack/apis/vector_stores/vector_stores.py +51 -0
  58. llama_stack/apis/version.py +9 -0
  59. llama_stack/cli/llama.py +13 -5
  60. llama_stack/cli/stack/_list_deps.py +182 -0
  61. llama_stack/cli/stack/list_apis.py +1 -1
  62. llama_stack/cli/stack/list_deps.py +55 -0
  63. llama_stack/cli/stack/list_providers.py +24 -10
  64. llama_stack/cli/stack/list_stacks.py +56 -0
  65. llama_stack/cli/stack/remove.py +115 -0
  66. llama_stack/cli/stack/run.py +169 -56
  67. llama_stack/cli/stack/stack.py +18 -4
  68. llama_stack/cli/stack/utils.py +151 -0
  69. llama_stack/cli/table.py +23 -61
  70. llama_stack/cli/utils.py +29 -0
  71. llama_stack/core/access_control/access_control.py +131 -0
  72. llama_stack/core/access_control/conditions.py +129 -0
  73. llama_stack/core/access_control/datatypes.py +107 -0
  74. llama_stack/core/build.py +164 -0
  75. llama_stack/core/client.py +205 -0
  76. llama_stack/core/common.sh +37 -0
  77. llama_stack/{distribution → core}/configure.py +74 -55
  78. llama_stack/core/conversations/conversations.py +309 -0
  79. llama_stack/core/datatypes.py +625 -0
  80. llama_stack/core/distribution.py +276 -0
  81. llama_stack/core/external.py +54 -0
  82. llama_stack/core/id_generation.py +42 -0
  83. llama_stack/core/inspect.py +86 -0
  84. llama_stack/core/library_client.py +539 -0
  85. llama_stack/core/prompts/prompts.py +234 -0
  86. llama_stack/core/providers.py +137 -0
  87. llama_stack/core/request_headers.py +115 -0
  88. llama_stack/core/resolver.py +506 -0
  89. llama_stack/core/routers/__init__.py +101 -0
  90. llama_stack/core/routers/datasets.py +73 -0
  91. llama_stack/core/routers/eval_scoring.py +155 -0
  92. llama_stack/core/routers/inference.py +645 -0
  93. llama_stack/core/routers/safety.py +85 -0
  94. llama_stack/core/routers/tool_runtime.py +91 -0
  95. llama_stack/core/routers/vector_io.py +442 -0
  96. llama_stack/core/routing_tables/benchmarks.py +62 -0
  97. llama_stack/core/routing_tables/common.py +254 -0
  98. llama_stack/core/routing_tables/datasets.py +91 -0
  99. llama_stack/core/routing_tables/models.py +163 -0
  100. llama_stack/core/routing_tables/scoring_functions.py +66 -0
  101. llama_stack/core/routing_tables/shields.py +61 -0
  102. llama_stack/core/routing_tables/toolgroups.py +129 -0
  103. llama_stack/core/routing_tables/vector_stores.py +292 -0
  104. llama_stack/core/server/auth.py +187 -0
  105. llama_stack/core/server/auth_providers.py +494 -0
  106. llama_stack/core/server/quota.py +110 -0
  107. llama_stack/core/server/routes.py +141 -0
  108. llama_stack/core/server/server.py +542 -0
  109. llama_stack/core/server/tracing.py +80 -0
  110. llama_stack/core/stack.py +546 -0
  111. llama_stack/core/start_stack.sh +117 -0
  112. llama_stack/core/storage/datatypes.py +283 -0
  113. llama_stack/{cli/model → core/store}/__init__.py +1 -1
  114. llama_stack/core/store/registry.py +199 -0
  115. llama_stack/core/testing_context.py +49 -0
  116. llama_stack/core/ui/app.py +55 -0
  117. llama_stack/core/ui/modules/api.py +32 -0
  118. llama_stack/core/ui/modules/utils.py +42 -0
  119. llama_stack/core/ui/page/distribution/datasets.py +18 -0
  120. llama_stack/core/ui/page/distribution/eval_tasks.py +20 -0
  121. llama_stack/core/ui/page/distribution/models.py +18 -0
  122. llama_stack/core/ui/page/distribution/providers.py +27 -0
  123. llama_stack/core/ui/page/distribution/resources.py +48 -0
  124. llama_stack/core/ui/page/distribution/scoring_functions.py +18 -0
  125. llama_stack/core/ui/page/distribution/shields.py +19 -0
  126. llama_stack/core/ui/page/evaluations/app_eval.py +143 -0
  127. llama_stack/core/ui/page/evaluations/native_eval.py +253 -0
  128. llama_stack/core/ui/page/playground/chat.py +130 -0
  129. llama_stack/core/ui/page/playground/tools.py +352 -0
  130. llama_stack/core/utils/config.py +30 -0
  131. llama_stack/{distribution → core}/utils/config_dirs.py +3 -6
  132. llama_stack/core/utils/config_resolution.py +125 -0
  133. llama_stack/core/utils/context.py +84 -0
  134. llama_stack/core/utils/exec.py +96 -0
  135. llama_stack/{providers/impls/meta_reference/codeshield/config.py → core/utils/image_types.py} +4 -3
  136. llama_stack/{distribution → core}/utils/model_utils.py +2 -2
  137. llama_stack/{distribution → core}/utils/prompt_for_config.py +30 -63
  138. llama_stack/{apis/batch_inference → distributions/dell}/__init__.py +1 -1
  139. llama_stack/distributions/dell/build.yaml +33 -0
  140. llama_stack/distributions/dell/dell.py +158 -0
  141. llama_stack/distributions/dell/run-with-safety.yaml +141 -0
  142. llama_stack/distributions/dell/run.yaml +132 -0
  143. llama_stack/distributions/meta-reference-gpu/__init__.py +7 -0
  144. llama_stack/distributions/meta-reference-gpu/build.yaml +32 -0
  145. llama_stack/distributions/meta-reference-gpu/meta_reference.py +163 -0
  146. llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +154 -0
  147. llama_stack/distributions/meta-reference-gpu/run.yaml +139 -0
  148. llama_stack/{apis/evals → distributions/nvidia}/__init__.py +1 -1
  149. llama_stack/distributions/nvidia/build.yaml +29 -0
  150. llama_stack/distributions/nvidia/nvidia.py +154 -0
  151. llama_stack/distributions/nvidia/run-with-safety.yaml +137 -0
  152. llama_stack/distributions/nvidia/run.yaml +116 -0
  153. llama_stack/distributions/open-benchmark/__init__.py +7 -0
  154. llama_stack/distributions/open-benchmark/build.yaml +36 -0
  155. llama_stack/distributions/open-benchmark/open_benchmark.py +303 -0
  156. llama_stack/distributions/open-benchmark/run.yaml +252 -0
  157. llama_stack/distributions/postgres-demo/__init__.py +7 -0
  158. llama_stack/distributions/postgres-demo/build.yaml +23 -0
  159. llama_stack/distributions/postgres-demo/postgres_demo.py +125 -0
  160. llama_stack/distributions/postgres-demo/run.yaml +115 -0
  161. llama_stack/{apis/memory → distributions/starter}/__init__.py +1 -1
  162. llama_stack/distributions/starter/build.yaml +61 -0
  163. llama_stack/distributions/starter/run-with-postgres-store.yaml +285 -0
  164. llama_stack/distributions/starter/run.yaml +276 -0
  165. llama_stack/distributions/starter/starter.py +345 -0
  166. llama_stack/distributions/starter-gpu/__init__.py +7 -0
  167. llama_stack/distributions/starter-gpu/build.yaml +61 -0
  168. llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +288 -0
  169. llama_stack/distributions/starter-gpu/run.yaml +279 -0
  170. llama_stack/distributions/starter-gpu/starter_gpu.py +20 -0
  171. llama_stack/distributions/template.py +456 -0
  172. llama_stack/distributions/watsonx/__init__.py +7 -0
  173. llama_stack/distributions/watsonx/build.yaml +33 -0
  174. llama_stack/distributions/watsonx/run.yaml +133 -0
  175. llama_stack/distributions/watsonx/watsonx.py +95 -0
  176. llama_stack/env.py +24 -0
  177. llama_stack/log.py +314 -0
  178. llama_stack/models/llama/checkpoint.py +164 -0
  179. llama_stack/models/llama/datatypes.py +164 -0
  180. llama_stack/models/llama/hadamard_utils.py +86 -0
  181. llama_stack/models/llama/llama3/args.py +74 -0
  182. llama_stack/models/llama/llama3/chat_format.py +286 -0
  183. llama_stack/models/llama/llama3/generation.py +376 -0
  184. llama_stack/models/llama/llama3/interface.py +255 -0
  185. llama_stack/models/llama/llama3/model.py +304 -0
  186. llama_stack/models/llama/llama3/multimodal/__init__.py +12 -0
  187. llama_stack/models/llama/llama3/multimodal/encoder_utils.py +180 -0
  188. llama_stack/models/llama/llama3/multimodal/image_transform.py +409 -0
  189. llama_stack/models/llama/llama3/multimodal/model.py +1430 -0
  190. llama_stack/models/llama/llama3/multimodal/utils.py +26 -0
  191. llama_stack/models/llama/llama3/prompt_templates/__init__.py +22 -0
  192. llama_stack/models/llama/llama3/prompt_templates/base.py +39 -0
  193. llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +319 -0
  194. llama_stack/models/llama/llama3/prompt_templates/tool_response.py +62 -0
  195. llama_stack/models/llama/llama3/quantization/loader.py +316 -0
  196. llama_stack/models/llama/llama3/template_data.py +116 -0
  197. llama_stack/models/llama/llama3/tokenizer.model +128000 -0
  198. llama_stack/models/llama/llama3/tokenizer.py +198 -0
  199. llama_stack/models/llama/llama3/tool_utils.py +266 -0
  200. llama_stack/models/llama/llama3_1/__init__.py +12 -0
  201. llama_stack/models/llama/llama3_1/prompt_format.md +358 -0
  202. llama_stack/models/llama/llama3_1/prompts.py +258 -0
  203. llama_stack/models/llama/llama3_2/prompts_text.py +229 -0
  204. llama_stack/models/llama/llama3_2/prompts_vision.py +126 -0
  205. llama_stack/models/llama/llama3_2/text_prompt_format.md +286 -0
  206. llama_stack/models/llama/llama3_2/vision_prompt_format.md +141 -0
  207. llama_stack/models/llama/llama3_3/prompts.py +259 -0
  208. llama_stack/models/llama/llama4/args.py +107 -0
  209. llama_stack/models/llama/llama4/chat_format.py +317 -0
  210. llama_stack/models/llama/llama4/datatypes.py +56 -0
  211. llama_stack/models/llama/llama4/ffn.py +58 -0
  212. llama_stack/models/llama/llama4/generation.py +313 -0
  213. llama_stack/models/llama/llama4/model.py +437 -0
  214. llama_stack/models/llama/llama4/moe.py +214 -0
  215. llama_stack/models/llama/llama4/preprocess.py +435 -0
  216. llama_stack/models/llama/llama4/prompt_format.md +304 -0
  217. llama_stack/models/llama/llama4/prompt_templates/system_prompts.py +136 -0
  218. llama_stack/models/llama/llama4/prompts.py +279 -0
  219. llama_stack/models/llama/llama4/quantization/__init__.py +5 -0
  220. llama_stack/models/llama/llama4/quantization/loader.py +226 -0
  221. llama_stack/models/llama/llama4/tokenizer.model +200000 -0
  222. llama_stack/models/llama/llama4/tokenizer.py +263 -0
  223. llama_stack/models/llama/llama4/vision/__init__.py +5 -0
  224. llama_stack/models/llama/llama4/vision/embedding.py +210 -0
  225. llama_stack/models/llama/llama4/vision/encoder.py +412 -0
  226. llama_stack/models/llama/prompt_format.py +191 -0
  227. llama_stack/models/llama/quantize_impls.py +316 -0
  228. llama_stack/models/llama/sku_list.py +1029 -0
  229. llama_stack/models/llama/sku_types.py +233 -0
  230. llama_stack/models/llama/tokenizer_utils.py +40 -0
  231. llama_stack/providers/datatypes.py +136 -107
  232. llama_stack/providers/inline/__init__.py +5 -0
  233. llama_stack/providers/inline/agents/__init__.py +5 -0
  234. llama_stack/providers/{impls/meta_reference/agents → inline/agents/meta_reference}/__init__.py +12 -5
  235. llama_stack/providers/inline/agents/meta_reference/agent_instance.py +1024 -0
  236. llama_stack/providers/inline/agents/meta_reference/agents.py +383 -0
  237. llama_stack/providers/inline/agents/meta_reference/config.py +37 -0
  238. llama_stack/providers/inline/agents/meta_reference/persistence.py +228 -0
  239. llama_stack/providers/inline/agents/meta_reference/responses/__init__.py +5 -0
  240. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +423 -0
  241. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +1226 -0
  242. llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +449 -0
  243. llama_stack/providers/inline/agents/meta_reference/responses/types.py +194 -0
  244. llama_stack/providers/inline/agents/meta_reference/responses/utils.py +365 -0
  245. llama_stack/providers/inline/agents/meta_reference/safety.py +52 -0
  246. llama_stack/providers/inline/batches/__init__.py +5 -0
  247. llama_stack/providers/inline/batches/reference/__init__.py +36 -0
  248. llama_stack/providers/inline/batches/reference/batches.py +679 -0
  249. llama_stack/providers/inline/batches/reference/config.py +40 -0
  250. llama_stack/providers/inline/datasetio/__init__.py +5 -0
  251. llama_stack/providers/inline/datasetio/localfs/__init__.py +20 -0
  252. llama_stack/providers/inline/datasetio/localfs/config.py +23 -0
  253. llama_stack/providers/inline/datasetio/localfs/datasetio.py +113 -0
  254. llama_stack/providers/inline/eval/__init__.py +5 -0
  255. llama_stack/providers/inline/eval/meta_reference/__init__.py +28 -0
  256. llama_stack/providers/inline/eval/meta_reference/config.py +23 -0
  257. llama_stack/providers/inline/eval/meta_reference/eval.py +259 -0
  258. llama_stack/providers/inline/files/localfs/__init__.py +20 -0
  259. llama_stack/providers/inline/files/localfs/config.py +31 -0
  260. llama_stack/providers/inline/files/localfs/files.py +219 -0
  261. llama_stack/providers/inline/inference/__init__.py +5 -0
  262. llama_stack/providers/{impls/meta_reference/inference → inline/inference/meta_reference}/__init__.py +4 -4
  263. llama_stack/providers/inline/inference/meta_reference/common.py +24 -0
  264. llama_stack/providers/inline/inference/meta_reference/config.py +68 -0
  265. llama_stack/providers/inline/inference/meta_reference/generators.py +211 -0
  266. llama_stack/providers/inline/inference/meta_reference/inference.py +158 -0
  267. llama_stack/providers/inline/inference/meta_reference/model_parallel.py +96 -0
  268. llama_stack/providers/{impls/meta_reference/inference → inline/inference/meta_reference}/parallel_utils.py +56 -73
  269. llama_stack/providers/inline/inference/sentence_transformers/__init__.py +22 -0
  270. llama_stack/providers/{impls/meta_reference/agents → inline/inference/sentence_transformers}/config.py +6 -4
  271. llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +83 -0
  272. llama_stack/providers/inline/post_training/__init__.py +5 -0
  273. llama_stack/providers/inline/post_training/common/__init__.py +5 -0
  274. llama_stack/providers/inline/post_training/common/utils.py +35 -0
  275. llama_stack/providers/inline/post_training/common/validator.py +36 -0
  276. llama_stack/providers/inline/post_training/huggingface/__init__.py +27 -0
  277. llama_stack/providers/inline/post_training/huggingface/config.py +83 -0
  278. llama_stack/providers/inline/post_training/huggingface/post_training.py +208 -0
  279. llama_stack/providers/inline/post_training/huggingface/recipes/__init__.py +5 -0
  280. llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py +519 -0
  281. llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device_dpo.py +485 -0
  282. llama_stack/providers/inline/post_training/huggingface/utils.py +269 -0
  283. llama_stack/providers/inline/post_training/torchtune/__init__.py +27 -0
  284. llama_stack/providers/inline/post_training/torchtune/common/__init__.py +5 -0
  285. llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py +240 -0
  286. llama_stack/providers/inline/post_training/torchtune/common/utils.py +99 -0
  287. llama_stack/providers/inline/post_training/torchtune/config.py +20 -0
  288. llama_stack/providers/inline/post_training/torchtune/datasets/__init__.py +5 -0
  289. llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py +57 -0
  290. llama_stack/providers/inline/post_training/torchtune/datasets/sft.py +78 -0
  291. llama_stack/providers/inline/post_training/torchtune/post_training.py +178 -0
  292. llama_stack/providers/inline/post_training/torchtune/recipes/__init__.py +5 -0
  293. llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +588 -0
  294. llama_stack/providers/inline/safety/__init__.py +5 -0
  295. llama_stack/providers/{impls/meta_reference/codeshield → inline/safety/code_scanner}/__init__.py +4 -2
  296. llama_stack/providers/inline/safety/code_scanner/code_scanner.py +128 -0
  297. llama_stack/providers/{impls/meta_reference/memory → inline/safety/code_scanner}/config.py +5 -3
  298. llama_stack/providers/inline/safety/llama_guard/__init__.py +19 -0
  299. llama_stack/providers/inline/safety/llama_guard/config.py +19 -0
  300. llama_stack/providers/inline/safety/llama_guard/llama_guard.py +489 -0
  301. llama_stack/providers/{adapters/memory/sample → inline/safety/prompt_guard}/__init__.py +4 -4
  302. llama_stack/providers/inline/safety/prompt_guard/config.py +32 -0
  303. llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +131 -0
  304. llama_stack/providers/inline/scoring/__init__.py +5 -0
  305. llama_stack/providers/inline/scoring/basic/__init__.py +25 -0
  306. llama_stack/providers/{adapters/memory/weaviate → inline/scoring/basic}/config.py +5 -7
  307. llama_stack/providers/inline/scoring/basic/scoring.py +126 -0
  308. llama_stack/providers/inline/scoring/basic/scoring_fn/__init__.py +5 -0
  309. llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py +240 -0
  310. llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py +41 -0
  311. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/__init__.py +5 -0
  312. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py +21 -0
  313. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py +21 -0
  314. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py +23 -0
  315. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py +27 -0
  316. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py +71 -0
  317. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py +21 -0
  318. llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py +80 -0
  319. llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py +66 -0
  320. llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py +58 -0
  321. llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py +38 -0
  322. llama_stack/providers/inline/scoring/basic/utils/__init__.py +5 -0
  323. llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py +3319 -0
  324. llama_stack/providers/inline/scoring/basic/utils/math_utils.py +330 -0
  325. llama_stack/providers/inline/scoring/braintrust/__init__.py +27 -0
  326. llama_stack/providers/inline/scoring/braintrust/braintrust.py +230 -0
  327. llama_stack/providers/inline/scoring/braintrust/config.py +21 -0
  328. llama_stack/providers/inline/scoring/braintrust/scoring_fn/__init__.py +5 -0
  329. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/__init__.py +5 -0
  330. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py +24 -0
  331. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py +24 -0
  332. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py +24 -0
  333. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py +24 -0
  334. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py +24 -0
  335. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py +24 -0
  336. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py +23 -0
  337. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py +24 -0
  338. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py +24 -0
  339. llama_stack/providers/inline/scoring/llm_as_judge/__init__.py +21 -0
  340. llama_stack/providers/inline/scoring/llm_as_judge/config.py +14 -0
  341. llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +113 -0
  342. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/__init__.py +5 -0
  343. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/__init__.py +5 -0
  344. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py +96 -0
  345. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py +20 -0
  346. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py +81 -0
  347. llama_stack/providers/inline/telemetry/__init__.py +5 -0
  348. llama_stack/providers/inline/telemetry/meta_reference/__init__.py +21 -0
  349. llama_stack/providers/inline/telemetry/meta_reference/config.py +47 -0
  350. llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +252 -0
  351. llama_stack/providers/inline/tool_runtime/__init__.py +5 -0
  352. llama_stack/providers/inline/tool_runtime/rag/__init__.py +19 -0
  353. llama_stack/providers/{impls/meta_reference/telemetry → inline/tool_runtime/rag}/config.py +5 -3
  354. llama_stack/providers/inline/tool_runtime/rag/context_retriever.py +77 -0
  355. llama_stack/providers/inline/tool_runtime/rag/memory.py +332 -0
  356. llama_stack/providers/inline/vector_io/__init__.py +5 -0
  357. llama_stack/providers/inline/vector_io/chroma/__init__.py +19 -0
  358. llama_stack/providers/inline/vector_io/chroma/config.py +30 -0
  359. llama_stack/providers/inline/vector_io/faiss/__init__.py +21 -0
  360. llama_stack/providers/inline/vector_io/faiss/config.py +26 -0
  361. llama_stack/providers/inline/vector_io/faiss/faiss.py +293 -0
  362. llama_stack/providers/inline/vector_io/milvus/__init__.py +19 -0
  363. llama_stack/providers/inline/vector_io/milvus/config.py +29 -0
  364. llama_stack/providers/inline/vector_io/qdrant/__init__.py +20 -0
  365. llama_stack/providers/inline/vector_io/qdrant/config.py +29 -0
  366. llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py +20 -0
  367. llama_stack/providers/inline/vector_io/sqlite_vec/config.py +26 -0
  368. llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +483 -0
  369. llama_stack/providers/registry/agents.py +16 -18
  370. llama_stack/providers/registry/batches.py +26 -0
  371. llama_stack/providers/registry/datasetio.py +49 -0
  372. llama_stack/providers/registry/eval.py +46 -0
  373. llama_stack/providers/registry/files.py +31 -0
  374. llama_stack/providers/registry/inference.py +273 -118
  375. llama_stack/providers/registry/post_training.py +69 -0
  376. llama_stack/providers/registry/safety.py +46 -41
  377. llama_stack/providers/registry/scoring.py +51 -0
  378. llama_stack/providers/registry/tool_runtime.py +87 -0
  379. llama_stack/providers/registry/vector_io.py +828 -0
  380. llama_stack/providers/remote/__init__.py +5 -0
  381. llama_stack/providers/remote/agents/__init__.py +5 -0
  382. llama_stack/providers/remote/datasetio/__init__.py +5 -0
  383. llama_stack/providers/{adapters/memory/chroma → remote/datasetio/huggingface}/__init__.py +7 -4
  384. llama_stack/providers/remote/datasetio/huggingface/config.py +23 -0
  385. llama_stack/providers/remote/datasetio/huggingface/huggingface.py +99 -0
  386. llama_stack/providers/remote/datasetio/nvidia/__init__.py +23 -0
  387. llama_stack/providers/remote/datasetio/nvidia/config.py +61 -0
  388. llama_stack/providers/remote/datasetio/nvidia/datasetio.py +116 -0
  389. llama_stack/providers/remote/eval/__init__.py +5 -0
  390. llama_stack/providers/remote/eval/nvidia/__init__.py +31 -0
  391. llama_stack/providers/remote/eval/nvidia/config.py +29 -0
  392. llama_stack/providers/remote/eval/nvidia/eval.py +162 -0
  393. llama_stack/providers/remote/files/s3/__init__.py +19 -0
  394. llama_stack/providers/remote/files/s3/config.py +42 -0
  395. llama_stack/providers/remote/files/s3/files.py +313 -0
  396. llama_stack/providers/remote/inference/__init__.py +5 -0
  397. llama_stack/providers/{adapters/safety/sample → remote/inference/anthropic}/__init__.py +4 -6
  398. llama_stack/providers/remote/inference/anthropic/anthropic.py +36 -0
  399. llama_stack/providers/remote/inference/anthropic/config.py +28 -0
  400. llama_stack/providers/{impls/meta_reference/telemetry → remote/inference/azure}/__init__.py +4 -4
  401. llama_stack/providers/remote/inference/azure/azure.py +25 -0
  402. llama_stack/providers/remote/inference/azure/config.py +61 -0
  403. llama_stack/providers/{adapters → remote}/inference/bedrock/__init__.py +18 -17
  404. llama_stack/providers/remote/inference/bedrock/bedrock.py +142 -0
  405. llama_stack/providers/{adapters/inference/sample → remote/inference/bedrock}/config.py +3 -4
  406. llama_stack/providers/remote/inference/bedrock/models.py +29 -0
  407. llama_stack/providers/remote/inference/cerebras/__init__.py +19 -0
  408. llama_stack/providers/remote/inference/cerebras/cerebras.py +28 -0
  409. llama_stack/providers/remote/inference/cerebras/config.py +30 -0
  410. llama_stack/providers/{adapters → remote}/inference/databricks/__init__.py +4 -5
  411. llama_stack/providers/remote/inference/databricks/config.py +37 -0
  412. llama_stack/providers/remote/inference/databricks/databricks.py +44 -0
  413. llama_stack/providers/{adapters → remote}/inference/fireworks/__init__.py +8 -4
  414. llama_stack/providers/remote/inference/fireworks/config.py +27 -0
  415. llama_stack/providers/remote/inference/fireworks/fireworks.py +27 -0
  416. llama_stack/providers/{adapters/memory/pgvector → remote/inference/gemini}/__init__.py +4 -4
  417. llama_stack/providers/remote/inference/gemini/config.py +28 -0
  418. llama_stack/providers/remote/inference/gemini/gemini.py +82 -0
  419. llama_stack/providers/remote/inference/groq/__init__.py +15 -0
  420. llama_stack/providers/remote/inference/groq/config.py +34 -0
  421. llama_stack/providers/remote/inference/groq/groq.py +18 -0
  422. llama_stack/providers/remote/inference/llama_openai_compat/__init__.py +15 -0
  423. llama_stack/providers/remote/inference/llama_openai_compat/config.py +34 -0
  424. llama_stack/providers/remote/inference/llama_openai_compat/llama.py +46 -0
  425. llama_stack/providers/remote/inference/nvidia/__init__.py +23 -0
  426. llama_stack/providers/remote/inference/nvidia/config.py +64 -0
  427. llama_stack/providers/remote/inference/nvidia/nvidia.py +61 -0
  428. llama_stack/providers/{adapters/safety/sample/config.py → remote/inference/nvidia/utils.py} +3 -4
  429. llama_stack/providers/{impls/vllm → remote/inference/ollama}/__init__.py +4 -6
  430. llama_stack/providers/remote/inference/ollama/config.py +25 -0
  431. llama_stack/providers/remote/inference/ollama/ollama.py +102 -0
  432. llama_stack/providers/{adapters/telemetry/opentelemetry → remote/inference/openai}/__init__.py +4 -4
  433. llama_stack/providers/remote/inference/openai/config.py +39 -0
  434. llama_stack/providers/remote/inference/openai/openai.py +38 -0
  435. llama_stack/providers/remote/inference/passthrough/__init__.py +23 -0
  436. llama_stack/providers/remote/inference/passthrough/config.py +34 -0
  437. llama_stack/providers/remote/inference/passthrough/passthrough.py +122 -0
  438. llama_stack/providers/remote/inference/runpod/__init__.py +16 -0
  439. llama_stack/providers/remote/inference/runpod/config.py +32 -0
  440. llama_stack/providers/remote/inference/runpod/runpod.py +42 -0
  441. llama_stack/providers/remote/inference/sambanova/__init__.py +16 -0
  442. llama_stack/providers/remote/inference/sambanova/config.py +34 -0
  443. llama_stack/providers/remote/inference/sambanova/sambanova.py +28 -0
  444. llama_stack/providers/{adapters → remote}/inference/tgi/__init__.py +3 -4
  445. llama_stack/providers/remote/inference/tgi/config.py +76 -0
  446. llama_stack/providers/remote/inference/tgi/tgi.py +85 -0
  447. llama_stack/providers/{adapters → remote}/inference/together/__init__.py +8 -4
  448. llama_stack/providers/remote/inference/together/config.py +27 -0
  449. llama_stack/providers/remote/inference/together/together.py +102 -0
  450. llama_stack/providers/remote/inference/vertexai/__init__.py +15 -0
  451. llama_stack/providers/remote/inference/vertexai/config.py +48 -0
  452. llama_stack/providers/remote/inference/vertexai/vertexai.py +54 -0
  453. llama_stack/providers/remote/inference/vllm/__init__.py +22 -0
  454. llama_stack/providers/remote/inference/vllm/config.py +59 -0
  455. llama_stack/providers/remote/inference/vllm/vllm.py +111 -0
  456. llama_stack/providers/remote/inference/watsonx/__init__.py +15 -0
  457. llama_stack/providers/remote/inference/watsonx/config.py +45 -0
  458. llama_stack/providers/remote/inference/watsonx/watsonx.py +336 -0
  459. llama_stack/providers/remote/post_training/__init__.py +5 -0
  460. llama_stack/providers/remote/post_training/nvidia/__init__.py +23 -0
  461. llama_stack/providers/remote/post_training/nvidia/config.py +113 -0
  462. llama_stack/providers/remote/post_training/nvidia/models.py +27 -0
  463. llama_stack/providers/remote/post_training/nvidia/post_training.py +430 -0
  464. llama_stack/providers/remote/post_training/nvidia/utils.py +63 -0
  465. llama_stack/providers/remote/safety/__init__.py +5 -0
  466. llama_stack/providers/remote/safety/bedrock/bedrock.py +111 -0
  467. llama_stack/providers/remote/safety/bedrock/config.py +14 -0
  468. llama_stack/providers/{adapters/inference/sample → remote/safety/nvidia}/__init__.py +5 -4
  469. llama_stack/providers/remote/safety/nvidia/config.py +40 -0
  470. llama_stack/providers/remote/safety/nvidia/nvidia.py +161 -0
  471. llama_stack/providers/{adapters/agents/sample → remote/safety/sambanova}/__init__.py +5 -4
  472. llama_stack/providers/remote/safety/sambanova/config.py +37 -0
  473. llama_stack/providers/remote/safety/sambanova/sambanova.py +98 -0
  474. llama_stack/providers/remote/tool_runtime/__init__.py +5 -0
  475. llama_stack/providers/remote/tool_runtime/bing_search/__init__.py +21 -0
  476. llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py +112 -0
  477. llama_stack/providers/remote/tool_runtime/bing_search/config.py +22 -0
  478. llama_stack/providers/remote/tool_runtime/brave_search/__init__.py +20 -0
  479. llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py +148 -0
  480. llama_stack/providers/remote/tool_runtime/brave_search/config.py +27 -0
  481. llama_stack/providers/remote/tool_runtime/model_context_protocol/__init__.py +15 -0
  482. llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py +20 -0
  483. llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py +73 -0
  484. llama_stack/providers/remote/tool_runtime/tavily_search/__init__.py +20 -0
  485. llama_stack/providers/remote/tool_runtime/tavily_search/config.py +27 -0
  486. llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py +84 -0
  487. llama_stack/providers/remote/tool_runtime/wolfram_alpha/__init__.py +22 -0
  488. llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py +21 -0
  489. llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py +140 -0
  490. llama_stack/providers/remote/vector_io/__init__.py +5 -0
  491. llama_stack/providers/remote/vector_io/chroma/__init__.py +17 -0
  492. llama_stack/providers/remote/vector_io/chroma/chroma.py +215 -0
  493. llama_stack/providers/remote/vector_io/chroma/config.py +28 -0
  494. llama_stack/providers/remote/vector_io/milvus/__init__.py +18 -0
  495. llama_stack/providers/remote/vector_io/milvus/config.py +35 -0
  496. llama_stack/providers/remote/vector_io/milvus/milvus.py +375 -0
  497. llama_stack/providers/remote/vector_io/pgvector/__init__.py +17 -0
  498. llama_stack/providers/remote/vector_io/pgvector/config.py +47 -0
  499. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +460 -0
  500. llama_stack/providers/remote/vector_io/qdrant/__init__.py +17 -0
  501. llama_stack/providers/remote/vector_io/qdrant/config.py +37 -0
  502. llama_stack/providers/remote/vector_io/qdrant/qdrant.py +265 -0
  503. llama_stack/providers/remote/vector_io/weaviate/__init__.py +17 -0
  504. llama_stack/providers/remote/vector_io/weaviate/config.py +32 -0
  505. llama_stack/providers/remote/vector_io/weaviate/weaviate.py +393 -0
  506. llama_stack/providers/utils/bedrock/__init__.py +5 -0
  507. llama_stack/providers/utils/bedrock/client.py +74 -0
  508. llama_stack/providers/utils/bedrock/config.py +64 -0
  509. llama_stack/providers/utils/bedrock/refreshable_boto_session.py +112 -0
  510. llama_stack/providers/utils/common/__init__.py +5 -0
  511. llama_stack/providers/utils/common/data_schema_validator.py +103 -0
  512. llama_stack/providers/utils/datasetio/__init__.py +5 -0
  513. llama_stack/providers/utils/datasetio/url_utils.py +47 -0
  514. llama_stack/providers/utils/files/__init__.py +5 -0
  515. llama_stack/providers/utils/files/form_data.py +69 -0
  516. llama_stack/providers/utils/inference/__init__.py +8 -7
  517. llama_stack/providers/utils/inference/embedding_mixin.py +101 -0
  518. llama_stack/providers/utils/inference/inference_store.py +264 -0
  519. llama_stack/providers/utils/inference/litellm_openai_mixin.py +336 -0
  520. llama_stack/providers/utils/inference/model_registry.py +173 -23
  521. llama_stack/providers/utils/inference/openai_compat.py +1261 -49
  522. llama_stack/providers/utils/inference/openai_mixin.py +506 -0
  523. llama_stack/providers/utils/inference/prompt_adapter.py +365 -67
  524. llama_stack/providers/utils/kvstore/api.py +6 -6
  525. llama_stack/providers/utils/kvstore/config.py +28 -48
  526. llama_stack/providers/utils/kvstore/kvstore.py +61 -15
  527. llama_stack/providers/utils/kvstore/mongodb/__init__.py +9 -0
  528. llama_stack/providers/utils/kvstore/mongodb/mongodb.py +82 -0
  529. llama_stack/providers/utils/kvstore/postgres/__init__.py +7 -0
  530. llama_stack/providers/utils/kvstore/postgres/postgres.py +114 -0
  531. llama_stack/providers/utils/kvstore/redis/redis.py +33 -9
  532. llama_stack/providers/utils/kvstore/sqlite/config.py +2 -1
  533. llama_stack/providers/utils/kvstore/sqlite/sqlite.py +123 -22
  534. llama_stack/providers/utils/memory/file_utils.py +1 -1
  535. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +1304 -0
  536. llama_stack/providers/utils/memory/vector_store.py +220 -82
  537. llama_stack/providers/utils/pagination.py +43 -0
  538. llama_stack/providers/utils/responses/__init__.py +5 -0
  539. llama_stack/providers/utils/responses/responses_store.py +292 -0
  540. llama_stack/providers/utils/scheduler.py +270 -0
  541. llama_stack/providers/utils/scoring/__init__.py +5 -0
  542. llama_stack/providers/utils/scoring/aggregation_utils.py +75 -0
  543. llama_stack/providers/utils/scoring/base_scoring_fn.py +114 -0
  544. llama_stack/providers/utils/scoring/basic_scoring_utils.py +26 -0
  545. llama_stack/providers/utils/sqlstore/__init__.py +5 -0
  546. llama_stack/providers/utils/sqlstore/api.py +128 -0
  547. llama_stack/providers/utils/sqlstore/authorized_sqlstore.py +319 -0
  548. llama_stack/providers/utils/sqlstore/sqlalchemy_sqlstore.py +343 -0
  549. llama_stack/providers/utils/sqlstore/sqlstore.py +70 -0
  550. llama_stack/providers/utils/telemetry/trace_protocol.py +142 -0
  551. llama_stack/providers/utils/telemetry/tracing.py +192 -53
  552. llama_stack/providers/utils/tools/__init__.py +5 -0
  553. llama_stack/providers/utils/tools/mcp.py +148 -0
  554. llama_stack/providers/utils/tools/ttl_dict.py +70 -0
  555. llama_stack/providers/utils/vector_io/__init__.py +5 -0
  556. llama_stack/providers/utils/vector_io/vector_utils.py +156 -0
  557. llama_stack/schema_utils.py +118 -0
  558. llama_stack/strong_typing/__init__.py +19 -0
  559. llama_stack/strong_typing/auxiliary.py +228 -0
  560. llama_stack/strong_typing/classdef.py +440 -0
  561. llama_stack/strong_typing/core.py +46 -0
  562. llama_stack/strong_typing/deserializer.py +877 -0
  563. llama_stack/strong_typing/docstring.py +409 -0
  564. llama_stack/strong_typing/exception.py +23 -0
  565. llama_stack/strong_typing/inspection.py +1085 -0
  566. llama_stack/strong_typing/mapping.py +40 -0
  567. llama_stack/strong_typing/name.py +182 -0
  568. llama_stack/strong_typing/py.typed +0 -0
  569. llama_stack/strong_typing/schema.py +792 -0
  570. llama_stack/strong_typing/serialization.py +97 -0
  571. llama_stack/strong_typing/serializer.py +500 -0
  572. llama_stack/strong_typing/slots.py +27 -0
  573. llama_stack/strong_typing/topological.py +89 -0
  574. llama_stack/testing/__init__.py +5 -0
  575. llama_stack/testing/api_recorder.py +956 -0
  576. llama_stack/ui/node_modules/flatted/python/flatted.py +149 -0
  577. llama_stack-0.3.4.dist-info/METADATA +261 -0
  578. llama_stack-0.3.4.dist-info/RECORD +625 -0
  579. {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/WHEEL +1 -1
  580. llama_stack/apis/agents/client.py +0 -292
  581. llama_stack/apis/agents/event_logger.py +0 -184
  582. llama_stack/apis/batch_inference/batch_inference.py +0 -72
  583. llama_stack/apis/common/deployment_types.py +0 -31
  584. llama_stack/apis/dataset/dataset.py +0 -63
  585. llama_stack/apis/evals/evals.py +0 -122
  586. llama_stack/apis/inference/client.py +0 -197
  587. llama_stack/apis/inspect/client.py +0 -82
  588. llama_stack/apis/memory/client.py +0 -155
  589. llama_stack/apis/memory/memory.py +0 -65
  590. llama_stack/apis/memory_banks/__init__.py +0 -7
  591. llama_stack/apis/memory_banks/client.py +0 -101
  592. llama_stack/apis/memory_banks/memory_banks.py +0 -78
  593. llama_stack/apis/models/client.py +0 -83
  594. llama_stack/apis/reward_scoring/__init__.py +0 -7
  595. llama_stack/apis/reward_scoring/reward_scoring.py +0 -55
  596. llama_stack/apis/safety/client.py +0 -105
  597. llama_stack/apis/shields/client.py +0 -79
  598. llama_stack/cli/download.py +0 -340
  599. llama_stack/cli/model/describe.py +0 -82
  600. llama_stack/cli/model/download.py +0 -24
  601. llama_stack/cli/model/list.py +0 -62
  602. llama_stack/cli/model/model.py +0 -34
  603. llama_stack/cli/model/prompt_format.py +0 -112
  604. llama_stack/cli/model/safety_models.py +0 -52
  605. llama_stack/cli/stack/build.py +0 -299
  606. llama_stack/cli/stack/configure.py +0 -178
  607. llama_stack/distribution/build.py +0 -123
  608. llama_stack/distribution/build_conda_env.sh +0 -136
  609. llama_stack/distribution/build_container.sh +0 -142
  610. llama_stack/distribution/common.sh +0 -40
  611. llama_stack/distribution/configure_container.sh +0 -47
  612. llama_stack/distribution/datatypes.py +0 -139
  613. llama_stack/distribution/distribution.py +0 -58
  614. llama_stack/distribution/inspect.py +0 -67
  615. llama_stack/distribution/request_headers.py +0 -57
  616. llama_stack/distribution/resolver.py +0 -323
  617. llama_stack/distribution/routers/__init__.py +0 -48
  618. llama_stack/distribution/routers/routers.py +0 -158
  619. llama_stack/distribution/routers/routing_tables.py +0 -173
  620. llama_stack/distribution/server/endpoints.py +0 -48
  621. llama_stack/distribution/server/server.py +0 -343
  622. llama_stack/distribution/start_conda_env.sh +0 -42
  623. llama_stack/distribution/start_container.sh +0 -64
  624. llama_stack/distribution/templates/local-bedrock-conda-example-build.yaml +0 -10
  625. llama_stack/distribution/templates/local-build.yaml +0 -10
  626. llama_stack/distribution/templates/local-databricks-build.yaml +0 -10
  627. llama_stack/distribution/templates/local-fireworks-build.yaml +0 -10
  628. llama_stack/distribution/templates/local-hf-endpoint-build.yaml +0 -10
  629. llama_stack/distribution/templates/local-hf-serverless-build.yaml +0 -10
  630. llama_stack/distribution/templates/local-ollama-build.yaml +0 -10
  631. llama_stack/distribution/templates/local-tgi-build.yaml +0 -10
  632. llama_stack/distribution/templates/local-together-build.yaml +0 -10
  633. llama_stack/distribution/templates/local-vllm-build.yaml +0 -10
  634. llama_stack/distribution/utils/exec.py +0 -105
  635. llama_stack/providers/adapters/agents/sample/sample.py +0 -18
  636. llama_stack/providers/adapters/inference/bedrock/bedrock.py +0 -451
  637. llama_stack/providers/adapters/inference/bedrock/config.py +0 -55
  638. llama_stack/providers/adapters/inference/databricks/config.py +0 -21
  639. llama_stack/providers/adapters/inference/databricks/databricks.py +0 -125
  640. llama_stack/providers/adapters/inference/fireworks/config.py +0 -20
  641. llama_stack/providers/adapters/inference/fireworks/fireworks.py +0 -130
  642. llama_stack/providers/adapters/inference/ollama/__init__.py +0 -19
  643. llama_stack/providers/adapters/inference/ollama/ollama.py +0 -175
  644. llama_stack/providers/adapters/inference/sample/sample.py +0 -23
  645. llama_stack/providers/adapters/inference/tgi/config.py +0 -43
  646. llama_stack/providers/adapters/inference/tgi/tgi.py +0 -200
  647. llama_stack/providers/adapters/inference/together/config.py +0 -22
  648. llama_stack/providers/adapters/inference/together/together.py +0 -143
  649. llama_stack/providers/adapters/memory/chroma/chroma.py +0 -157
  650. llama_stack/providers/adapters/memory/pgvector/config.py +0 -17
  651. llama_stack/providers/adapters/memory/pgvector/pgvector.py +0 -211
  652. llama_stack/providers/adapters/memory/sample/sample.py +0 -23
  653. llama_stack/providers/adapters/memory/weaviate/__init__.py +0 -15
  654. llama_stack/providers/adapters/memory/weaviate/weaviate.py +0 -190
  655. llama_stack/providers/adapters/safety/bedrock/bedrock.py +0 -113
  656. llama_stack/providers/adapters/safety/bedrock/config.py +0 -16
  657. llama_stack/providers/adapters/safety/sample/sample.py +0 -23
  658. llama_stack/providers/adapters/safety/together/__init__.py +0 -18
  659. llama_stack/providers/adapters/safety/together/config.py +0 -26
  660. llama_stack/providers/adapters/safety/together/together.py +0 -101
  661. llama_stack/providers/adapters/telemetry/opentelemetry/config.py +0 -12
  662. llama_stack/providers/adapters/telemetry/opentelemetry/opentelemetry.py +0 -201
  663. llama_stack/providers/adapters/telemetry/sample/__init__.py +0 -17
  664. llama_stack/providers/adapters/telemetry/sample/config.py +0 -12
  665. llama_stack/providers/adapters/telemetry/sample/sample.py +0 -18
  666. llama_stack/providers/impls/meta_reference/agents/agent_instance.py +0 -844
  667. llama_stack/providers/impls/meta_reference/agents/agents.py +0 -161
  668. llama_stack/providers/impls/meta_reference/agents/persistence.py +0 -84
  669. llama_stack/providers/impls/meta_reference/agents/rag/context_retriever.py +0 -74
  670. llama_stack/providers/impls/meta_reference/agents/safety.py +0 -57
  671. llama_stack/providers/impls/meta_reference/agents/tests/code_execution.py +0 -93
  672. llama_stack/providers/impls/meta_reference/agents/tests/test_chat_agent.py +0 -305
  673. llama_stack/providers/impls/meta_reference/agents/tools/base.py +0 -20
  674. llama_stack/providers/impls/meta_reference/agents/tools/builtin.py +0 -375
  675. llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/code_env_prefix.py +0 -133
  676. llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/code_execution.py +0 -256
  677. llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/matplotlib_custom_backend.py +0 -87
  678. llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/utils.py +0 -21
  679. llama_stack/providers/impls/meta_reference/agents/tools/safety.py +0 -43
  680. llama_stack/providers/impls/meta_reference/codeshield/code_scanner.py +0 -58
  681. llama_stack/providers/impls/meta_reference/inference/config.py +0 -45
  682. llama_stack/providers/impls/meta_reference/inference/generation.py +0 -376
  683. llama_stack/providers/impls/meta_reference/inference/inference.py +0 -280
  684. llama_stack/providers/impls/meta_reference/inference/model_parallel.py +0 -99
  685. llama_stack/providers/impls/meta_reference/inference/quantization/fp8_impls.py +0 -184
  686. llama_stack/providers/impls/meta_reference/inference/quantization/fp8_txest_disabled.py +0 -76
  687. llama_stack/providers/impls/meta_reference/inference/quantization/loader.py +0 -97
  688. llama_stack/providers/impls/meta_reference/inference/quantization/scripts/quantize_checkpoint.py +0 -161
  689. llama_stack/providers/impls/meta_reference/memory/__init__.py +0 -19
  690. llama_stack/providers/impls/meta_reference/memory/faiss.py +0 -113
  691. llama_stack/providers/impls/meta_reference/safety/__init__.py +0 -17
  692. llama_stack/providers/impls/meta_reference/safety/base.py +0 -57
  693. llama_stack/providers/impls/meta_reference/safety/config.py +0 -48
  694. llama_stack/providers/impls/meta_reference/safety/llama_guard.py +0 -268
  695. llama_stack/providers/impls/meta_reference/safety/prompt_guard.py +0 -145
  696. llama_stack/providers/impls/meta_reference/safety/safety.py +0 -112
  697. llama_stack/providers/impls/meta_reference/telemetry/console.py +0 -89
  698. llama_stack/providers/impls/vllm/config.py +0 -35
  699. llama_stack/providers/impls/vllm/vllm.py +0 -241
  700. llama_stack/providers/registry/memory.py +0 -78
  701. llama_stack/providers/registry/telemetry.py +0 -44
  702. llama_stack/providers/tests/agents/test_agents.py +0 -210
  703. llama_stack/providers/tests/inference/test_inference.py +0 -257
  704. llama_stack/providers/tests/inference/test_prompt_adapter.py +0 -126
  705. llama_stack/providers/tests/memory/test_memory.py +0 -136
  706. llama_stack/providers/tests/resolver.py +0 -100
  707. llama_stack/providers/tests/safety/test_safety.py +0 -77
  708. llama_stack-0.0.42.dist-info/METADATA +0 -137
  709. llama_stack-0.0.42.dist-info/RECORD +0 -256
  710. /llama_stack/{distribution → core}/__init__.py +0 -0
  711. /llama_stack/{distribution/server → core/access_control}/__init__.py +0 -0
  712. /llama_stack/{distribution/utils → core/conversations}/__init__.py +0 -0
  713. /llama_stack/{providers/adapters → core/prompts}/__init__.py +0 -0
  714. /llama_stack/{providers/adapters/agents → core/routing_tables}/__init__.py +0 -0
  715. /llama_stack/{providers/adapters/inference → core/server}/__init__.py +0 -0
  716. /llama_stack/{providers/adapters/memory → core/storage}/__init__.py +0 -0
  717. /llama_stack/{providers/adapters/safety → core/ui}/__init__.py +0 -0
  718. /llama_stack/{providers/adapters/telemetry → core/ui/modules}/__init__.py +0 -0
  719. /llama_stack/{providers/impls → core/ui/page}/__init__.py +0 -0
  720. /llama_stack/{providers/impls/meta_reference → core/ui/page/distribution}/__init__.py +0 -0
  721. /llama_stack/{providers/impls/meta_reference/agents/rag → core/ui/page/evaluations}/__init__.py +0 -0
  722. /llama_stack/{providers/impls/meta_reference/agents/tests → core/ui/page/playground}/__init__.py +0 -0
  723. /llama_stack/{providers/impls/meta_reference/agents/tools → core/utils}/__init__.py +0 -0
  724. /llama_stack/{distribution → core}/utils/dynamic.py +0 -0
  725. /llama_stack/{distribution → core}/utils/serialize.py +0 -0
  726. /llama_stack/{providers/impls/meta_reference/agents/tools/ipython_tool → distributions}/__init__.py +0 -0
  727. /llama_stack/{providers/impls/meta_reference/inference/quantization → models}/__init__.py +0 -0
  728. /llama_stack/{providers/impls/meta_reference/inference/quantization/scripts → models/llama}/__init__.py +0 -0
  729. /llama_stack/{providers/tests → models/llama/llama3}/__init__.py +0 -0
  730. /llama_stack/{providers/tests/agents → models/llama/llama3/quantization}/__init__.py +0 -0
  731. /llama_stack/{providers/tests/inference → models/llama/llama3_2}/__init__.py +0 -0
  732. /llama_stack/{providers/tests/memory → models/llama/llama3_3}/__init__.py +0 -0
  733. /llama_stack/{providers/tests/safety → models/llama/llama4}/__init__.py +0 -0
  734. /llama_stack/{scripts → models/llama/llama4/prompt_templates}/__init__.py +0 -0
  735. /llama_stack/providers/{adapters → remote}/safety/bedrock/__init__.py +0 -0
  736. {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/entry_points.txt +0 -0
  737. {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info/licenses}/LICENSE +0 -0
  738. {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/top_level.txt +0 -0
@@ -3,42 +3,190 @@
3
3
  #
4
4
  # This source code is licensed under the terms described in the LICENSE file in
5
5
  # the root directory of this source tree.
6
+ import json
7
+ import time
8
+ import uuid
9
+ import warnings
10
+ from collections.abc import AsyncGenerator, AsyncIterator, Awaitable, Iterable
11
+ from typing import (
12
+ Any,
13
+ )
6
14
 
7
- from typing import AsyncGenerator, Optional
15
+ from openai import AsyncStream
16
+ from openai.types.chat import (
17
+ ChatCompletionAssistantMessageParam as OpenAIChatCompletionAssistantMessage,
18
+ )
19
+ from openai.types.chat import (
20
+ ChatCompletionChunk as OpenAIChatCompletionChunk,
21
+ )
22
+ from openai.types.chat import (
23
+ ChatCompletionContentPartImageParam as OpenAIChatCompletionContentPartImageParam,
24
+ )
25
+ from openai.types.chat import (
26
+ ChatCompletionContentPartParam as OpenAIChatCompletionContentPartParam,
27
+ )
28
+ from openai.types.chat import (
29
+ ChatCompletionContentPartTextParam as OpenAIChatCompletionContentPartTextParam,
30
+ )
8
31
 
9
- from llama_models.llama3.api.chat_format import ChatFormat
10
-
11
- from llama_models.llama3.api.datatypes import StopReason
32
+ try:
33
+ from openai.types.chat import (
34
+ ChatCompletionMessageFunctionToolCall as OpenAIChatCompletionMessageFunctionToolCall,
35
+ )
36
+ except ImportError:
37
+ from openai.types.chat.chat_completion_message_tool_call import (
38
+ ChatCompletionMessageToolCall as OpenAIChatCompletionMessageFunctionToolCall,
39
+ )
40
+ from openai.types.chat import (
41
+ ChatCompletionMessageParam as OpenAIChatCompletionMessage,
42
+ )
43
+ from openai.types.chat import (
44
+ ChatCompletionMessageToolCall,
45
+ )
46
+ from openai.types.chat import (
47
+ ChatCompletionSystemMessageParam as OpenAIChatCompletionSystemMessage,
48
+ )
49
+ from openai.types.chat import (
50
+ ChatCompletionToolMessageParam as OpenAIChatCompletionToolMessage,
51
+ )
52
+ from openai.types.chat import (
53
+ ChatCompletionUserMessageParam as OpenAIChatCompletionUserMessage,
54
+ )
55
+ from openai.types.chat.chat_completion import (
56
+ Choice as OpenAIChoice,
57
+ )
58
+ from openai.types.chat.chat_completion import (
59
+ ChoiceLogprobs as OpenAIChoiceLogprobs, # same as chat_completion_chunk ChoiceLogprobs
60
+ )
61
+ from openai.types.chat.chat_completion_chunk import (
62
+ Choice as OpenAIChatCompletionChunkChoice,
63
+ )
64
+ from openai.types.chat.chat_completion_chunk import (
65
+ ChoiceDelta as OpenAIChoiceDelta,
66
+ )
67
+ from openai.types.chat.chat_completion_chunk import (
68
+ ChoiceDeltaToolCall as OpenAIChoiceDeltaToolCall,
69
+ )
70
+ from openai.types.chat.chat_completion_chunk import (
71
+ ChoiceDeltaToolCallFunction as OpenAIChoiceDeltaToolCallFunction,
72
+ )
73
+ from openai.types.chat.chat_completion_content_part_image_param import (
74
+ ImageURL as OpenAIImageURL,
75
+ )
76
+ from openai.types.chat.chat_completion_message_tool_call import (
77
+ Function as OpenAIFunction,
78
+ )
79
+ from pydantic import BaseModel
12
80
 
13
- from llama_stack.apis.inference import * # noqa: F403
81
+ from llama_stack.apis.common.content_types import (
82
+ URL,
83
+ ImageContentItem,
84
+ InterleavedContent,
85
+ TextContentItem,
86
+ TextDelta,
87
+ ToolCallDelta,
88
+ ToolCallParseStatus,
89
+ _URLOrData,
90
+ )
91
+ from llama_stack.apis.inference import (
92
+ ChatCompletionRequest,
93
+ ChatCompletionResponse,
94
+ ChatCompletionResponseEvent,
95
+ ChatCompletionResponseEventType,
96
+ ChatCompletionResponseStreamChunk,
97
+ CompletionMessage,
98
+ CompletionResponse,
99
+ CompletionResponseStreamChunk,
100
+ GreedySamplingStrategy,
101
+ JsonSchemaResponseFormat,
102
+ Message,
103
+ OpenAIChatCompletion,
104
+ OpenAIMessageParam,
105
+ OpenAIResponseFormatParam,
106
+ SamplingParams,
107
+ SystemMessage,
108
+ TokenLogProbs,
109
+ ToolChoice,
110
+ ToolConfig,
111
+ ToolResponseMessage,
112
+ TopKSamplingStrategy,
113
+ TopPSamplingStrategy,
114
+ UserMessage,
115
+ )
116
+ from llama_stack.apis.inference import (
117
+ OpenAIChoice as OpenAIChatCompletionChoice,
118
+ )
119
+ from llama_stack.log import get_logger
120
+ from llama_stack.models.llama.datatypes import (
121
+ BuiltinTool,
122
+ StopReason,
123
+ ToolCall,
124
+ ToolDefinition,
125
+ )
126
+ from llama_stack.providers.utils.inference.prompt_adapter import (
127
+ convert_image_content_to_url,
128
+ decode_assistant_message,
129
+ )
14
130
 
15
- from pydantic import BaseModel
131
+ logger = get_logger(name=__name__, category="providers::utils")
16
132
 
17
133
 
18
134
  class OpenAICompatCompletionChoiceDelta(BaseModel):
19
135
  content: str
20
136
 
21
137
 
138
+ class OpenAICompatLogprobs(BaseModel):
139
+ text_offset: list[int] | None = None
140
+
141
+ token_logprobs: list[float] | None = None
142
+
143
+ tokens: list[str] | None = None
144
+
145
+ top_logprobs: list[dict[str, float]] | None = None
146
+
147
+
22
148
  class OpenAICompatCompletionChoice(BaseModel):
23
- finish_reason: Optional[str] = None
24
- text: Optional[str] = None
25
- delta: Optional[OpenAICompatCompletionChoiceDelta] = None
149
+ finish_reason: str | None = None
150
+ text: str | None = None
151
+ delta: OpenAICompatCompletionChoiceDelta | None = None
152
+ logprobs: OpenAICompatLogprobs | None = None
26
153
 
27
154
 
28
155
  class OpenAICompatCompletionResponse(BaseModel):
29
- choices: List[OpenAICompatCompletionChoice]
156
+ choices: list[OpenAICompatCompletionChoice]
30
157
 
31
158
 
32
- def get_sampling_options(request: ChatCompletionRequest) -> dict:
159
+ def get_sampling_strategy_options(params: SamplingParams) -> dict:
33
160
  options = {}
34
- if params := request.sampling_params:
35
- for attr in {"temperature", "top_p", "top_k", "max_tokens"}:
36
- if getattr(params, attr):
37
- options[attr] = getattr(params, attr)
161
+ if isinstance(params.strategy, GreedySamplingStrategy):
162
+ options["temperature"] = 0.0
163
+ elif isinstance(params.strategy, TopPSamplingStrategy):
164
+ options["temperature"] = params.strategy.temperature
165
+ options["top_p"] = params.strategy.top_p
166
+ elif isinstance(params.strategy, TopKSamplingStrategy):
167
+ options["top_k"] = params.strategy.top_k
168
+ else:
169
+ raise ValueError(f"Unsupported sampling strategy: {params.strategy}")
170
+
171
+ return options
172
+
173
+
174
+ def get_sampling_options(params: SamplingParams | None) -> dict:
175
+ if not params:
176
+ return {}
177
+
178
+ options = {}
179
+ if params:
180
+ options.update(get_sampling_strategy_options(params))
181
+ if params.max_tokens:
182
+ options["max_tokens"] = params.max_tokens
38
183
 
39
184
  if params.repetition_penalty is not None and params.repetition_penalty != 1.0:
40
185
  options["repeat_penalty"] = params.repetition_penalty
41
186
 
187
+ if params.stop is not None:
188
+ options["stop"] = params.stop
189
+
42
190
  return options
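Illustrative usage (not part of the package diff): a minimal sketch of how a SamplingParams value maps onto the provider-style options dict via get_sampling_options(); field names follow the definitions referenced just above, and the concrete values are placeholders.

from llama_stack.apis.inference import SamplingParams, TopPSamplingStrategy

# Illustration only: top-p sampling with a token budget and a repetition penalty.
params = SamplingParams(
    strategy=TopPSamplingStrategy(temperature=0.7, top_p=0.9),
    max_tokens=256,
    repetition_penalty=1.1,
    stop=["</answer>"],
)
options = get_sampling_options(params)
# -> {"temperature": 0.7, "top_p": 0.9, "max_tokens": 256,
#     "repeat_penalty": 1.1, "stop": ["</answer>"]}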
43
191
 
44
192
 
@@ -46,46 +194,188 @@ def text_from_choice(choice) -> str:
46
194
  if hasattr(choice, "delta") and choice.delta:
47
195
  return choice.delta.content
48
196
 
197
+ if hasattr(choice, "message"):
198
+ return choice.message.content
199
+
49
200
  return choice.text
50
201
 
51
202
 
203
+ def get_stop_reason(finish_reason: str) -> StopReason:
204
+ if finish_reason in ["stop", "eos"]:
205
+ return StopReason.end_of_turn
206
+ elif finish_reason == "eom":
207
+ return StopReason.end_of_message
208
+ elif finish_reason == "length":
209
+ return StopReason.out_of_tokens
210
+
211
+ return StopReason.out_of_tokens
212
+
213
+
214
+ def convert_openai_completion_logprobs(
215
+ logprobs: OpenAICompatLogprobs | None,
216
+ ) -> list[TokenLogProbs] | None:
217
+ if not logprobs:
218
+ return None
219
+ if hasattr(logprobs, "top_logprobs"):
220
+ return [TokenLogProbs(logprobs_by_token=x) for x in logprobs.top_logprobs]
221
+
222
+ # Together supports logprobs with top_k=1 only. This means for each token position,
223
+ # they return only the logprobs for the selected token (vs. the top n most likely tokens).
224
+ # Here we construct the response by matching the selected token with the logprobs.
225
+ if logprobs.tokens and logprobs.token_logprobs:
226
+ return [
227
+ TokenLogProbs(logprobs_by_token={token: token_lp})
228
+ for token, token_lp in zip(logprobs.tokens, logprobs.token_logprobs, strict=False)
229
+ ]
230
+ return None
231
+
232
+
233
+ def convert_openai_completion_logprobs_stream(text: str, logprobs: float | OpenAICompatLogprobs | None):
234
+ if logprobs is None:
235
+ return None
236
+ if isinstance(logprobs, float):
237
+ # Adapt response from Together CompletionChoicesChunk
238
+ return [TokenLogProbs(logprobs_by_token={text: logprobs})]
239
+ if hasattr(logprobs, "top_logprobs"):
240
+ return [TokenLogProbs(logprobs_by_token=x) for x in logprobs.top_logprobs]
241
+ return None
242
+
243
+
244
+ def process_completion_response(
245
+ response: OpenAICompatCompletionResponse,
246
+ ) -> CompletionResponse:
247
+ choice = response.choices[0]
248
+ # drop the suffix <|eot_id|> if present and return stop reason as end of turn
249
+ if choice.text.endswith("<|eot_id|>"):
250
+ return CompletionResponse(
251
+ stop_reason=StopReason.end_of_turn,
252
+ content=choice.text[: -len("<|eot_id|>")],
253
+ logprobs=convert_openai_completion_logprobs(choice.logprobs),
254
+ )
255
+ # drop the suffix <|eom_id|> if present and return stop reason as end of message
256
+ if choice.text.endswith("<|eom_id|>"):
257
+ return CompletionResponse(
258
+ stop_reason=StopReason.end_of_message,
259
+ content=choice.text[: -len("<|eom_id|>")],
260
+ logprobs=convert_openai_completion_logprobs(choice.logprobs),
261
+ )
262
+ return CompletionResponse(
263
+ stop_reason=get_stop_reason(choice.finish_reason),
264
+ content=choice.text,
265
+ logprobs=convert_openai_completion_logprobs(choice.logprobs),
266
+ )
267
+
268
+
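Illustrative usage (not part of the package diff): process_completion_response() strips the Llama stop markers and maps them to a StopReason, as sketched below with placeholder text.

resp = OpenAICompatCompletionResponse(
    choices=[OpenAICompatCompletionChoice(finish_reason="stop", text="The answer is 4.<|eot_id|>")]
)
out = process_completion_response(resp)
# out.content == "The answer is 4."   (the <|eot_id|> suffix is dropped)
# out.stop_reason == StopReason.end_of_turn
# out.logprobs is None                (no logprobs were attached to the choice)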
52
269
  def process_chat_completion_response(
53
- request: ChatCompletionRequest,
54
270
  response: OpenAICompatCompletionResponse,
55
- formatter: ChatFormat,
271
+ request: ChatCompletionRequest,
56
272
  ) -> ChatCompletionResponse:
57
273
  choice = response.choices[0]
274
+ if choice.finish_reason == "tool_calls":
275
+ if not choice.message or not choice.message.tool_calls:
276
+ raise ValueError("Tool calls are not present in the response")
58
277
 
59
- stop_reason = None
60
- if reason := choice.finish_reason:
61
- if reason in ["stop", "eos"]:
62
- stop_reason = StopReason.end_of_turn
63
- elif reason == "eom":
64
- stop_reason = StopReason.end_of_message
65
- elif reason == "length":
66
- stop_reason = StopReason.out_of_tokens
278
+ tool_calls = [convert_tool_call(tool_call) for tool_call in choice.message.tool_calls]
279
+ if any(isinstance(tool_call, UnparseableToolCall) for tool_call in tool_calls):
280
+ # If we couldn't parse a tool call, jsonify the tool calls and return them
281
+ return ChatCompletionResponse(
282
+ completion_message=CompletionMessage(
283
+ stop_reason=StopReason.end_of_turn,
284
+ content=json.dumps(tool_calls, default=lambda x: x.model_dump()),
285
+ ),
286
+ logprobs=None,
287
+ )
288
+ else:
289
+ # Otherwise, return tool calls as normal
290
+ return ChatCompletionResponse(
291
+ completion_message=CompletionMessage(
292
+ tool_calls=tool_calls,
293
+ stop_reason=StopReason.end_of_turn,
294
+ # Content is not optional
295
+ content="",
296
+ ),
297
+ logprobs=None,
298
+ )
67
299
 
68
- if stop_reason is None:
69
- stop_reason = StopReason.out_of_tokens
300
+ # TODO: This does not work well with tool calls for vLLM remote provider
301
+ # Ref: https://github.com/meta-llama/llama-stack/issues/1058
302
+ raw_message = decode_assistant_message(text_from_choice(choice), get_stop_reason(choice.finish_reason))
303
+
304
+ # NOTE: If we do not set tools in chat-completion request, we should not
305
+ # expect the ToolCall in the response. Instead, we should return the raw
306
+ # response from the model.
307
+ if raw_message.tool_calls:
308
+ if not request.tools:
309
+ raw_message.tool_calls = []
310
+ raw_message.content = text_from_choice(choice)
311
+ else:
312
+ # only return tool_calls if provided in the request
313
+ new_tool_calls = []
314
+ request_tools = {t.tool_name: t for t in request.tools}
315
+ for t in raw_message.tool_calls:
316
+ if t.tool_name in request_tools:
317
+ new_tool_calls.append(t)
318
+ else:
319
+ logger.warning(f"Tool {t.tool_name} not found in request tools")
320
+
321
+ if len(new_tool_calls) < len(raw_message.tool_calls):
322
+ raw_message.tool_calls = new_tool_calls
323
+ raw_message.content = text_from_choice(choice)
70
324
 
71
- completion_message = formatter.decode_assistant_message_from_content(
72
- text_from_choice(choice), stop_reason
73
- )
74
325
  return ChatCompletionResponse(
75
- completion_message=completion_message,
326
+ completion_message=CompletionMessage(
327
+ content=raw_message.content,
328
+ stop_reason=raw_message.stop_reason,
329
+ tool_calls=raw_message.tool_calls,
330
+ ),
76
331
  logprobs=None,
77
332
  )
78
333
 
79
334
 
335
+ async def process_completion_stream_response(
336
+ stream: AsyncGenerator[OpenAICompatCompletionResponse, None],
337
+ ) -> AsyncGenerator[CompletionResponseStreamChunk, None]:
338
+ stop_reason = None
339
+
340
+ async for chunk in stream:
341
+ choice = chunk.choices[0]
342
+ finish_reason = choice.finish_reason
343
+
344
+ text = text_from_choice(choice)
345
+ if text == "<|eot_id|>":
346
+ stop_reason = StopReason.end_of_turn
347
+ text = ""
348
+ continue
349
+ elif text == "<|eom_id|>":
350
+ stop_reason = StopReason.end_of_message
351
+ text = ""
352
+ continue
353
+ yield CompletionResponseStreamChunk(
354
+ delta=text,
355
+ stop_reason=stop_reason,
356
+ logprobs=convert_openai_completion_logprobs_stream(text, choice.logprobs),
357
+ )
358
+ if finish_reason:
359
+ if finish_reason in ["stop", "eos", "eos_token"]:
360
+ stop_reason = StopReason.end_of_turn
361
+ elif finish_reason == "length":
362
+ stop_reason = StopReason.out_of_tokens
363
+ break
364
+
365
+ yield CompletionResponseStreamChunk(
366
+ delta="",
367
+ stop_reason=stop_reason,
368
+ )
369
+
370
+
80
371
  async def process_chat_completion_stream_response(
81
- request: ChatCompletionRequest,
82
372
  stream: AsyncGenerator[OpenAICompatCompletionResponse, None],
83
- formatter: ChatFormat,
84
- ) -> AsyncGenerator:
373
+ request: ChatCompletionRequest,
374
+ ) -> AsyncGenerator[ChatCompletionResponseStreamChunk, None]:
85
375
  yield ChatCompletionResponseStreamChunk(
86
376
  event=ChatCompletionResponseEvent(
87
377
  event_type=ChatCompletionResponseEventType.start,
88
- delta="",
378
+ delta=TextDelta(text=""),
89
379
  )
90
380
  )
91
381
 
@@ -105,6 +395,10 @@ async def process_chat_completion_stream_response(
105
395
  break
106
396
 
107
397
  text = text_from_choice(choice)
398
+ if not text:
399
+ # Sometimes you get empty chunks from providers
400
+ continue
401
+
108
402
  # check if it's a tool call (i.e., it starts with <|python_tag|>)
109
403
  if not ipython and text.startswith("<|python_tag|>"):
110
404
  ipython = True
@@ -112,7 +406,7 @@ async def process_chat_completion_stream_response(
112
406
  event=ChatCompletionResponseEvent(
113
407
  event_type=ChatCompletionResponseEventType.progress,
114
408
  delta=ToolCallDelta(
115
- content="",
409
+ tool_call="",
116
410
  parse_status=ToolCallParseStatus.started,
117
411
  ),
118
412
  )
@@ -132,7 +426,7 @@ async def process_chat_completion_stream_response(
132
426
  if ipython:
133
427
  buffer += text
134
428
  delta = ToolCallDelta(
135
- content=text,
429
+ tool_call=text,
136
430
  parse_status=ToolCallParseStatus.in_progress,
137
431
  )
138
432
 
@@ -148,42 +442,960 @@ async def process_chat_completion_stream_response(
148
442
  yield ChatCompletionResponseStreamChunk(
149
443
  event=ChatCompletionResponseEvent(
150
444
  event_type=ChatCompletionResponseEventType.progress,
151
- delta=text,
445
+ delta=TextDelta(text=text),
152
446
  stop_reason=stop_reason,
153
447
  )
154
448
  )
155
449
 
156
450
  # parse tool calls and report errors
157
- message = formatter.decode_assistant_message_from_content(buffer, stop_reason)
451
+ message = decode_assistant_message(buffer, stop_reason)
452
+
158
453
  parsed_tool_calls = len(message.tool_calls) > 0
159
454
  if ipython and not parsed_tool_calls:
160
455
  yield ChatCompletionResponseStreamChunk(
161
456
  event=ChatCompletionResponseEvent(
162
457
  event_type=ChatCompletionResponseEventType.progress,
163
458
  delta=ToolCallDelta(
164
- content="",
165
- parse_status=ToolCallParseStatus.failure,
459
+ tool_call="",
460
+ parse_status=ToolCallParseStatus.failed,
166
461
  ),
167
462
  stop_reason=stop_reason,
168
463
  )
169
464
  )
170
465
 
466
+ request_tools = {t.tool_name: t for t in request.tools}
171
467
  for tool_call in message.tool_calls:
172
- yield ChatCompletionResponseStreamChunk(
173
- event=ChatCompletionResponseEvent(
174
- event_type=ChatCompletionResponseEventType.progress,
175
- delta=ToolCallDelta(
176
- content=tool_call,
177
- parse_status=ToolCallParseStatus.success,
468
+ if tool_call.tool_name in request_tools:
469
+ yield ChatCompletionResponseStreamChunk(
470
+ event=ChatCompletionResponseEvent(
471
+ event_type=ChatCompletionResponseEventType.progress,
472
+ delta=ToolCallDelta(
473
+ tool_call=tool_call,
474
+ parse_status=ToolCallParseStatus.succeeded,
475
+ ),
476
+ stop_reason=stop_reason,
477
+ )
478
+ )
479
+ else:
480
+ logger.warning(f"Tool {tool_call.tool_name} not found in request tools")
481
+ yield ChatCompletionResponseStreamChunk(
482
+ event=ChatCompletionResponseEvent(
483
+ event_type=ChatCompletionResponseEventType.progress,
484
+ delta=ToolCallDelta(
485
+ # Parsing the tool call failed because it was not found in the request tools;
486
+ # we still return the raw message text inside tool_call so it can be surfaced to the user
487
+ tool_call=buffer,
488
+ parse_status=ToolCallParseStatus.failed,
489
+ ),
490
+ stop_reason=stop_reason,
491
+ )
492
+ )
493
+
494
+ yield ChatCompletionResponseStreamChunk(
495
+ event=ChatCompletionResponseEvent(
496
+ event_type=ChatCompletionResponseEventType.complete,
497
+ delta=TextDelta(text=""),
498
+ stop_reason=stop_reason,
499
+ )
500
+ )
501
+
502
+
503
+ async def convert_message_to_openai_dict(message: Message, download: bool = False) -> dict:
504
+ async def _convert_content(content) -> dict:
505
+ if isinstance(content, ImageContentItem):
506
+ return {
507
+ "type": "image_url",
508
+ "image_url": {
509
+ "url": await convert_image_content_to_url(content, download=download),
510
+ },
511
+ }
512
+ else:
513
+ text = content.text if isinstance(content, TextContentItem) else content
514
+ assert isinstance(text, str)
515
+ return {"type": "text", "text": text}
516
+
517
+ if isinstance(message.content, list):
518
+ content = [await _convert_content(c) for c in message.content]
519
+ else:
520
+ content = [await _convert_content(message.content)]
521
+
522
+ result = {
523
+ "role": message.role,
524
+ "content": content,
525
+ }
526
+
527
+ if hasattr(message, "tool_calls") and message.tool_calls:
528
+ result["tool_calls"] = []
529
+ for tc in message.tool_calls:
530
+ # The tool.tool_name can be a str or a BuiltinTool enum. If
531
+ # it's the latter, convert to a string.
532
+ tool_name = tc.tool_name
533
+ if isinstance(tool_name, BuiltinTool):
534
+ tool_name = tool_name.value
535
+
536
+ result["tool_calls"].append(
537
+ {
538
+ "id": tc.call_id,
539
+ "type": "function",
540
+ "function": {
541
+ "name": tool_name,
542
+ "arguments": tc.arguments,
543
+ },
544
+ }
545
+ )
546
+ return result
547
+
548
+
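Illustrative usage (not part of the package diff): converting a plain-text UserMessage with convert_message_to_openai_dict(); this assumes UserMessage defaults its role to "user", as the conversions elsewhere in this module do.

import asyncio

msg = UserMessage(content="What is the capital of France?")
openai_msg = asyncio.run(convert_message_to_openai_dict(msg))
# {"role": "user", "content": [{"type": "text", "text": "What is the capital of France?"}]}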
549
+ class UnparseableToolCall(BaseModel):
550
+ """
551
+ A ToolCall with arguments that are not valid JSON.
552
+ Mirrors the ToolCall schema, but with arguments as a string.
553
+ """
554
+
555
+ call_id: str = ""
556
+ tool_name: str = ""
557
+ arguments: str = ""
558
+
559
+
560
+ async def convert_message_to_openai_dict_new(
561
+ message: Message | dict,
562
+ download_images: bool = False,
563
+ ) -> OpenAIChatCompletionMessage:
564
+ """
565
+ Convert a Message to an OpenAI API-compatible dictionary.
566
+ """
567
+ # Users can supply a dict instead of a Message object; we'll
568
+ # convert it to a Message object and proceed with some type safety.
569
+ if isinstance(message, dict):
570
+ if "role" not in message:
571
+ raise ValueError("role is required in message")
572
+ if message["role"] == "user":
573
+ message = UserMessage(**message)
574
+ elif message["role"] == "assistant":
575
+ message = CompletionMessage(**message)
576
+ elif message["role"] == "tool":
577
+ message = ToolResponseMessage(**message)
578
+ elif message["role"] == "system":
579
+ message = SystemMessage(**message)
580
+ else:
581
+ raise ValueError(f"Unsupported message role: {message['role']}")
582
+
583
+ # Map Llama Stack spec to OpenAI spec -
584
+ # str -> str
585
+ # {"type": "text", "text": ...} -> {"type": "text", "text": ...}
586
+ # {"type": "image", "image": {"url": {"uri": ...}}} -> {"type": "image_url", "image_url": {"url": ...}}
587
+ # {"type": "image", "image": {"data": ...}} -> {"type": "image_url", "image_url": {"url": "data:image/?;base64,..."}}
588
+ # List[...] -> List[...]
589
+ async def _convert_message_content(
590
+ content: InterleavedContent,
591
+ ) -> str | Iterable[OpenAIChatCompletionContentPartParam]:
592
+ async def impl(
593
+ content_: InterleavedContent,
594
+ ) -> str | OpenAIChatCompletionContentPartParam | list[OpenAIChatCompletionContentPartParam]:
595
+ # Llama Stack and OpenAI spec match for str and text input
596
+ if isinstance(content_, str):
597
+ return content_
598
+ elif isinstance(content_, TextContentItem):
599
+ return OpenAIChatCompletionContentPartTextParam(
600
+ type="text",
601
+ text=content_.text,
602
+ )
603
+ elif isinstance(content_, ImageContentItem):
604
+ return OpenAIChatCompletionContentPartImageParam(
605
+ type="image_url",
606
+ image_url=OpenAIImageURL(
607
+ url=await convert_image_content_to_url(content_, download=download_images)
608
+ ),
609
+ )
610
+ elif isinstance(content_, list):
611
+ return [await impl(item) for item in content_]
612
+ else:
613
+ raise ValueError(f"Unsupported content type: {type(content_)}")
614
+
615
+ ret = await impl(content)
616
+
617
+ # OpenAI*Message expects a str or list
618
+ if isinstance(ret, str) or isinstance(ret, list):
619
+ return ret
620
+ else:
621
+ return [ret]
622
+
623
+ out: OpenAIChatCompletionMessage = None
624
+ if isinstance(message, UserMessage):
625
+ out = OpenAIChatCompletionUserMessage(
626
+ role="user",
627
+ content=await _convert_message_content(message.content),
628
+ )
629
+ elif isinstance(message, CompletionMessage):
630
+ tool_calls = [
631
+ OpenAIChatCompletionMessageFunctionToolCall(
632
+ id=tool.call_id,
633
+ function=OpenAIFunction(
634
+ name=(tool.tool_name if not isinstance(tool.tool_name, BuiltinTool) else tool.tool_name.value),
635
+ arguments=tool.arguments, # Already a JSON string, don't double-encode
178
636
  ),
179
- stop_reason=stop_reason,
637
+ type="function",
180
638
  )
639
+ for tool in message.tool_calls
640
+ ]
641
+ params = {}
642
+ if tool_calls:
643
+ params["tool_calls"] = tool_calls
644
+ out = OpenAIChatCompletionAssistantMessage(
645
+ role="assistant",
646
+ content=await _convert_message_content(message.content),
647
+ **params,
648
+ )
649
+ elif isinstance(message, ToolResponseMessage):
650
+ out = OpenAIChatCompletionToolMessage(
651
+ role="tool",
652
+ tool_call_id=message.call_id,
653
+ content=await _convert_message_content(message.content),
654
+ )
655
+ elif isinstance(message, SystemMessage):
656
+ out = OpenAIChatCompletionSystemMessage(
657
+ role="system",
658
+ content=await _convert_message_content(message.content),
181
659
  )
660
+ else:
661
+ raise ValueError(f"Unsupported message type: {type(message)}")
662
+
663
+ return out
664
+
665
+
666
+ def convert_tool_call(
667
+ tool_call: ChatCompletionMessageToolCall,
668
+ ) -> ToolCall | UnparseableToolCall:
669
+ """
670
+ Convert a ChatCompletionMessageToolCall tool call to either a
671
+ ToolCall or UnparseableToolCall. Returns an UnparseableToolCall
672
+ if the tool call is not a valid ToolCall.
673
+ """
674
+ try:
675
+ valid_tool_call = ToolCall(
676
+ call_id=tool_call.id,
677
+ tool_name=tool_call.function.name,
678
+ arguments=tool_call.function.arguments,
679
+ )
680
+ except Exception:
681
+ return UnparseableToolCall(
682
+ call_id=tool_call.id or "",
683
+ tool_name=tool_call.function.name or "",
684
+ arguments=tool_call.function.arguments or "",
685
+ )
686
+
687
+ return valid_tool_call
688
+
689
+
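Illustrative usage (not part of the package diff): a well-formed OpenAI tool call converts to a ToolCall, while anything that fails validation falls back to UnparseableToolCall; the id and arguments below are placeholders.

from openai.types.chat.chat_completion_message_tool_call import (
    ChatCompletionMessageToolCall,
    Function,
)

raw = ChatCompletionMessageToolCall(
    id="call_123",
    type="function",
    function=Function(name="get_weather", arguments='{"city": "Paris"}'),
)
parsed = convert_tool_call(raw)
# parsed is a ToolCall on success; if ToolCall validation raises,
# an UnparseableToolCall carrying the raw strings is returned instead.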
690
+ PYTHON_TYPE_TO_LITELLM_TYPE = {
691
+ "int": "integer",
692
+ "float": "number",
693
+ "bool": "boolean",
694
+ "str": "string",
695
+ }
696
+
697
+
698
+ def to_openai_param_type(param_type: str) -> dict:
699
+ """
700
+ Convert Python type hints to OpenAI parameter type format.
701
+
702
+ Examples:
703
+ 'str' -> {'type': 'string'}
704
+ 'int' -> {'type': 'integer'}
705
+ 'list[str]' -> {'type': 'array', 'items': {'type': 'string'}}
706
+ 'list[int]' -> {'type': 'array', 'items': {'type': 'integer'}}
707
+ """
708
+ # Handle basic types first
709
+ basic_types = {
710
+ "str": "string",
711
+ "int": "integer",
712
+ "float": "number",
713
+ "bool": "boolean",
714
+ }
715
+
716
+ if param_type in basic_types:
717
+ return {"type": basic_types[param_type]}
718
+
719
+ # Handle list/array types
720
+ if param_type.startswith("list[") and param_type.endswith("]"):
721
+ inner_type = param_type[5:-1]
722
+ if inner_type in basic_types:
723
+ return {
724
+ "type": "array",
725
+ "items": {"type": basic_types.get(inner_type, inner_type)},
726
+ }
727
+
728
+ return {"type": param_type}
729
+
730
+
731
+ def convert_tooldef_to_openai_tool(tool: ToolDefinition) -> dict:
732
+ """
733
+ Convert a ToolDefinition to an OpenAI API-compatible dictionary.
734
+
735
+ ToolDefinition:
736
+ tool_name: str | BuiltinTool
737
+ description: Optional[str]
738
+ input_schema: Optional[Dict[str, Any]] # JSON Schema
739
+ output_schema: Optional[Dict[str, Any]] # JSON Schema (not used by OpenAI)
740
+
741
+ OpenAI spec -
742
+
743
+ {
744
+ "type": "function",
745
+ "function": {
746
+ "name": tool_name,
747
+ "description": description,
748
+ "parameters": {<JSON Schema>},
749
+ },
750
+ }
751
+
752
+ NOTE: OpenAI does not support output_schema, so it is dropped here.
753
+ """
754
+ out = {
755
+ "type": "function",
756
+ "function": {},
757
+ }
758
+ function = out["function"]
759
+
760
+ if isinstance(tool.tool_name, BuiltinTool):
761
+ function["name"] = tool.tool_name.value
762
+ else:
763
+ function["name"] = tool.tool_name
764
+
765
+ if tool.description:
766
+ function["description"] = tool.description
767
+
768
+ if tool.input_schema:
769
+ # Pass through the entire JSON Schema as-is
770
+ function["parameters"] = tool.input_schema
771
+
772
+ # NOTE: OpenAI does not support output_schema, so we drop it here
773
+ # It's stored in LlamaStack for validation and other provider usage
774
+
775
+ return out
776
+
777
+
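Illustrative usage (not part of the package diff): a ToolDefinition whose input_schema is passed through unchanged as the OpenAI "parameters" field; the tool name and schema are placeholders.

tool = ToolDefinition(
    tool_name="get_weather",
    description="Look up the current weather for a city",
    input_schema={
        "type": "object",
        "properties": {"city": {"type": "string"}},
        "required": ["city"],
    },
)
openai_tool = convert_tooldef_to_openai_tool(tool)
# {"type": "function",
#  "function": {"name": "get_weather",
#               "description": "Look up the current weather for a city",
#               "parameters": {...the input_schema above...}}}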
778
+ def _convert_stop_reason_to_openai_finish_reason(stop_reason: StopReason) -> str:
779
+ """
780
+ Convert a StopReason to an OpenAI chat completion finish_reason.
781
+ """
782
+ return {
783
+ StopReason.end_of_turn: "stop",
784
+ StopReason.end_of_message: "tool_calls",
785
+ StopReason.out_of_tokens: "length",
786
+ }.get(stop_reason, "stop")
787
+
788
+
789
+ def _convert_openai_finish_reason(finish_reason: str) -> StopReason:
790
+ """
791
+ Convert an OpenAI chat completion finish_reason to a StopReason.
792
+
793
+ finish_reason: Literal["stop", "length", "tool_calls", ...]
794
+ - stop: model hit a natural stop point or a provided stop sequence
795
+ - length: maximum number of tokens specified in the request was reached
796
+ - tool_calls: model called a tool
797
+
798
+ ->
799
+
800
+ class StopReason(Enum):
801
+ end_of_turn = "end_of_turn"
802
+ end_of_message = "end_of_message"
803
+ out_of_tokens = "out_of_tokens"
804
+ """
805
+
806
+ # TODO(mf): are end_of_turn and end_of_message semantics correct?
807
+ return {
808
+ "stop": StopReason.end_of_turn,
809
+ "length": StopReason.out_of_tokens,
810
+ "tool_calls": StopReason.end_of_message,
811
+ }.get(finish_reason, StopReason.end_of_turn)
812
+
813
+
814
+ def _convert_openai_request_tool_config(tool_choice: str | dict[str, Any] | None = None) -> ToolConfig:
815
+ tool_config = ToolConfig()
816
+ if tool_choice:
817
+ try:
818
+ tool_choice = ToolChoice(tool_choice)
819
+ except ValueError:
820
+ pass
821
+ tool_config.tool_choice = tool_choice
822
+ return tool_config
823
+
824
+
825
+ def _convert_openai_request_tools(tools: list[dict[str, Any]] | None = None) -> list[ToolDefinition]:
826
+ lls_tools = []
827
+ if not tools:
828
+ return lls_tools
829
+
830
+ for tool in tools:
831
+ tool_fn = tool.get("function", {})
832
+ tool_name = tool_fn.get("name", None)
833
+ tool_desc = tool_fn.get("description", None)
834
+ tool_params = tool_fn.get("parameters", None)
835
+
836
+ lls_tool = ToolDefinition(
837
+ tool_name=tool_name,
838
+ description=tool_desc,
839
+ input_schema=tool_params, # Pass through entire JSON Schema
840
+ )
841
+ lls_tools.append(lls_tool)
842
+ return lls_tools
843
+
844
+
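Illustrative usage (not part of the package diff): the inverse direction, turning an OpenAI-style tools list into ToolDefinition objects; the tool below is a placeholder.

openai_tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Look up the current weather",
            "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
        },
    }
]
lls_tools = _convert_openai_request_tools(openai_tools)
# a single ToolDefinition with tool_name="get_weather" and the JSON Schema in input_schema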
845
+ def _convert_openai_request_response_format(
846
+ response_format: OpenAIResponseFormatParam = None,
847
+ ):
848
+ if not response_format:
849
+ return None
850
+ # response_format can be a dict or a pydantic model
851
+ response_format = dict(response_format)
852
+ if response_format.get("type", "") == "json_schema":
853
+ return JsonSchemaResponseFormat(
854
+ type="json_schema",
855
+ json_schema=response_format.get("json_schema", {}).get("schema", ""),
856
+ )
857
+ return None
858
+
859
+
860
+ def _convert_openai_tool_calls(
861
+ tool_calls: list[OpenAIChatCompletionMessageFunctionToolCall],
862
+ ) -> list[ToolCall]:
863
+ """
864
+ Convert an OpenAI ChatCompletionMessageToolCall list into a list of ToolCall.
865
+
866
+ OpenAI ChatCompletionMessageToolCall:
867
+ id: str
868
+ function: Function
869
+ type: Literal["function"]
870
+
871
+ OpenAI Function:
872
+ arguments: str
873
+ name: str
874
+
875
+ ->
876
+
877
+ ToolCall:
878
+ call_id: str
879
+ tool_name: str
880
+ arguments: Dict[str, ...]
881
+ """
882
+ if not tool_calls:
883
+ return [] # CompletionMessage tool_calls is not optional
884
+
885
+ return [
886
+ ToolCall(
887
+ call_id=call.id,
888
+ tool_name=call.function.name,
889
+ arguments=call.function.arguments,
890
+ )
891
+ for call in tool_calls
892
+ ]
893
+
894
+
895
+ def _convert_openai_logprobs(
896
+ logprobs: OpenAIChoiceLogprobs,
897
+ ) -> list[TokenLogProbs] | None:
898
+ """
899
+ Convert an OpenAI ChoiceLogprobs into a list of TokenLogProbs.
900
+
901
+ OpenAI ChoiceLogprobs:
902
+ content: Optional[List[ChatCompletionTokenLogprob]]
903
+
904
+ OpenAI ChatCompletionTokenLogprob:
905
+ token: str
906
+ logprob: float
907
+ top_logprobs: List[TopLogprob]
908
+
909
+ OpenAI TopLogprob:
910
+ token: str
911
+ logprob: float
912
+
913
+ ->
914
+
915
+ TokenLogProbs:
916
+ logprobs_by_token: Dict[str, float]
917
+ - token, logprob
918
+
919
+ """
920
+ if not logprobs or not logprobs.content:
921
+ return None
922
+
923
+ return [
924
+ TokenLogProbs(logprobs_by_token={logprobs.token: logprobs.logprob for logprobs in content.top_logprobs})
925
+ for content in logprobs.content
926
+ ]
927
+
928
+
929
+ def _convert_openai_sampling_params(
930
+ max_tokens: int | None = None,
931
+ temperature: float | None = None,
932
+ top_p: float | None = None,
933
+ ) -> SamplingParams:
934
+ sampling_params = SamplingParams()
935
+
936
+ if max_tokens:
937
+ sampling_params.max_tokens = max_tokens
938
+
939
+ # Map an explicit temperature of 0 to greedy sampling
940
+ if temperature == 0:
941
+ strategy = GreedySamplingStrategy()
942
+ else:
943
+ # OpenAI defaults to 1.0 for temperature and top_p if unset
944
+ if temperature is None:
945
+ temperature = 1.0
946
+ if top_p is None:
947
+ top_p = 1.0
948
+ strategy = TopPSamplingStrategy(temperature=temperature, top_p=top_p)
949
+
950
+ sampling_params.strategy = strategy
951
+ return sampling_params
952
+
953
+
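Illustrative usage (not part of the package diff): how OpenAI-style sampling arguments are folded into SamplingParams.

# An explicit temperature of 0 becomes greedy decoding.
greedy = _convert_openai_sampling_params(max_tokens=128, temperature=0)
# greedy.strategy is a GreedySamplingStrategy and greedy.max_tokens == 128

# Unset temperature/top_p fall back to the OpenAI defaults of 1.0.
default = _convert_openai_sampling_params()
# default.strategy is TopPSamplingStrategy(temperature=1.0, top_p=1.0)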
954
+ def openai_messages_to_messages(
955
+ messages: list[OpenAIMessageParam],
956
+ ) -> list[Message]:
957
+ """
958
+ Convert a list of OpenAIChatCompletionMessage into a list of Message.
959
+ """
960
+ converted_messages = []
961
+ for message in messages:
962
+ if message.role == "system":
963
+ converted_message = SystemMessage(content=openai_content_to_content(message.content))
964
+ elif message.role == "user":
965
+ converted_message = UserMessage(content=openai_content_to_content(message.content))
966
+ elif message.role == "assistant":
967
+ converted_message = CompletionMessage(
968
+ content=openai_content_to_content(message.content),
969
+ tool_calls=_convert_openai_tool_calls(message.tool_calls),
970
+ stop_reason=StopReason.end_of_turn,
971
+ )
972
+ elif message.role == "tool":
973
+ converted_message = ToolResponseMessage(
974
+ role="tool",
975
+ call_id=message.tool_call_id,
976
+ content=openai_content_to_content(message.content),
977
+ )
978
+ else:
979
+ raise ValueError(f"Unknown role {message.role}")
980
+ converted_messages.append(converted_message)
981
+ return converted_messages
982
+
983
+
984
+ def openai_content_to_content(content: str | Iterable[OpenAIChatCompletionContentPartParam] | None):
985
+ if content is None:
986
+ return ""
987
+ if isinstance(content, str):
988
+ return content
989
+ elif isinstance(content, list):
990
+ return [openai_content_to_content(c) for c in content]
991
+ elif hasattr(content, "type"):
992
+ if content.type == "text":
993
+ return TextContentItem(type="text", text=content.text)
994
+ elif content.type == "image_url":
995
+ return ImageContentItem(type="image", image=_URLOrData(url=URL(uri=content.image_url.url)))
996
+ else:
997
+ raise ValueError(f"Unknown content type: {content.type}")
998
+ else:
999
+ raise ValueError(f"Unknown content type: {content}")
1000
+
1001
+
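Illustrative behaviour (not part of the package diff) of openai_content_to_content():

assert openai_content_to_content(None) == ""          # missing content becomes an empty string
assert openai_content_to_content("hello") == "hello"  # plain strings pass through unchanged
# Lists of OpenAI content parts are converted element-wise into
# TextContentItem / ImageContentItem values, as implemented above.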
1002
+ def convert_openai_chat_completion_choice(
1003
+ choice: OpenAIChoice,
1004
+ ) -> ChatCompletionResponse:
1005
+ """
1006
+ Convert an OpenAI Choice into a ChatCompletionResponse.
1007
+
1008
+ OpenAI Choice:
1009
+ message: ChatCompletionMessage
1010
+ finish_reason: str
1011
+ logprobs: Optional[ChoiceLogprobs]
1012
+
1013
+ OpenAI ChatCompletionMessage:
1014
+ role: Literal["assistant"]
1015
+ content: Optional[str]
1016
+ tool_calls: Optional[List[ChatCompletionMessageToolCall]]
1017
+
1018
+ ->
1019
+
1020
+ ChatCompletionResponse:
1021
+ completion_message: CompletionMessage
1022
+ logprobs: Optional[List[TokenLogProbs]]
1023
+
1024
+ CompletionMessage:
1025
+ role: Literal["assistant"]
1026
+ content: str | ImageMedia | List[str | ImageMedia]
1027
+ stop_reason: StopReason
1028
+ tool_calls: List[ToolCall]
1029
+
1030
+ class StopReason(Enum):
1031
+ end_of_turn = "end_of_turn"
1032
+ end_of_message = "end_of_message"
1033
+ out_of_tokens = "out_of_tokens"
1034
+ """
1035
+ assert hasattr(choice, "message") and choice.message, "error in server response: message not found"
1036
+ assert hasattr(choice, "finish_reason") and choice.finish_reason, (
1037
+ "error in server response: finish_reason not found"
1038
+ )
1039
+
1040
+ return ChatCompletionResponse(
1041
+ completion_message=CompletionMessage(
1042
+ content=choice.message.content or "", # CompletionMessage content is not optional
1043
+ stop_reason=_convert_openai_finish_reason(choice.finish_reason),
1044
+ tool_calls=_convert_openai_tool_calls(choice.message.tool_calls),
1045
+ ),
1046
+ logprobs=_convert_openai_logprobs(getattr(choice, "logprobs", None)),
1047
+ )
1048
+
1049
+
1050
+ async def convert_openai_chat_completion_stream(
1051
+ stream: AsyncStream[OpenAIChatCompletionChunk],
1052
+ enable_incremental_tool_calls: bool,
1053
+ ) -> AsyncGenerator[ChatCompletionResponseStreamChunk, None]:
1054
+ """
1055
+ Convert a stream of OpenAI chat completion chunks into a stream
1056
+ of ChatCompletionResponseStreamChunk.
1057
+ """
1058
+ yield ChatCompletionResponseStreamChunk(
1059
+ event=ChatCompletionResponseEvent(
1060
+ event_type=ChatCompletionResponseEventType.start,
1061
+ delta=TextDelta(text=""),
1062
+ )
1063
+ )
1064
+ event_type = ChatCompletionResponseEventType.progress
1065
+
1066
+ stop_reason = None
1067
+ tool_call_idx_to_buffer = {}
1068
+
1069
+ async for chunk in stream:
1070
+ choice = chunk.choices[0] # assuming only one choice per chunk
1071
+
1072
+ # we assume there's only one finish_reason in the stream
1073
+ stop_reason = _convert_openai_finish_reason(choice.finish_reason) or stop_reason
1074
+ logprobs = getattr(choice, "logprobs", None)
1075
+
1076
+ # if there's a tool call, emit an event for each tool in the list
1077
+ # if tool call and content, emit both separately
1078
+ if choice.delta.tool_calls:
1079
+ # the call may have content and a tool call. ChatCompletionResponseEvent
1080
+ # does not support both, so we emit the content first
1081
+ if choice.delta.content:
1082
+ yield ChatCompletionResponseStreamChunk(
1083
+ event=ChatCompletionResponseEvent(
1084
+ event_type=event_type,
1085
+ delta=TextDelta(text=choice.delta.content),
1086
+ logprobs=_convert_openai_logprobs(logprobs),
1087
+ )
1088
+ )
1089
+
1090
+ # it is possible to have parallel tool calls in stream, but
1091
+ # ChatCompletionResponseEvent only supports one per stream
1092
+ if len(choice.delta.tool_calls) > 1:
1093
+ warnings.warn(
1094
+ "multiple tool calls found in a single delta, using the first, ignoring the rest",
1095
+ stacklevel=2,
1096
+ )
1097
+
1098
+ if not enable_incremental_tool_calls:
1099
+ for tool_call in choice.delta.tool_calls:
1100
+ yield ChatCompletionResponseStreamChunk(
1101
+ event=ChatCompletionResponseEvent(
1102
+ event_type=event_type,
1103
+ delta=ToolCallDelta(
1104
+ tool_call=_convert_openai_tool_calls([tool_call])[0],
1105
+ parse_status=ToolCallParseStatus.succeeded,
1106
+ ),
1107
+ logprobs=_convert_openai_logprobs(logprobs),
1108
+ )
1109
+ )
1110
+ else:
1111
+ for tool_call in choice.delta.tool_calls:
1112
+ idx = tool_call.index if hasattr(tool_call, "index") else 0
1113
+
1114
+ if idx not in tool_call_idx_to_buffer:
1115
+ tool_call_idx_to_buffer[idx] = {
1116
+ "call_id": tool_call.id,
1117
+ "name": None,
1118
+ "arguments": "",
1119
+ "content": "",
1120
+ }
1121
+
1122
+ buffer = tool_call_idx_to_buffer[idx]
1123
+
1124
+ if tool_call.function:
1125
+ if tool_call.function.name:
1126
+ buffer["name"] = tool_call.function.name
1127
+ delta = f"{buffer['name']}("
1128
+ buffer["content"] += delta
1129
+
1130
+ if tool_call.function.arguments:
1131
+ delta = tool_call.function.arguments
1132
+ buffer["arguments"] += delta
1133
+ buffer["content"] += delta
1134
+
1135
+ yield ChatCompletionResponseStreamChunk(
1136
+ event=ChatCompletionResponseEvent(
1137
+ event_type=event_type,
1138
+ delta=ToolCallDelta(
1139
+ tool_call=delta,
1140
+ parse_status=ToolCallParseStatus.in_progress,
1141
+ ),
1142
+ logprobs=_convert_openai_logprobs(logprobs),
1143
+ )
1144
+ )
1145
+ elif choice.delta.content:
1146
+ yield ChatCompletionResponseStreamChunk(
1147
+ event=ChatCompletionResponseEvent(
1148
+ event_type=event_type,
1149
+ delta=TextDelta(text=choice.delta.content or ""),
1150
+ logprobs=_convert_openai_logprobs(logprobs),
1151
+ )
1152
+ )
1153
+
1154
+ for idx, buffer in tool_call_idx_to_buffer.items():
1155
+ logger.debug(f"toolcall_buffer[{idx}]: {buffer}")
1156
+ if buffer["name"]:
1157
+ delta = ")"
1158
+ buffer["content"] += delta
1159
+ yield ChatCompletionResponseStreamChunk(
1160
+ event=ChatCompletionResponseEvent(
1161
+ event_type=event_type,
1162
+ delta=ToolCallDelta(
1163
+ tool_call=delta,
1164
+ parse_status=ToolCallParseStatus.in_progress,
1165
+ ),
1166
+ logprobs=None,
1167
+ )
1168
+ )
1169
+
1170
+ try:
1171
+ tool_call = ToolCall(
1172
+ call_id=buffer["call_id"],
1173
+ tool_name=buffer["name"],
1174
+ arguments=buffer["arguments"],
1175
+ )
1176
+ yield ChatCompletionResponseStreamChunk(
1177
+ event=ChatCompletionResponseEvent(
1178
+ event_type=ChatCompletionResponseEventType.progress,
1179
+ delta=ToolCallDelta(
1180
+ tool_call=tool_call,
1181
+ parse_status=ToolCallParseStatus.succeeded,
1182
+ ),
1183
+ stop_reason=stop_reason,
1184
+ )
1185
+ )
1186
+ except json.JSONDecodeError as e:
1187
+ print(f"Failed to parse arguments: {e}")
1188
+ yield ChatCompletionResponseStreamChunk(
1189
+ event=ChatCompletionResponseEvent(
1190
+ event_type=ChatCompletionResponseEventType.progress,
1191
+ delta=ToolCallDelta(
1192
+ tool_call=buffer["content"],
1193
+ parse_status=ToolCallParseStatus.failed,
1194
+ ),
1195
+ stop_reason=stop_reason,
1196
+ )
1197
+ )
182
1198
 
183
1199
  yield ChatCompletionResponseStreamChunk(
184
1200
  event=ChatCompletionResponseEvent(
185
1201
  event_type=ChatCompletionResponseEventType.complete,
186
- delta="",
1202
+ delta=TextDelta(text=""),
187
1203
  stop_reason=stop_reason,
188
1204
  )
189
1205
  )
1206
+
1207
+
1208
+ async def prepare_openai_completion_params(**params):
1209
+ async def _prepare_value(value: Any) -> Any:
1210
+ new_value = value
1211
+ if isinstance(value, list):
1212
+ new_value = [await _prepare_value(v) for v in value]
1213
+ elif isinstance(value, dict):
1214
+ new_value = {k: await _prepare_value(v) for k, v in value.items()}
1215
+ elif isinstance(value, BaseModel):
1216
+ new_value = value.model_dump(exclude_none=True)
1217
+ return new_value
1218
+
1219
+ completion_params = {}
1220
+ for k, v in params.items():
1221
+ if v is not None:
1222
+ completion_params[k] = await _prepare_value(v)
1223
+ return completion_params
1224
+
1225
+
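Illustrative usage (not part of the package diff): None values are dropped and pydantic models are dumped to plain dicts so the result can be handed to an OpenAI-compatible client; the model id is a placeholder.

import asyncio

params = asyncio.run(
    prepare_openai_completion_params(model="my-model", temperature=0.2, top_p=None)
)
# params == {"model": "my-model", "temperature": 0.2}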
1226
+ class OpenAIChatCompletionToLlamaStackMixin:
1227
+ async def openai_chat_completion(
1228
+ self,
1229
+ model: str,
1230
+ messages: list[OpenAIMessageParam],
1231
+ frequency_penalty: float | None = None,
1232
+ function_call: str | dict[str, Any] | None = None,
1233
+ functions: list[dict[str, Any]] | None = None,
1234
+ logit_bias: dict[str, float] | None = None,
1235
+ logprobs: bool | None = None,
1236
+ max_completion_tokens: int | None = None,
1237
+ max_tokens: int | None = None,
1238
+ n: int | None = None,
1239
+ parallel_tool_calls: bool | None = None,
1240
+ presence_penalty: float | None = None,
1241
+ response_format: OpenAIResponseFormatParam | None = None,
1242
+ seed: int | None = None,
1243
+ stop: str | list[str] | None = None,
1244
+ stream: bool | None = None,
1245
+ stream_options: dict[str, Any] | None = None,
1246
+ temperature: float | None = None,
1247
+ tool_choice: str | dict[str, Any] | None = None,
1248
+ tools: list[dict[str, Any]] | None = None,
1249
+ top_logprobs: int | None = None,
1250
+ top_p: float | None = None,
1251
+ user: str | None = None,
1252
+ ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
1253
+ messages = openai_messages_to_messages(messages)
1254
+ response_format = _convert_openai_request_response_format(response_format)
1255
+ sampling_params = _convert_openai_sampling_params(
1256
+ max_tokens=max_tokens,
1257
+ temperature=temperature,
1258
+ top_p=top_p,
1259
+ )
1260
+ tool_config = _convert_openai_request_tool_config(tool_choice)
1261
+
1262
+ tools = _convert_openai_request_tools(tools)
1263
+ if tool_config.tool_choice == ToolChoice.none:
1264
+ tools = []
1265
+
1266
+ outstanding_responses = []
1267
+ # "n" is the number of completions to generate per prompt
1268
+ n = n or 1
1269
+ for _i in range(0, n):
1270
+ response = self.chat_completion(
1271
+ model_id=model,
1272
+ messages=messages,
1273
+ sampling_params=sampling_params,
1274
+ response_format=response_format,
1275
+ stream=stream,
1276
+ tool_config=tool_config,
1277
+ tools=tools,
1278
+ )
1279
+ outstanding_responses.append(response)
1280
+
1281
+ if stream:
1282
+ return OpenAIChatCompletionToLlamaStackMixin._process_stream_response(self, model, outstanding_responses)
1283
+
1284
+ return await OpenAIChatCompletionToLlamaStackMixin._process_non_stream_response(
1285
+ self, model, outstanding_responses
1286
+ )
1287
+
1288
+ async def _process_stream_response(
1289
+ self,
1290
+ model: str,
1291
+ outstanding_responses: list[Awaitable[AsyncIterator[ChatCompletionResponseStreamChunk]]],
1292
+ ):
1293
+ id = f"chatcmpl-{uuid.uuid4()}"
1294
+ for i, outstanding_response in enumerate(outstanding_responses):
1295
+ response = await outstanding_response
1296
+ async for chunk in response:
1297
+ event = chunk.event
1298
+ finish_reason = _convert_stop_reason_to_openai_finish_reason(event.stop_reason)
1299
+
1300
+ if isinstance(event.delta, TextDelta):
1301
+ text_delta = event.delta.text
1302
+ delta = OpenAIChoiceDelta(content=text_delta)
1303
+ yield OpenAIChatCompletionChunk(
1304
+ id=id,
1305
+ choices=[OpenAIChatCompletionChunkChoice(index=i, finish_reason=finish_reason, delta=delta)],
1306
+ created=int(time.time()),
1307
+ model=model,
1308
+ object="chat.completion.chunk",
1309
+ )
1310
+ elif isinstance(event.delta, ToolCallDelta):
1311
+ if event.delta.parse_status == ToolCallParseStatus.succeeded:
1312
+ tool_call = event.delta.tool_call
1313
+
1314
+ # First chunk includes full structure
1315
+ openai_tool_call = OpenAIChoiceDeltaToolCall(
1316
+ index=0,
1317
+ id=tool_call.call_id,
1318
+ function=OpenAIChoiceDeltaToolCallFunction(
1319
+ name=tool_call.tool_name,
1320
+ arguments="",
1321
+ ),
1322
+ )
1323
+ delta = OpenAIChoiceDelta(tool_calls=[openai_tool_call])
1324
+ yield OpenAIChatCompletionChunk(
1325
+ id=id,
1326
+ choices=[
1327
+ OpenAIChatCompletionChunkChoice(index=i, finish_reason=finish_reason, delta=delta)
1328
+ ],
1329
+ created=int(time.time()),
1330
+ model=model,
1331
+ object="chat.completion.chunk",
1332
+ )
1333
+ # arguments
1334
+ openai_tool_call = OpenAIChoiceDeltaToolCall(
1335
+ index=0,
1336
+ function=OpenAIChoiceDeltaToolCallFunction(
1337
+ arguments=tool_call.arguments,
1338
+ ),
1339
+ )
1340
+ delta = OpenAIChoiceDelta(tool_calls=[openai_tool_call])
1341
+ yield OpenAIChatCompletionChunk(
1342
+ id=id,
1343
+ choices=[
1344
+ OpenAIChatCompletionChunkChoice(index=i, finish_reason=finish_reason, delta=delta)
1345
+ ],
1346
+ created=int(time.time()),
1347
+ model=model,
1348
+ object="chat.completion.chunk",
1349
+ )
1350
+
1351
+ async def _process_non_stream_response(
1352
+ self, model: str, outstanding_responses: list[Awaitable[ChatCompletionResponse]]
1353
+ ) -> OpenAIChatCompletion:
1354
+ choices = []
1355
+ for outstanding_response in outstanding_responses:
1356
+ response = await outstanding_response
1357
+ completion_message = response.completion_message
1358
+ message = await convert_message_to_openai_dict_new(completion_message)
1359
+ finish_reason = _convert_stop_reason_to_openai_finish_reason(completion_message.stop_reason)
1360
+
1361
+ choice = OpenAIChatCompletionChoice(
1362
+ index=len(choices),
1363
+ message=message,
1364
+ finish_reason=finish_reason,
1365
+ )
1366
+ choices.append(choice)
1367
+
1368
+ return OpenAIChatCompletion(
1369
+ id=f"chatcmpl-{uuid.uuid4()}",
1370
+ choices=choices,
1371
+ created=int(time.time()),
1372
+ model=model,
1373
+ object="chat.completion",
1374
+ )
1375
+
1376
+
1377
+ def prepare_openai_embeddings_params(
1378
+ model: str,
1379
+ input: str | list[str],
1380
+ encoding_format: str | None = "float",
1381
+ dimensions: int | None = None,
1382
+ user: str | None = None,
1383
+ ):
1384
+ if model is None:
1385
+ raise ValueError("Model must be provided for embeddings")
1386
+
1387
+ input_list = [input] if isinstance(input, str) else input
1388
+
1389
+ params: dict[str, Any] = {
1390
+ "model": model,
1391
+ "input": input_list,
1392
+ }
1393
+
1394
+ if encoding_format is not None:
1395
+ params["encoding_format"] = encoding_format
1396
+ if dimensions is not None:
1397
+ params["dimensions"] = dimensions
1398
+ if user is not None:
1399
+ params["user"] = user
1400
+
1401
+ return params
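Illustrative usage (not part of the package diff): a single input string is wrapped in a list and only the provided optional fields are included; the model id is a placeholder.

params = prepare_openai_embeddings_params(
    model="my-embedding-model",
    input="hello world",
    dimensions=256,
)
# {"model": "my-embedding-model", "input": ["hello world"],
#  "encoding_format": "float", "dimensions": 256}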