llama-stack 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in the public registry.
Files changed (458)
  1. llama_stack/__init__.py +0 -5
  2. llama_stack/cli/llama.py +3 -3
  3. llama_stack/cli/stack/_list_deps.py +12 -23
  4. llama_stack/cli/stack/list_stacks.py +37 -18
  5. llama_stack/cli/stack/run.py +121 -11
  6. llama_stack/cli/stack/utils.py +0 -127
  7. llama_stack/core/access_control/access_control.py +69 -28
  8. llama_stack/core/access_control/conditions.py +15 -5
  9. llama_stack/core/admin.py +267 -0
  10. llama_stack/core/build.py +6 -74
  11. llama_stack/core/client.py +1 -1
  12. llama_stack/core/configure.py +6 -6
  13. llama_stack/core/conversations/conversations.py +28 -25
  14. llama_stack/core/datatypes.py +271 -79
  15. llama_stack/core/distribution.py +15 -16
  16. llama_stack/core/external.py +3 -3
  17. llama_stack/core/inspect.py +98 -15
  18. llama_stack/core/library_client.py +73 -61
  19. llama_stack/core/prompts/prompts.py +12 -11
  20. llama_stack/core/providers.py +17 -11
  21. llama_stack/core/resolver.py +65 -56
  22. llama_stack/core/routers/__init__.py +8 -12
  23. llama_stack/core/routers/datasets.py +1 -4
  24. llama_stack/core/routers/eval_scoring.py +7 -4
  25. llama_stack/core/routers/inference.py +55 -271
  26. llama_stack/core/routers/safety.py +52 -24
  27. llama_stack/core/routers/tool_runtime.py +6 -48
  28. llama_stack/core/routers/vector_io.py +130 -51
  29. llama_stack/core/routing_tables/benchmarks.py +24 -20
  30. llama_stack/core/routing_tables/common.py +1 -4
  31. llama_stack/core/routing_tables/datasets.py +22 -22
  32. llama_stack/core/routing_tables/models.py +119 -6
  33. llama_stack/core/routing_tables/scoring_functions.py +7 -7
  34. llama_stack/core/routing_tables/shields.py +1 -2
  35. llama_stack/core/routing_tables/toolgroups.py +17 -7
  36. llama_stack/core/routing_tables/vector_stores.py +51 -16
  37. llama_stack/core/server/auth.py +5 -3
  38. llama_stack/core/server/auth_providers.py +36 -20
  39. llama_stack/core/server/fastapi_router_registry.py +84 -0
  40. llama_stack/core/server/quota.py +2 -2
  41. llama_stack/core/server/routes.py +79 -27
  42. llama_stack/core/server/server.py +102 -87
  43. llama_stack/core/stack.py +201 -58
  44. llama_stack/core/storage/datatypes.py +26 -3
  45. llama_stack/{providers/utils → core/storage}/kvstore/__init__.py +2 -0
  46. llama_stack/{providers/utils → core/storage}/kvstore/kvstore.py +55 -24
  47. llama_stack/{providers/utils → core/storage}/kvstore/mongodb/mongodb.py +13 -10
  48. llama_stack/{providers/utils → core/storage}/kvstore/postgres/postgres.py +28 -17
  49. llama_stack/{providers/utils → core/storage}/kvstore/redis/redis.py +41 -16
  50. llama_stack/{providers/utils → core/storage}/kvstore/sqlite/sqlite.py +1 -1
  51. llama_stack/core/storage/sqlstore/__init__.py +17 -0
  52. llama_stack/{providers/utils → core/storage}/sqlstore/authorized_sqlstore.py +69 -49
  53. llama_stack/{providers/utils → core/storage}/sqlstore/sqlalchemy_sqlstore.py +47 -17
  54. llama_stack/{providers/utils → core/storage}/sqlstore/sqlstore.py +25 -8
  55. llama_stack/core/store/registry.py +1 -1
  56. llama_stack/core/utils/config.py +8 -2
  57. llama_stack/core/utils/config_resolution.py +32 -29
  58. llama_stack/core/utils/context.py +4 -10
  59. llama_stack/core/utils/exec.py +9 -0
  60. llama_stack/core/utils/type_inspection.py +45 -0
  61. llama_stack/distributions/dell/{run.yaml → config.yaml} +3 -2
  62. llama_stack/distributions/dell/dell.py +2 -2
  63. llama_stack/distributions/dell/run-with-safety.yaml +3 -2
  64. llama_stack/distributions/meta-reference-gpu/{run.yaml → config.yaml} +3 -2
  65. llama_stack/distributions/meta-reference-gpu/meta_reference.py +2 -2
  66. llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +3 -2
  67. llama_stack/distributions/nvidia/{run.yaml → config.yaml} +4 -4
  68. llama_stack/distributions/nvidia/nvidia.py +1 -1
  69. llama_stack/distributions/nvidia/run-with-safety.yaml +4 -4
  70. llama_stack/{apis/datasetio → distributions/oci}/__init__.py +1 -1
  71. llama_stack/distributions/oci/config.yaml +134 -0
  72. llama_stack/distributions/oci/oci.py +108 -0
  73. llama_stack/distributions/open-benchmark/{run.yaml → config.yaml} +5 -4
  74. llama_stack/distributions/open-benchmark/open_benchmark.py +2 -3
  75. llama_stack/distributions/postgres-demo/{run.yaml → config.yaml} +4 -3
  76. llama_stack/distributions/starter/{run.yaml → config.yaml} +64 -13
  77. llama_stack/distributions/starter/run-with-postgres-store.yaml +64 -13
  78. llama_stack/distributions/starter/starter.py +8 -5
  79. llama_stack/distributions/starter-gpu/{run.yaml → config.yaml} +64 -13
  80. llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +64 -13
  81. llama_stack/distributions/template.py +13 -69
  82. llama_stack/distributions/watsonx/{run.yaml → config.yaml} +4 -3
  83. llama_stack/distributions/watsonx/watsonx.py +1 -1
  84. llama_stack/log.py +28 -11
  85. llama_stack/models/llama/checkpoint.py +6 -6
  86. llama_stack/models/llama/hadamard_utils.py +2 -0
  87. llama_stack/models/llama/llama3/generation.py +3 -1
  88. llama_stack/models/llama/llama3/interface.py +2 -5
  89. llama_stack/models/llama/llama3/multimodal/encoder_utils.py +3 -3
  90. llama_stack/models/llama/llama3/multimodal/image_transform.py +6 -6
  91. llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +1 -1
  92. llama_stack/models/llama/llama3/tool_utils.py +2 -1
  93. llama_stack/models/llama/llama4/prompt_templates/system_prompts.py +1 -1
  94. llama_stack/providers/inline/agents/meta_reference/__init__.py +3 -3
  95. llama_stack/providers/inline/agents/meta_reference/agents.py +44 -261
  96. llama_stack/providers/inline/agents/meta_reference/config.py +6 -1
  97. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +207 -57
  98. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +308 -47
  99. llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +162 -96
  100. llama_stack/providers/inline/agents/meta_reference/responses/types.py +23 -8
  101. llama_stack/providers/inline/agents/meta_reference/responses/utils.py +201 -33
  102. llama_stack/providers/inline/agents/meta_reference/safety.py +8 -13
  103. llama_stack/providers/inline/batches/reference/__init__.py +2 -4
  104. llama_stack/providers/inline/batches/reference/batches.py +78 -60
  105. llama_stack/providers/inline/datasetio/localfs/datasetio.py +2 -5
  106. llama_stack/providers/inline/eval/meta_reference/eval.py +16 -61
  107. llama_stack/providers/inline/files/localfs/files.py +37 -28
  108. llama_stack/providers/inline/inference/meta_reference/config.py +2 -2
  109. llama_stack/providers/inline/inference/meta_reference/generators.py +50 -60
  110. llama_stack/providers/inline/inference/meta_reference/inference.py +403 -19
  111. llama_stack/providers/inline/inference/meta_reference/model_parallel.py +7 -26
  112. llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +2 -12
  113. llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +10 -15
  114. llama_stack/providers/inline/post_training/common/validator.py +1 -5
  115. llama_stack/providers/inline/post_training/huggingface/post_training.py +8 -8
  116. llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py +18 -10
  117. llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device_dpo.py +12 -9
  118. llama_stack/providers/inline/post_training/huggingface/utils.py +27 -6
  119. llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py +1 -1
  120. llama_stack/providers/inline/post_training/torchtune/common/utils.py +1 -1
  121. llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py +1 -1
  122. llama_stack/providers/inline/post_training/torchtune/post_training.py +8 -8
  123. llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +16 -16
  124. llama_stack/providers/inline/safety/code_scanner/code_scanner.py +13 -9
  125. llama_stack/providers/inline/safety/llama_guard/llama_guard.py +18 -15
  126. llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +9 -9
  127. llama_stack/providers/inline/scoring/basic/scoring.py +6 -13
  128. llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py +1 -2
  129. llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py +1 -2
  130. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py +2 -2
  131. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py +2 -2
  132. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py +2 -2
  133. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py +2 -2
  134. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py +2 -2
  135. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py +2 -2
  136. llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py +1 -2
  137. llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py +1 -2
  138. llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py +1 -2
  139. llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py +1 -2
  140. llama_stack/providers/inline/scoring/braintrust/braintrust.py +12 -15
  141. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py +2 -2
  142. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py +2 -2
  143. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py +2 -2
  144. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py +2 -2
  145. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py +2 -2
  146. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py +2 -2
  147. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py +2 -2
  148. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py +2 -2
  149. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py +2 -2
  150. llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +7 -14
  151. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py +2 -2
  152. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py +1 -2
  153. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py +1 -3
  154. llama_stack/providers/inline/tool_runtime/rag/__init__.py +1 -1
  155. llama_stack/providers/inline/tool_runtime/rag/config.py +8 -1
  156. llama_stack/providers/inline/tool_runtime/rag/context_retriever.py +7 -6
  157. llama_stack/providers/inline/tool_runtime/rag/memory.py +64 -48
  158. llama_stack/providers/inline/vector_io/chroma/__init__.py +1 -1
  159. llama_stack/providers/inline/vector_io/chroma/config.py +1 -1
  160. llama_stack/providers/inline/vector_io/faiss/__init__.py +1 -1
  161. llama_stack/providers/inline/vector_io/faiss/config.py +1 -1
  162. llama_stack/providers/inline/vector_io/faiss/faiss.py +43 -28
  163. llama_stack/providers/inline/vector_io/milvus/__init__.py +1 -1
  164. llama_stack/providers/inline/vector_io/milvus/config.py +1 -1
  165. llama_stack/providers/inline/vector_io/qdrant/__init__.py +1 -1
  166. llama_stack/providers/inline/vector_io/qdrant/config.py +1 -1
  167. llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py +1 -1
  168. llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +40 -33
  169. llama_stack/providers/registry/agents.py +7 -3
  170. llama_stack/providers/registry/batches.py +1 -1
  171. llama_stack/providers/registry/datasetio.py +1 -1
  172. llama_stack/providers/registry/eval.py +1 -1
  173. llama_stack/{apis/datasets/__init__.py → providers/registry/file_processors.py} +5 -1
  174. llama_stack/providers/registry/files.py +11 -2
  175. llama_stack/providers/registry/inference.py +22 -3
  176. llama_stack/providers/registry/post_training.py +1 -1
  177. llama_stack/providers/registry/safety.py +1 -1
  178. llama_stack/providers/registry/scoring.py +1 -1
  179. llama_stack/providers/registry/tool_runtime.py +2 -2
  180. llama_stack/providers/registry/vector_io.py +7 -7
  181. llama_stack/providers/remote/datasetio/huggingface/huggingface.py +2 -5
  182. llama_stack/providers/remote/datasetio/nvidia/datasetio.py +1 -4
  183. llama_stack/providers/remote/eval/nvidia/eval.py +15 -9
  184. llama_stack/providers/remote/files/openai/__init__.py +19 -0
  185. llama_stack/providers/remote/files/openai/config.py +28 -0
  186. llama_stack/providers/remote/files/openai/files.py +253 -0
  187. llama_stack/providers/remote/files/s3/files.py +52 -30
  188. llama_stack/providers/remote/inference/anthropic/anthropic.py +2 -1
  189. llama_stack/providers/remote/inference/anthropic/config.py +1 -1
  190. llama_stack/providers/remote/inference/azure/azure.py +1 -3
  191. llama_stack/providers/remote/inference/azure/config.py +8 -7
  192. llama_stack/providers/remote/inference/bedrock/__init__.py +1 -1
  193. llama_stack/providers/remote/inference/bedrock/bedrock.py +82 -105
  194. llama_stack/providers/remote/inference/bedrock/config.py +24 -3
  195. llama_stack/providers/remote/inference/cerebras/cerebras.py +5 -5
  196. llama_stack/providers/remote/inference/cerebras/config.py +12 -5
  197. llama_stack/providers/remote/inference/databricks/config.py +13 -6
  198. llama_stack/providers/remote/inference/databricks/databricks.py +16 -6
  199. llama_stack/providers/remote/inference/fireworks/config.py +5 -5
  200. llama_stack/providers/remote/inference/fireworks/fireworks.py +1 -1
  201. llama_stack/providers/remote/inference/gemini/config.py +1 -1
  202. llama_stack/providers/remote/inference/gemini/gemini.py +13 -14
  203. llama_stack/providers/remote/inference/groq/config.py +5 -5
  204. llama_stack/providers/remote/inference/groq/groq.py +1 -1
  205. llama_stack/providers/remote/inference/llama_openai_compat/config.py +5 -5
  206. llama_stack/providers/remote/inference/llama_openai_compat/llama.py +8 -6
  207. llama_stack/providers/remote/inference/nvidia/__init__.py +1 -1
  208. llama_stack/providers/remote/inference/nvidia/config.py +21 -11
  209. llama_stack/providers/remote/inference/nvidia/nvidia.py +115 -3
  210. llama_stack/providers/remote/inference/nvidia/utils.py +1 -1
  211. llama_stack/providers/remote/inference/oci/__init__.py +17 -0
  212. llama_stack/providers/remote/inference/oci/auth.py +79 -0
  213. llama_stack/providers/remote/inference/oci/config.py +75 -0
  214. llama_stack/providers/remote/inference/oci/oci.py +162 -0
  215. llama_stack/providers/remote/inference/ollama/config.py +7 -5
  216. llama_stack/providers/remote/inference/ollama/ollama.py +17 -8
  217. llama_stack/providers/remote/inference/openai/config.py +4 -4
  218. llama_stack/providers/remote/inference/openai/openai.py +1 -1
  219. llama_stack/providers/remote/inference/passthrough/__init__.py +2 -2
  220. llama_stack/providers/remote/inference/passthrough/config.py +5 -10
  221. llama_stack/providers/remote/inference/passthrough/passthrough.py +97 -75
  222. llama_stack/providers/remote/inference/runpod/config.py +12 -5
  223. llama_stack/providers/remote/inference/runpod/runpod.py +2 -20
  224. llama_stack/providers/remote/inference/sambanova/config.py +5 -5
  225. llama_stack/providers/remote/inference/sambanova/sambanova.py +1 -1
  226. llama_stack/providers/remote/inference/tgi/config.py +7 -6
  227. llama_stack/providers/remote/inference/tgi/tgi.py +19 -11
  228. llama_stack/providers/remote/inference/together/config.py +5 -5
  229. llama_stack/providers/remote/inference/together/together.py +15 -12
  230. llama_stack/providers/remote/inference/vertexai/config.py +1 -1
  231. llama_stack/providers/remote/inference/vllm/config.py +5 -5
  232. llama_stack/providers/remote/inference/vllm/vllm.py +13 -14
  233. llama_stack/providers/remote/inference/watsonx/config.py +4 -4
  234. llama_stack/providers/remote/inference/watsonx/watsonx.py +21 -94
  235. llama_stack/providers/remote/post_training/nvidia/post_training.py +4 -4
  236. llama_stack/providers/remote/post_training/nvidia/utils.py +1 -1
  237. llama_stack/providers/remote/safety/bedrock/bedrock.py +6 -6
  238. llama_stack/providers/remote/safety/bedrock/config.py +1 -1
  239. llama_stack/providers/remote/safety/nvidia/config.py +1 -1
  240. llama_stack/providers/remote/safety/nvidia/nvidia.py +11 -5
  241. llama_stack/providers/remote/safety/sambanova/config.py +1 -1
  242. llama_stack/providers/remote/safety/sambanova/sambanova.py +6 -6
  243. llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py +11 -6
  244. llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py +12 -7
  245. llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py +8 -2
  246. llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py +57 -15
  247. llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py +11 -6
  248. llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py +11 -6
  249. llama_stack/providers/remote/vector_io/chroma/__init__.py +1 -1
  250. llama_stack/providers/remote/vector_io/chroma/chroma.py +125 -20
  251. llama_stack/providers/remote/vector_io/chroma/config.py +1 -1
  252. llama_stack/providers/remote/vector_io/milvus/__init__.py +1 -1
  253. llama_stack/providers/remote/vector_io/milvus/config.py +1 -1
  254. llama_stack/providers/remote/vector_io/milvus/milvus.py +27 -21
  255. llama_stack/providers/remote/vector_io/pgvector/__init__.py +1 -1
  256. llama_stack/providers/remote/vector_io/pgvector/config.py +1 -1
  257. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +26 -18
  258. llama_stack/providers/remote/vector_io/qdrant/__init__.py +1 -1
  259. llama_stack/providers/remote/vector_io/qdrant/config.py +1 -1
  260. llama_stack/providers/remote/vector_io/qdrant/qdrant.py +141 -24
  261. llama_stack/providers/remote/vector_io/weaviate/__init__.py +1 -1
  262. llama_stack/providers/remote/vector_io/weaviate/config.py +1 -1
  263. llama_stack/providers/remote/vector_io/weaviate/weaviate.py +26 -21
  264. llama_stack/providers/utils/common/data_schema_validator.py +1 -5
  265. llama_stack/providers/utils/files/form_data.py +1 -1
  266. llama_stack/providers/utils/inference/embedding_mixin.py +1 -1
  267. llama_stack/providers/utils/inference/inference_store.py +12 -21
  268. llama_stack/providers/utils/inference/litellm_openai_mixin.py +79 -79
  269. llama_stack/providers/utils/inference/model_registry.py +1 -3
  270. llama_stack/providers/utils/inference/openai_compat.py +44 -1171
  271. llama_stack/providers/utils/inference/openai_mixin.py +68 -42
  272. llama_stack/providers/utils/inference/prompt_adapter.py +50 -265
  273. llama_stack/providers/utils/inference/stream_utils.py +23 -0
  274. llama_stack/providers/utils/memory/__init__.py +2 -0
  275. llama_stack/providers/utils/memory/file_utils.py +1 -1
  276. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +181 -84
  277. llama_stack/providers/utils/memory/vector_store.py +39 -38
  278. llama_stack/providers/utils/pagination.py +1 -1
  279. llama_stack/providers/utils/responses/responses_store.py +15 -25
  280. llama_stack/providers/utils/scoring/aggregation_utils.py +1 -2
  281. llama_stack/providers/utils/scoring/base_scoring_fn.py +1 -2
  282. llama_stack/providers/utils/tools/mcp.py +93 -11
  283. llama_stack/telemetry/constants.py +27 -0
  284. llama_stack/telemetry/helpers.py +43 -0
  285. llama_stack/testing/api_recorder.py +25 -16
  286. {llama_stack-0.3.4.dist-info → llama_stack-0.4.0.dist-info}/METADATA +56 -131
  287. llama_stack-0.4.0.dist-info/RECORD +588 -0
  288. llama_stack-0.4.0.dist-info/top_level.txt +2 -0
  289. llama_stack_api/__init__.py +945 -0
  290. llama_stack_api/admin/__init__.py +45 -0
  291. llama_stack_api/admin/api.py +72 -0
  292. llama_stack_api/admin/fastapi_routes.py +117 -0
  293. llama_stack_api/admin/models.py +113 -0
  294. llama_stack_api/agents.py +173 -0
  295. llama_stack_api/batches/__init__.py +40 -0
  296. llama_stack_api/batches/api.py +53 -0
  297. llama_stack_api/batches/fastapi_routes.py +113 -0
  298. llama_stack_api/batches/models.py +78 -0
  299. llama_stack_api/benchmarks/__init__.py +43 -0
  300. llama_stack_api/benchmarks/api.py +39 -0
  301. llama_stack_api/benchmarks/fastapi_routes.py +109 -0
  302. llama_stack_api/benchmarks/models.py +109 -0
  303. {llama_stack/apis → llama_stack_api}/common/content_types.py +1 -43
  304. {llama_stack/apis → llama_stack_api}/common/errors.py +0 -8
  305. {llama_stack/apis → llama_stack_api}/common/job_types.py +1 -1
  306. llama_stack_api/common/responses.py +77 -0
  307. {llama_stack/apis → llama_stack_api}/common/training_types.py +1 -1
  308. {llama_stack/apis → llama_stack_api}/common/type_system.py +2 -14
  309. llama_stack_api/connectors.py +146 -0
  310. {llama_stack/apis/conversations → llama_stack_api}/conversations.py +23 -39
  311. {llama_stack/apis/datasetio → llama_stack_api}/datasetio.py +4 -8
  312. llama_stack_api/datasets/__init__.py +61 -0
  313. llama_stack_api/datasets/api.py +35 -0
  314. llama_stack_api/datasets/fastapi_routes.py +104 -0
  315. llama_stack_api/datasets/models.py +152 -0
  316. {llama_stack/providers → llama_stack_api}/datatypes.py +166 -10
  317. {llama_stack/apis/eval → llama_stack_api}/eval.py +8 -40
  318. llama_stack_api/file_processors/__init__.py +27 -0
  319. llama_stack_api/file_processors/api.py +64 -0
  320. llama_stack_api/file_processors/fastapi_routes.py +78 -0
  321. llama_stack_api/file_processors/models.py +42 -0
  322. llama_stack_api/files/__init__.py +35 -0
  323. llama_stack_api/files/api.py +51 -0
  324. llama_stack_api/files/fastapi_routes.py +124 -0
  325. llama_stack_api/files/models.py +107 -0
  326. {llama_stack/apis/inference → llama_stack_api}/inference.py +90 -194
  327. llama_stack_api/inspect_api/__init__.py +37 -0
  328. llama_stack_api/inspect_api/api.py +25 -0
  329. llama_stack_api/inspect_api/fastapi_routes.py +76 -0
  330. llama_stack_api/inspect_api/models.py +28 -0
  331. {llama_stack/apis/agents → llama_stack_api/internal}/__init__.py +3 -1
  332. llama_stack/providers/utils/kvstore/api.py → llama_stack_api/internal/kvstore.py +5 -0
  333. llama_stack_api/internal/sqlstore.py +79 -0
  334. {llama_stack/apis/models → llama_stack_api}/models.py +11 -9
  335. {llama_stack/apis/agents → llama_stack_api}/openai_responses.py +184 -27
  336. {llama_stack/apis/post_training → llama_stack_api}/post_training.py +7 -11
  337. {llama_stack/apis/prompts → llama_stack_api}/prompts.py +3 -4
  338. llama_stack_api/providers/__init__.py +33 -0
  339. llama_stack_api/providers/api.py +16 -0
  340. llama_stack_api/providers/fastapi_routes.py +57 -0
  341. llama_stack_api/providers/models.py +24 -0
  342. {llama_stack/apis/tools → llama_stack_api}/rag_tool.py +2 -52
  343. {llama_stack/apis → llama_stack_api}/resource.py +1 -1
  344. llama_stack_api/router_utils.py +160 -0
  345. {llama_stack/apis/safety → llama_stack_api}/safety.py +6 -9
  346. {llama_stack → llama_stack_api}/schema_utils.py +94 -4
  347. {llama_stack/apis/scoring → llama_stack_api}/scoring.py +3 -3
  348. {llama_stack/apis/scoring_functions → llama_stack_api}/scoring_functions.py +9 -6
  349. {llama_stack/apis/shields → llama_stack_api}/shields.py +6 -7
  350. {llama_stack/apis/tools → llama_stack_api}/tools.py +26 -21
  351. {llama_stack/apis/vector_io → llama_stack_api}/vector_io.py +133 -152
  352. {llama_stack/apis/vector_stores → llama_stack_api}/vector_stores.py +1 -1
  353. llama_stack/apis/agents/agents.py +0 -894
  354. llama_stack/apis/batches/__init__.py +0 -9
  355. llama_stack/apis/batches/batches.py +0 -100
  356. llama_stack/apis/benchmarks/__init__.py +0 -7
  357. llama_stack/apis/benchmarks/benchmarks.py +0 -108
  358. llama_stack/apis/common/responses.py +0 -36
  359. llama_stack/apis/conversations/__init__.py +0 -31
  360. llama_stack/apis/datasets/datasets.py +0 -251
  361. llama_stack/apis/datatypes.py +0 -160
  362. llama_stack/apis/eval/__init__.py +0 -7
  363. llama_stack/apis/files/__init__.py +0 -7
  364. llama_stack/apis/files/files.py +0 -199
  365. llama_stack/apis/inference/__init__.py +0 -7
  366. llama_stack/apis/inference/event_logger.py +0 -43
  367. llama_stack/apis/inspect/__init__.py +0 -7
  368. llama_stack/apis/inspect/inspect.py +0 -94
  369. llama_stack/apis/models/__init__.py +0 -7
  370. llama_stack/apis/post_training/__init__.py +0 -7
  371. llama_stack/apis/prompts/__init__.py +0 -9
  372. llama_stack/apis/providers/__init__.py +0 -7
  373. llama_stack/apis/providers/providers.py +0 -69
  374. llama_stack/apis/safety/__init__.py +0 -7
  375. llama_stack/apis/scoring/__init__.py +0 -7
  376. llama_stack/apis/scoring_functions/__init__.py +0 -7
  377. llama_stack/apis/shields/__init__.py +0 -7
  378. llama_stack/apis/synthetic_data_generation/__init__.py +0 -7
  379. llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py +0 -77
  380. llama_stack/apis/telemetry/__init__.py +0 -7
  381. llama_stack/apis/telemetry/telemetry.py +0 -423
  382. llama_stack/apis/tools/__init__.py +0 -8
  383. llama_stack/apis/vector_io/__init__.py +0 -7
  384. llama_stack/apis/vector_stores/__init__.py +0 -7
  385. llama_stack/core/server/tracing.py +0 -80
  386. llama_stack/core/ui/app.py +0 -55
  387. llama_stack/core/ui/modules/__init__.py +0 -5
  388. llama_stack/core/ui/modules/api.py +0 -32
  389. llama_stack/core/ui/modules/utils.py +0 -42
  390. llama_stack/core/ui/page/__init__.py +0 -5
  391. llama_stack/core/ui/page/distribution/__init__.py +0 -5
  392. llama_stack/core/ui/page/distribution/datasets.py +0 -18
  393. llama_stack/core/ui/page/distribution/eval_tasks.py +0 -20
  394. llama_stack/core/ui/page/distribution/models.py +0 -18
  395. llama_stack/core/ui/page/distribution/providers.py +0 -27
  396. llama_stack/core/ui/page/distribution/resources.py +0 -48
  397. llama_stack/core/ui/page/distribution/scoring_functions.py +0 -18
  398. llama_stack/core/ui/page/distribution/shields.py +0 -19
  399. llama_stack/core/ui/page/evaluations/__init__.py +0 -5
  400. llama_stack/core/ui/page/evaluations/app_eval.py +0 -143
  401. llama_stack/core/ui/page/evaluations/native_eval.py +0 -253
  402. llama_stack/core/ui/page/playground/__init__.py +0 -5
  403. llama_stack/core/ui/page/playground/chat.py +0 -130
  404. llama_stack/core/ui/page/playground/tools.py +0 -352
  405. llama_stack/distributions/dell/build.yaml +0 -33
  406. llama_stack/distributions/meta-reference-gpu/build.yaml +0 -32
  407. llama_stack/distributions/nvidia/build.yaml +0 -29
  408. llama_stack/distributions/open-benchmark/build.yaml +0 -36
  409. llama_stack/distributions/postgres-demo/__init__.py +0 -7
  410. llama_stack/distributions/postgres-demo/build.yaml +0 -23
  411. llama_stack/distributions/postgres-demo/postgres_demo.py +0 -125
  412. llama_stack/distributions/starter/build.yaml +0 -61
  413. llama_stack/distributions/starter-gpu/build.yaml +0 -61
  414. llama_stack/distributions/watsonx/build.yaml +0 -33
  415. llama_stack/providers/inline/agents/meta_reference/agent_instance.py +0 -1024
  416. llama_stack/providers/inline/agents/meta_reference/persistence.py +0 -228
  417. llama_stack/providers/inline/telemetry/__init__.py +0 -5
  418. llama_stack/providers/inline/telemetry/meta_reference/__init__.py +0 -21
  419. llama_stack/providers/inline/telemetry/meta_reference/config.py +0 -47
  420. llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +0 -252
  421. llama_stack/providers/remote/inference/bedrock/models.py +0 -29
  422. llama_stack/providers/utils/kvstore/sqlite/config.py +0 -20
  423. llama_stack/providers/utils/sqlstore/__init__.py +0 -5
  424. llama_stack/providers/utils/sqlstore/api.py +0 -128
  425. llama_stack/providers/utils/telemetry/__init__.py +0 -5
  426. llama_stack/providers/utils/telemetry/trace_protocol.py +0 -142
  427. llama_stack/providers/utils/telemetry/tracing.py +0 -384
  428. llama_stack/strong_typing/__init__.py +0 -19
  429. llama_stack/strong_typing/auxiliary.py +0 -228
  430. llama_stack/strong_typing/classdef.py +0 -440
  431. llama_stack/strong_typing/core.py +0 -46
  432. llama_stack/strong_typing/deserializer.py +0 -877
  433. llama_stack/strong_typing/docstring.py +0 -409
  434. llama_stack/strong_typing/exception.py +0 -23
  435. llama_stack/strong_typing/inspection.py +0 -1085
  436. llama_stack/strong_typing/mapping.py +0 -40
  437. llama_stack/strong_typing/name.py +0 -182
  438. llama_stack/strong_typing/schema.py +0 -792
  439. llama_stack/strong_typing/serialization.py +0 -97
  440. llama_stack/strong_typing/serializer.py +0 -500
  441. llama_stack/strong_typing/slots.py +0 -27
  442. llama_stack/strong_typing/topological.py +0 -89
  443. llama_stack/ui/node_modules/flatted/python/flatted.py +0 -149
  444. llama_stack-0.3.4.dist-info/RECORD +0 -625
  445. llama_stack-0.3.4.dist-info/top_level.txt +0 -1
  446. /llama_stack/{providers/utils → core/storage}/kvstore/config.py +0 -0
  447. /llama_stack/{providers/utils → core/storage}/kvstore/mongodb/__init__.py +0 -0
  448. /llama_stack/{providers/utils → core/storage}/kvstore/postgres/__init__.py +0 -0
  449. /llama_stack/{providers/utils → core/storage}/kvstore/redis/__init__.py +0 -0
  450. /llama_stack/{providers/utils → core/storage}/kvstore/sqlite/__init__.py +0 -0
  451. /llama_stack/{apis → providers/inline/file_processor}/__init__.py +0 -0
  452. /llama_stack/{apis/common → telemetry}/__init__.py +0 -0
  453. {llama_stack-0.3.4.dist-info → llama_stack-0.4.0.dist-info}/WHEEL +0 -0
  454. {llama_stack-0.3.4.dist-info → llama_stack-0.4.0.dist-info}/entry_points.txt +0 -0
  455. {llama_stack-0.3.4.dist-info → llama_stack-0.4.0.dist-info}/licenses/LICENSE +0 -0
  456. {llama_stack/core/ui → llama_stack_api/common}/__init__.py +0 -0
  457. {llama_stack/strong_typing → llama_stack_api}/py.typed +0 -0
  458. {llama_stack/apis → llama_stack_api}/version.py +0 -0
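The bulk of this release is a package-layout change: the llama_stack/apis tree is replaced by a new top-level llama_stack_api package (entries 289-352), and the kvstore/sqlstore utilities move from llama_stack/providers/utils to llama_stack/core/storage (entries 45-54). As a rough sketch of what that implies for downstream imports (the exact symbols re-exported by llama_stack_api are an assumption, not confirmed by this listing):

try:
    # 0.4.0 layout (assumed): API models come from the new top-level package,
    # and the kvstore factory lives under llama_stack.core.storage.
    from llama_stack_api import OpenAIChatCompletion
    from llama_stack.core.storage.kvstore import kvstore_impl
except ImportError:
    # 0.3.4 layout: the same names under the old module paths.
    from llama_stack.apis.inference import OpenAIChatCompletion
    from llama_stack.providers.utils.kvstore import kvstore_impl

The single-file diff reproduced below appears to be entry 270, llama_stack/providers/utils/inference/openai_compat.py (+44 -1171), which drops the legacy Llama-Stack-to-OpenAI conversion helpers along with their imports.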
@@ -3,119 +3,15 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-import json
-import time
-import uuid
-import warnings
-from collections.abc import AsyncGenerator, AsyncIterator, Awaitable, Iterable
 from typing import (
     Any,
 )
 
-from openai import AsyncStream
-from openai.types.chat import (
-    ChatCompletionAssistantMessageParam as OpenAIChatCompletionAssistantMessage,
-)
-from openai.types.chat import (
-    ChatCompletionChunk as OpenAIChatCompletionChunk,
-)
-from openai.types.chat import (
-    ChatCompletionContentPartImageParam as OpenAIChatCompletionContentPartImageParam,
-)
-from openai.types.chat import (
-    ChatCompletionContentPartParam as OpenAIChatCompletionContentPartParam,
-)
-from openai.types.chat import (
-    ChatCompletionContentPartTextParam as OpenAIChatCompletionContentPartTextParam,
-)
-
-try:
-    from openai.types.chat import (
-        ChatCompletionMessageFunctionToolCall as OpenAIChatCompletionMessageFunctionToolCall,
-    )
-except ImportError:
-    from openai.types.chat.chat_completion_message_tool_call import (
-        ChatCompletionMessageToolCall as OpenAIChatCompletionMessageFunctionToolCall,
-    )
-from openai.types.chat import (
-    ChatCompletionMessageParam as OpenAIChatCompletionMessage,
-)
 from openai.types.chat import (
     ChatCompletionMessageToolCall,
 )
-from openai.types.chat import (
-    ChatCompletionSystemMessageParam as OpenAIChatCompletionSystemMessage,
-)
-from openai.types.chat import (
-    ChatCompletionToolMessageParam as OpenAIChatCompletionToolMessage,
-)
-from openai.types.chat import (
-    ChatCompletionUserMessageParam as OpenAIChatCompletionUserMessage,
-)
-from openai.types.chat.chat_completion import (
-    Choice as OpenAIChoice,
-)
-from openai.types.chat.chat_completion import (
-    ChoiceLogprobs as OpenAIChoiceLogprobs,  # same as chat_completion_chunk ChoiceLogprobs
-)
-from openai.types.chat.chat_completion_chunk import (
-    Choice as OpenAIChatCompletionChunkChoice,
-)
-from openai.types.chat.chat_completion_chunk import (
-    ChoiceDelta as OpenAIChoiceDelta,
-)
-from openai.types.chat.chat_completion_chunk import (
-    ChoiceDeltaToolCall as OpenAIChoiceDeltaToolCall,
-)
-from openai.types.chat.chat_completion_chunk import (
-    ChoiceDeltaToolCallFunction as OpenAIChoiceDeltaToolCallFunction,
-)
-from openai.types.chat.chat_completion_content_part_image_param import (
-    ImageURL as OpenAIImageURL,
-)
-from openai.types.chat.chat_completion_message_tool_call import (
-    Function as OpenAIFunction,
-)
 from pydantic import BaseModel
 
-from llama_stack.apis.common.content_types import (
-    URL,
-    ImageContentItem,
-    InterleavedContent,
-    TextContentItem,
-    TextDelta,
-    ToolCallDelta,
-    ToolCallParseStatus,
-    _URLOrData,
-)
-from llama_stack.apis.inference import (
-    ChatCompletionRequest,
-    ChatCompletionResponse,
-    ChatCompletionResponseEvent,
-    ChatCompletionResponseEventType,
-    ChatCompletionResponseStreamChunk,
-    CompletionMessage,
-    CompletionResponse,
-    CompletionResponseStreamChunk,
-    GreedySamplingStrategy,
-    JsonSchemaResponseFormat,
-    Message,
-    OpenAIChatCompletion,
-    OpenAIMessageParam,
-    OpenAIResponseFormatParam,
-    SamplingParams,
-    SystemMessage,
-    TokenLogProbs,
-    ToolChoice,
-    ToolConfig,
-    ToolResponseMessage,
-    TopKSamplingStrategy,
-    TopPSamplingStrategy,
-    UserMessage,
-)
-from llama_stack.apis.inference import (
-    OpenAIChoice as OpenAIChatCompletionChoice,
-)
 from llama_stack.log import get_logger
 from llama_stack.models.llama.datatypes import (
     BuiltinTool,
@@ -123,10 +19,6 @@ from llama_stack.models.llama.datatypes import (
     ToolCall,
     ToolDefinition,
 )
-from llama_stack.providers.utils.inference.prompt_adapter import (
-    convert_image_content_to_url,
-    decode_assistant_message,
-)
 
 logger = get_logger(name=__name__, category="providers::utils")
 
@@ -156,48 +48,14 @@ class OpenAICompatCompletionResponse(BaseModel):
     choices: list[OpenAICompatCompletionChoice]
 
 
-def get_sampling_strategy_options(params: SamplingParams) -> dict:
-    options = {}
-    if isinstance(params.strategy, GreedySamplingStrategy):
-        options["temperature"] = 0.0
-    elif isinstance(params.strategy, TopPSamplingStrategy):
-        options["temperature"] = params.strategy.temperature
-        options["top_p"] = params.strategy.top_p
-    elif isinstance(params.strategy, TopKSamplingStrategy):
-        options["top_k"] = params.strategy.top_k
-    else:
-        raise ValueError(f"Unsupported sampling strategy: {params.strategy}")
-
-    return options
-
-
-def get_sampling_options(params: SamplingParams | None) -> dict:
-    if not params:
-        return {}
-
-    options = {}
-    if params:
-        options.update(get_sampling_strategy_options(params))
-        if params.max_tokens:
-            options["max_tokens"] = params.max_tokens
-
-        if params.repetition_penalty is not None and params.repetition_penalty != 1.0:
-            options["repeat_penalty"] = params.repetition_penalty
-
-        if params.stop is not None:
-            options["stop"] = params.stop
-
-    return options
-
-
 def text_from_choice(choice) -> str:
     if hasattr(choice, "delta") and choice.delta:
-        return choice.delta.content
+        return choice.delta.content  # type: ignore[no-any-return] # external OpenAI types lack precise annotations
 
     if hasattr(choice, "message"):
-        return choice.message.content
+        return choice.message.content  # type: ignore[no-any-return] # external OpenAI types lack precise annotations
 
-    return choice.text
+    return choice.text  # type: ignore[no-any-return] # external OpenAI types lack precise annotations
 
 
 def get_stop_reason(finish_reason: str) -> StopReason:
@@ -211,341 +69,6 @@ def get_stop_reason(finish_reason: str) -> StopReason:
     return StopReason.out_of_tokens
 
 
-def convert_openai_completion_logprobs(
-    logprobs: OpenAICompatLogprobs | None,
-) -> list[TokenLogProbs] | None:
-    if not logprobs:
-        return None
-    if hasattr(logprobs, "top_logprobs"):
-        return [TokenLogProbs(logprobs_by_token=x) for x in logprobs.top_logprobs]
-
-    # Together supports logprobs with top_k=1 only. This means for each token position,
-    # they return only the logprobs for the selected token (vs. the top n most likely tokens).
-    # Here we construct the response by matching the selected token with the logprobs.
-    if logprobs.tokens and logprobs.token_logprobs:
-        return [
-            TokenLogProbs(logprobs_by_token={token: token_lp})
-            for token, token_lp in zip(logprobs.tokens, logprobs.token_logprobs, strict=False)
-        ]
-    return None
-
-
-def convert_openai_completion_logprobs_stream(text: str, logprobs: float | OpenAICompatLogprobs | None):
-    if logprobs is None:
-        return None
-    if isinstance(logprobs, float):
-        # Adapt response from Together CompletionChoicesChunk
-        return [TokenLogProbs(logprobs_by_token={text: logprobs})]
-    if hasattr(logprobs, "top_logprobs"):
-        return [TokenLogProbs(logprobs_by_token=x) for x in logprobs.top_logprobs]
-    return None
-
-
-def process_completion_response(
-    response: OpenAICompatCompletionResponse,
-) -> CompletionResponse:
-    choice = response.choices[0]
-    # drop suffix <eot_id> if present and return stop reason as end of turn
-    if choice.text.endswith("<|eot_id|>"):
-        return CompletionResponse(
-            stop_reason=StopReason.end_of_turn,
-            content=choice.text[: -len("<|eot_id|>")],
-            logprobs=convert_openai_completion_logprobs(choice.logprobs),
-        )
-    # drop suffix <eom_id> if present and return stop reason as end of message
-    if choice.text.endswith("<|eom_id|>"):
-        return CompletionResponse(
-            stop_reason=StopReason.end_of_message,
-            content=choice.text[: -len("<|eom_id|>")],
-            logprobs=convert_openai_completion_logprobs(choice.logprobs),
-        )
-    return CompletionResponse(
-        stop_reason=get_stop_reason(choice.finish_reason),
-        content=choice.text,
-        logprobs=convert_openai_completion_logprobs(choice.logprobs),
-    )
-
-
-def process_chat_completion_response(
-    response: OpenAICompatCompletionResponse,
-    request: ChatCompletionRequest,
-) -> ChatCompletionResponse:
-    choice = response.choices[0]
-    if choice.finish_reason == "tool_calls":
-        if not choice.message or not choice.message.tool_calls:
-            raise ValueError("Tool calls are not present in the response")
-
-        tool_calls = [convert_tool_call(tool_call) for tool_call in choice.message.tool_calls]
-        if any(isinstance(tool_call, UnparseableToolCall) for tool_call in tool_calls):
-            # If we couldn't parse a tool call, jsonify the tool calls and return them
-            return ChatCompletionResponse(
-                completion_message=CompletionMessage(
-                    stop_reason=StopReason.end_of_turn,
-                    content=json.dumps(tool_calls, default=lambda x: x.model_dump()),
-                ),
-                logprobs=None,
-            )
-        else:
-            # Otherwise, return tool calls as normal
-            return ChatCompletionResponse(
-                completion_message=CompletionMessage(
-                    tool_calls=tool_calls,
-                    stop_reason=StopReason.end_of_turn,
-                    # Content is not optional
-                    content="",
-                ),
-                logprobs=None,
-            )
-
-    # TODO: This does not work well with tool calls for vLLM remote provider
-    # Ref: https://github.com/meta-llama/llama-stack/issues/1058
-    raw_message = decode_assistant_message(text_from_choice(choice), get_stop_reason(choice.finish_reason))
-
-    # NOTE: If we do not set tools in chat-completion request, we should not
-    # expect the ToolCall in the response. Instead, we should return the raw
-    # response from the model.
-    if raw_message.tool_calls:
-        if not request.tools:
-            raw_message.tool_calls = []
-            raw_message.content = text_from_choice(choice)
-        else:
-            # only return tool_calls if provided in the request
-            new_tool_calls = []
-            request_tools = {t.tool_name: t for t in request.tools}
-            for t in raw_message.tool_calls:
-                if t.tool_name in request_tools:
-                    new_tool_calls.append(t)
-                else:
-                    logger.warning(f"Tool {t.tool_name} not found in request tools")
-
-            if len(new_tool_calls) < len(raw_message.tool_calls):
-                raw_message.tool_calls = new_tool_calls
-                raw_message.content = text_from_choice(choice)
-
-    return ChatCompletionResponse(
-        completion_message=CompletionMessage(
-            content=raw_message.content,
-            stop_reason=raw_message.stop_reason,
-            tool_calls=raw_message.tool_calls,
-        ),
-        logprobs=None,
-    )
-
-
-async def process_completion_stream_response(
-    stream: AsyncGenerator[OpenAICompatCompletionResponse, None],
-) -> AsyncGenerator[CompletionResponseStreamChunk, None]:
-    stop_reason = None
-
-    async for chunk in stream:
-        choice = chunk.choices[0]
-        finish_reason = choice.finish_reason
-
-        text = text_from_choice(choice)
-        if text == "<|eot_id|>":
-            stop_reason = StopReason.end_of_turn
-            text = ""
-            continue
-        elif text == "<|eom_id|>":
-            stop_reason = StopReason.end_of_message
-            text = ""
-            continue
-        yield CompletionResponseStreamChunk(
-            delta=text,
-            stop_reason=stop_reason,
-            logprobs=convert_openai_completion_logprobs_stream(text, choice.logprobs),
-        )
-        if finish_reason:
-            if finish_reason in ["stop", "eos", "eos_token"]:
-                stop_reason = StopReason.end_of_turn
-            elif finish_reason == "length":
-                stop_reason = StopReason.out_of_tokens
-            break
-
-    yield CompletionResponseStreamChunk(
-        delta="",
-        stop_reason=stop_reason,
-    )
-
-
-async def process_chat_completion_stream_response(
-    stream: AsyncGenerator[OpenAICompatCompletionResponse, None],
-    request: ChatCompletionRequest,
-) -> AsyncGenerator[ChatCompletionResponseStreamChunk, None]:
-    yield ChatCompletionResponseStreamChunk(
-        event=ChatCompletionResponseEvent(
-            event_type=ChatCompletionResponseEventType.start,
-            delta=TextDelta(text=""),
-        )
-    )
-
-    buffer = ""
-    ipython = False
-    stop_reason = None
-
-    async for chunk in stream:
-        choice = chunk.choices[0]
-        finish_reason = choice.finish_reason
-
-        if finish_reason:
-            if stop_reason is None and finish_reason in ["stop", "eos", "eos_token"]:
-                stop_reason = StopReason.end_of_turn
-            elif stop_reason is None and finish_reason == "length":
-                stop_reason = StopReason.out_of_tokens
-            break
-
-        text = text_from_choice(choice)
-        if not text:
-            # Sometimes you get empty chunks from providers
-            continue
-
-        # check if its a tool call ( aka starts with <|python_tag|> )
-        if not ipython and text.startswith("<|python_tag|>"):
-            ipython = True
-            yield ChatCompletionResponseStreamChunk(
-                event=ChatCompletionResponseEvent(
-                    event_type=ChatCompletionResponseEventType.progress,
-                    delta=ToolCallDelta(
-                        tool_call="",
-                        parse_status=ToolCallParseStatus.started,
-                    ),
-                )
-            )
-            buffer += text
-            continue
-
-        if text == "<|eot_id|>":
-            stop_reason = StopReason.end_of_turn
-            text = ""
-            continue
-        elif text == "<|eom_id|>":
-            stop_reason = StopReason.end_of_message
-            text = ""
-            continue
-
-        if ipython:
-            buffer += text
-            delta = ToolCallDelta(
-                tool_call=text,
-                parse_status=ToolCallParseStatus.in_progress,
-            )
-
-            yield ChatCompletionResponseStreamChunk(
-                event=ChatCompletionResponseEvent(
-                    event_type=ChatCompletionResponseEventType.progress,
-                    delta=delta,
-                    stop_reason=stop_reason,
-                )
-            )
-        else:
-            buffer += text
-            yield ChatCompletionResponseStreamChunk(
-                event=ChatCompletionResponseEvent(
-                    event_type=ChatCompletionResponseEventType.progress,
-                    delta=TextDelta(text=text),
-                    stop_reason=stop_reason,
-                )
-            )
-
-    # parse tool calls and report errors
-    message = decode_assistant_message(buffer, stop_reason)
-
-    parsed_tool_calls = len(message.tool_calls) > 0
-    if ipython and not parsed_tool_calls:
-        yield ChatCompletionResponseStreamChunk(
-            event=ChatCompletionResponseEvent(
-                event_type=ChatCompletionResponseEventType.progress,
-                delta=ToolCallDelta(
-                    tool_call="",
-                    parse_status=ToolCallParseStatus.failed,
-                ),
-                stop_reason=stop_reason,
-            )
-        )
-
-    request_tools = {t.tool_name: t for t in request.tools}
-    for tool_call in message.tool_calls:
-        if tool_call.tool_name in request_tools:
-            yield ChatCompletionResponseStreamChunk(
-                event=ChatCompletionResponseEvent(
-                    event_type=ChatCompletionResponseEventType.progress,
-                    delta=ToolCallDelta(
-                        tool_call=tool_call,
-                        parse_status=ToolCallParseStatus.succeeded,
-                    ),
-                    stop_reason=stop_reason,
-                )
-            )
-        else:
-            logger.warning(f"Tool {tool_call.tool_name} not found in request tools")
-            yield ChatCompletionResponseStreamChunk(
-                event=ChatCompletionResponseEvent(
-                    event_type=ChatCompletionResponseEventType.progress,
-                    delta=ToolCallDelta(
-                        # Parsing tool call failed due to tool call not being found in request tools,
-                        # We still add the raw message text inside tool_call for responding back to the user
-                        tool_call=buffer,
-                        parse_status=ToolCallParseStatus.failed,
-                    ),
-                    stop_reason=stop_reason,
-                )
-            )
-
-    yield ChatCompletionResponseStreamChunk(
-        event=ChatCompletionResponseEvent(
-            event_type=ChatCompletionResponseEventType.complete,
-            delta=TextDelta(text=""),
-            stop_reason=stop_reason,
-        )
-    )
-
-
-async def convert_message_to_openai_dict(message: Message, download: bool = False) -> dict:
-    async def _convert_content(content) -> dict:
-        if isinstance(content, ImageContentItem):
-            return {
-                "type": "image_url",
-                "image_url": {
-                    "url": await convert_image_content_to_url(content, download=download),
-                },
-            }
-        else:
-            text = content.text if isinstance(content, TextContentItem) else content
-            assert isinstance(text, str)
-            return {"type": "text", "text": text}
-
-    if isinstance(message.content, list):
-        content = [await _convert_content(c) for c in message.content]
-    else:
-        content = [await _convert_content(message.content)]
-
-    result = {
-        "role": message.role,
-        "content": content,
-    }
-
-    if hasattr(message, "tool_calls") and message.tool_calls:
-        result["tool_calls"] = []
-        for tc in message.tool_calls:
-            # The tool.tool_name can be a str or a BuiltinTool enum. If
-            # it's the latter, convert to a string.
-            tool_name = tc.tool_name
-            if isinstance(tool_name, BuiltinTool):
-                tool_name = tool_name.value
-
-            result["tool_calls"].append(
-                {
-                    "id": tc.call_id,
-                    "type": "function",
-                    "function": {
-                        "name": tool_name,
-                        "arguments": tc.arguments,
-                    },
-                }
-            )
-    return result
-
-
 class UnparseableToolCall(BaseModel):
     """
     A ToolCall with arguments that are not valid JSON.
@@ -557,112 +80,6 @@ class UnparseableToolCall(BaseModel):
     arguments: str = ""
 
 
-async def convert_message_to_openai_dict_new(
-    message: Message | dict,
-    download_images: bool = False,
-) -> OpenAIChatCompletionMessage:
-    """
-    Convert a Message to an OpenAI API-compatible dictionary.
-    """
-    # users can supply a dict instead of a Message object, we'll
-    # convert it to a Message object and proceed with some type safety.
-    if isinstance(message, dict):
-        if "role" not in message:
-            raise ValueError("role is required in message")
-        if message["role"] == "user":
-            message = UserMessage(**message)
-        elif message["role"] == "assistant":
-            message = CompletionMessage(**message)
-        elif message["role"] == "tool":
-            message = ToolResponseMessage(**message)
-        elif message["role"] == "system":
-            message = SystemMessage(**message)
-        else:
-            raise ValueError(f"Unsupported message role: {message['role']}")
-
-    # Map Llama Stack spec to OpenAI spec -
-    # str -> str
-    # {"type": "text", "text": ...} -> {"type": "text", "text": ...}
-    # {"type": "image", "image": {"url": {"uri": ...}}} -> {"type": "image_url", "image_url": {"url": ...}}
-    # {"type": "image", "image": {"data": ...}} -> {"type": "image_url", "image_url": {"url": "data:image/?;base64,..."}}
-    # List[...] -> List[...]
-    async def _convert_message_content(
-        content: InterleavedContent,
-    ) -> str | Iterable[OpenAIChatCompletionContentPartParam]:
-        async def impl(
-            content_: InterleavedContent,
-        ) -> str | OpenAIChatCompletionContentPartParam | list[OpenAIChatCompletionContentPartParam]:
-            # Llama Stack and OpenAI spec match for str and text input
-            if isinstance(content_, str):
-                return content_
-            elif isinstance(content_, TextContentItem):
-                return OpenAIChatCompletionContentPartTextParam(
-                    type="text",
-                    text=content_.text,
-                )
-            elif isinstance(content_, ImageContentItem):
-                return OpenAIChatCompletionContentPartImageParam(
-                    type="image_url",
-                    image_url=OpenAIImageURL(
-                        url=await convert_image_content_to_url(content_, download=download_images)
-                    ),
-                )
-            elif isinstance(content_, list):
-                return [await impl(item) for item in content_]
-            else:
-                raise ValueError(f"Unsupported content type: {type(content_)}")
-
-        ret = await impl(content)
-
-        # OpenAI*Message expects a str or list
-        if isinstance(ret, str) or isinstance(ret, list):
-            return ret
-        else:
-            return [ret]
-
-    out: OpenAIChatCompletionMessage = None
-    if isinstance(message, UserMessage):
-        out = OpenAIChatCompletionUserMessage(
-            role="user",
-            content=await _convert_message_content(message.content),
-        )
-    elif isinstance(message, CompletionMessage):
-        tool_calls = [
-            OpenAIChatCompletionMessageFunctionToolCall(
-                id=tool.call_id,
-                function=OpenAIFunction(
-                    name=(tool.tool_name if not isinstance(tool.tool_name, BuiltinTool) else tool.tool_name.value),
-                    arguments=tool.arguments,  # Already a JSON string, don't double-encode
-                ),
-                type="function",
-            )
-            for tool in message.tool_calls
-        ]
-        params = {}
-        if tool_calls:
-            params["tool_calls"] = tool_calls
-        out = OpenAIChatCompletionAssistantMessage(
-            role="assistant",
-            content=await _convert_message_content(message.content),
-            **params,
-        )
-    elif isinstance(message, ToolResponseMessage):
-        out = OpenAIChatCompletionToolMessage(
-            role="tool",
-            tool_call_id=message.call_id,
-            content=await _convert_message_content(message.content),
-        )
-    elif isinstance(message, SystemMessage):
-        out = OpenAIChatCompletionSystemMessage(
-            role="system",
-            content=await _convert_message_content(message.content),
-        )
-    else:
-        raise ValueError(f"Unsupported message type: {type(message)}")
-
-    return out
-
-
 def convert_tool_call(
     tool_call: ChatCompletionMessageToolCall,
 ) -> ToolCall | UnparseableToolCall:
@@ -758,16 +175,16 @@ def convert_tooldef_to_openai_tool(tool: ToolDefinition) -> dict:
     function = out["function"]
 
     if isinstance(tool.tool_name, BuiltinTool):
-        function["name"] = tool.tool_name.value
+        function["name"] = tool.tool_name.value  # type: ignore[index] # dict value inferred as Any but mypy sees Collection[str]
     else:
-        function["name"] = tool.tool_name
+        function["name"] = tool.tool_name  # type: ignore[index] # dict value inferred as Any but mypy sees Collection[str]
 
     if tool.description:
-        function["description"] = tool.description
+        function["description"] = tool.description  # type: ignore[index] # dict value inferred as Any but mypy sees Collection[str]
 
     if tool.input_schema:
         # Pass through the entire JSON Schema as-is
-        function["parameters"] = tool.input_schema
+        function["parameters"] = tool.input_schema  # type: ignore[index] # dict value inferred as Any but mypy sees Collection[str]
 
     # NOTE: OpenAI does not support output_schema, so we drop it here
     # It's stored in LlamaStack for validation and other provider usage
@@ -775,436 +192,6 @@ def convert_tooldef_to_openai_tool(tool: ToolDefinition) -> dict:
775
192
  return out
776
193
 
777
194
 
778
- def _convert_stop_reason_to_openai_finish_reason(stop_reason: StopReason) -> str:
779
- """
780
- Convert a StopReason to an OpenAI chat completion finish_reason.
781
- """
782
- return {
783
- StopReason.end_of_turn: "stop",
784
- StopReason.end_of_message: "tool_calls",
785
- StopReason.out_of_tokens: "length",
786
- }.get(stop_reason, "stop")
787
-
788
-
789
- def _convert_openai_finish_reason(finish_reason: str) -> StopReason:
790
- """
791
- Convert an OpenAI chat completion finish_reason to a StopReason.
792
-
793
- finish_reason: Literal["stop", "length", "tool_calls", ...]
794
- - stop: model hit a natural stop point or a provided stop sequence
795
- - length: maximum number of tokens specified in the request was reached
796
- - tool_calls: model called a tool
797
-
798
- ->
799
-
800
- class StopReason(Enum):
801
- end_of_turn = "end_of_turn"
802
- end_of_message = "end_of_message"
803
- out_of_tokens = "out_of_tokens"
804
- """
805
-
806
- # TODO(mf): are end_of_turn and end_of_message semantics correct?
807
- return {
808
- "stop": StopReason.end_of_turn,
809
- "length": StopReason.out_of_tokens,
810
- "tool_calls": StopReason.end_of_message,
811
- }.get(finish_reason, StopReason.end_of_turn)
812
-
813
-
814
-def _convert_openai_request_tool_config(tool_choice: str | dict[str, Any] | None = None) -> ToolConfig:
-    tool_config = ToolConfig()
-    if tool_choice:
-        try:
-            tool_choice = ToolChoice(tool_choice)
-        except ValueError:
-            pass
-        tool_config.tool_choice = tool_choice
-    return tool_config
-
-
-def _convert_openai_request_tools(tools: list[dict[str, Any]] | None = None) -> list[ToolDefinition]:
-    lls_tools = []
-    if not tools:
-        return lls_tools
-
-    for tool in tools:
-        tool_fn = tool.get("function", {})
-        tool_name = tool_fn.get("name", None)
-        tool_desc = tool_fn.get("description", None)
-        tool_params = tool_fn.get("parameters", None)
-
-        lls_tool = ToolDefinition(
-            tool_name=tool_name,
-            description=tool_desc,
-            input_schema=tool_params,  # Pass through entire JSON Schema
-        )
-        lls_tools.append(lls_tool)
-    return lls_tools
-
-
-def _convert_openai_request_response_format(
-    response_format: OpenAIResponseFormatParam = None,
-):
-    if not response_format:
-        return None
-    # response_format can be a dict or a pydantic model
-    response_format = dict(response_format)
-    if response_format.get("type", "") == "json_schema":
-        return JsonSchemaResponseFormat(
-            type="json_schema",
-            json_schema=response_format.get("json_schema", {}).get("schema", ""),
-        )
-    return None
-
-
-def _convert_openai_tool_calls(
-    tool_calls: list[OpenAIChatCompletionMessageFunctionToolCall],
-) -> list[ToolCall]:
-    """
-    Convert an OpenAI ChatCompletionMessageToolCall list into a list of ToolCall.
-
-    OpenAI ChatCompletionMessageToolCall:
-        id: str
-        function: Function
-        type: Literal["function"]
-
-    OpenAI Function:
-        arguments: str
-        name: str
-
-    ->
-
-    ToolCall:
-        call_id: str
-        tool_name: str
-        arguments: Dict[str, ...]
-    """
-    if not tool_calls:
-        return []  # CompletionMessage tool_calls is not optional
-
-    return [
-        ToolCall(
-            call_id=call.id,
-            tool_name=call.function.name,
-            arguments=call.function.arguments,
-        )
-        for call in tool_calls
-    ]
-
-
-def _convert_openai_logprobs(
-    logprobs: OpenAIChoiceLogprobs,
-) -> list[TokenLogProbs] | None:
-    """
-    Convert an OpenAI ChoiceLogprobs into a list of TokenLogProbs.
-
-    OpenAI ChoiceLogprobs:
-        content: Optional[List[ChatCompletionTokenLogprob]]
-
-    OpenAI ChatCompletionTokenLogprob:
-        token: str
-        logprob: float
-        top_logprobs: List[TopLogprob]
-
-    OpenAI TopLogprob:
-        token: str
-        logprob: float
-
-    ->
-
-    TokenLogProbs:
-        logprobs_by_token: Dict[str, float]
-            - token, logprob
-
-    """
-    if not logprobs or not logprobs.content:
-        return None
-
-    return [
-        TokenLogProbs(logprobs_by_token={logprobs.token: logprobs.logprob for logprobs in content.top_logprobs})
-        for content in logprobs.content
-    ]
-
-
-def _convert_openai_sampling_params(
-    max_tokens: int | None = None,
-    temperature: float | None = None,
-    top_p: float | None = None,
-) -> SamplingParams:
-    sampling_params = SamplingParams()
-
-    if max_tokens:
-        sampling_params.max_tokens = max_tokens
-
-    # Map an explicit temperature of 0 to greedy sampling
-    if temperature == 0:
-        strategy = GreedySamplingStrategy()
-    else:
-        # OpenAI defaults to 1.0 for temperature and top_p if unset
-        if temperature is None:
-            temperature = 1.0
-        if top_p is None:
-            top_p = 1.0
-        strategy = TopPSamplingStrategy(temperature=temperature, top_p=top_p)
-
-    sampling_params.strategy = strategy
-    return sampling_params
-
-
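Worth noting about the removed request converters above: temperature handling was asymmetric, with an explicit zero forcing greedy decoding and unset values defaulting to the OpenAI 1.0 defaults. Illustrative only, following the deleted code:

    _convert_openai_sampling_params(temperature=0).strategy    # GreedySamplingStrategy()
    _convert_openai_sampling_params().strategy                 # TopPSamplingStrategy(temperature=1.0, top_p=1.0)
    _convert_openai_sampling_params(max_tokens=256).max_tokens # 256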
-def openai_messages_to_messages(
-    messages: list[OpenAIMessageParam],
-) -> list[Message]:
-    """
-    Convert a list of OpenAIChatCompletionMessage into a list of Message.
-    """
-    converted_messages = []
-    for message in messages:
-        if message.role == "system":
-            converted_message = SystemMessage(content=openai_content_to_content(message.content))
-        elif message.role == "user":
-            converted_message = UserMessage(content=openai_content_to_content(message.content))
-        elif message.role == "assistant":
-            converted_message = CompletionMessage(
-                content=openai_content_to_content(message.content),
-                tool_calls=_convert_openai_tool_calls(message.tool_calls),
-                stop_reason=StopReason.end_of_turn,
-            )
-        elif message.role == "tool":
-            converted_message = ToolResponseMessage(
-                role="tool",
-                call_id=message.tool_call_id,
-                content=openai_content_to_content(message.content),
-            )
-        else:
-            raise ValueError(f"Unknown role {message.role}")
-        converted_messages.append(converted_message)
-    return converted_messages
-
-
-def openai_content_to_content(content: str | Iterable[OpenAIChatCompletionContentPartParam] | None):
-    if content is None:
-        return ""
-    if isinstance(content, str):
-        return content
-    elif isinstance(content, list):
-        return [openai_content_to_content(c) for c in content]
-    elif hasattr(content, "type"):
-        if content.type == "text":
-            return TextContentItem(type="text", text=content.text)
-        elif content.type == "image_url":
-            return ImageContentItem(type="image", image=_URLOrData(url=URL(uri=content.image_url.url)))
-        else:
-            raise ValueError(f"Unknown content type: {content.type}")
-    else:
-        raise ValueError(f"Unknown content type: {content}")
-
-
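The removed message converter above mapped OpenAI roles onto LlamaStack message types one-to-one. A minimal sketch, where the OpenAISystemMessageParam and OpenAIUserMessageParam class names are an assumption about the pydantic param types being passed in:

    msgs = openai_messages_to_messages([
        OpenAISystemMessageParam(role="system", content="be terse"),  # hypothetical inputs
        OpenAIUserMessageParam(role="user", content="hello"),
    ])
    # -> [SystemMessage(...), UserMessage(...)]; "assistant" mapped to CompletionMessage,
    #    "tool" to ToolResponseMessage, and any other role raised ValueError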
-def convert_openai_chat_completion_choice(
-    choice: OpenAIChoice,
-) -> ChatCompletionResponse:
-    """
-    Convert an OpenAI Choice into a ChatCompletionResponse.
-
-    OpenAI Choice:
-        message: ChatCompletionMessage
-        finish_reason: str
-        logprobs: Optional[ChoiceLogprobs]
-
-    OpenAI ChatCompletionMessage:
-        role: Literal["assistant"]
-        content: Optional[str]
-        tool_calls: Optional[List[ChatCompletionMessageToolCall]]
-
-    ->
-
-    ChatCompletionResponse:
-        completion_message: CompletionMessage
-        logprobs: Optional[List[TokenLogProbs]]
-
-    CompletionMessage:
-        role: Literal["assistant"]
-        content: str | ImageMedia | List[str | ImageMedia]
-        stop_reason: StopReason
-        tool_calls: List[ToolCall]
-
-    class StopReason(Enum):
-        end_of_turn = "end_of_turn"
-        end_of_message = "end_of_message"
-        out_of_tokens = "out_of_tokens"
-    """
-    assert hasattr(choice, "message") and choice.message, "error in server response: message not found"
-    assert hasattr(choice, "finish_reason") and choice.finish_reason, (
-        "error in server response: finish_reason not found"
-    )
-
-    return ChatCompletionResponse(
-        completion_message=CompletionMessage(
-            content=choice.message.content or "",  # CompletionMessage content is not optional
-            stop_reason=_convert_openai_finish_reason(choice.finish_reason),
-            tool_calls=_convert_openai_tool_calls(choice.message.tool_calls),
-        ),
-        logprobs=_convert_openai_logprobs(getattr(choice, "logprobs", None)),
-    )
-
-
-async def convert_openai_chat_completion_stream(
-    stream: AsyncStream[OpenAIChatCompletionChunk],
-    enable_incremental_tool_calls: bool,
-) -> AsyncGenerator[ChatCompletionResponseStreamChunk, None]:
-    """
-    Convert a stream of OpenAI chat completion chunks into a stream
-    of ChatCompletionResponseStreamChunk.
-    """
-    yield ChatCompletionResponseStreamChunk(
-        event=ChatCompletionResponseEvent(
-            event_type=ChatCompletionResponseEventType.start,
-            delta=TextDelta(text=""),
-        )
-    )
-    event_type = ChatCompletionResponseEventType.progress
-
-    stop_reason = None
-    tool_call_idx_to_buffer = {}
-
-    async for chunk in stream:
-        choice = chunk.choices[0]  # assuming only one choice per chunk
-
-        # we assume there's only one finish_reason in the stream
-        stop_reason = _convert_openai_finish_reason(choice.finish_reason) or stop_reason
-        logprobs = getattr(choice, "logprobs", None)
-
-        # if there's a tool call, emit an event for each tool in the list
-        # if tool call and content, emit both separately
-        if choice.delta.tool_calls:
-            # the call may have content and a tool call. ChatCompletionResponseEvent
-            # does not support both, so we emit the content first
-            if choice.delta.content:
-                yield ChatCompletionResponseStreamChunk(
-                    event=ChatCompletionResponseEvent(
-                        event_type=event_type,
-                        delta=TextDelta(text=choice.delta.content),
-                        logprobs=_convert_openai_logprobs(logprobs),
-                    )
-                )
-
-            # it is possible to have parallel tool calls in stream, but
-            # ChatCompletionResponseEvent only supports one per stream
-            if len(choice.delta.tool_calls) > 1:
-                warnings.warn(
-                    "multiple tool calls found in a single delta, using the first, ignoring the rest",
-                    stacklevel=2,
-                )
-
-            if not enable_incremental_tool_calls:
-                for tool_call in choice.delta.tool_calls:
-                    yield ChatCompletionResponseStreamChunk(
-                        event=ChatCompletionResponseEvent(
-                            event_type=event_type,
-                            delta=ToolCallDelta(
-                                tool_call=_convert_openai_tool_calls([tool_call])[0],
-                                parse_status=ToolCallParseStatus.succeeded,
-                            ),
-                            logprobs=_convert_openai_logprobs(logprobs),
-                        )
-                    )
-            else:
-                for tool_call in choice.delta.tool_calls:
-                    idx = tool_call.index if hasattr(tool_call, "index") else 0
-
-                    if idx not in tool_call_idx_to_buffer:
-                        tool_call_idx_to_buffer[idx] = {
-                            "call_id": tool_call.id,
-                            "name": None,
-                            "arguments": "",
-                            "content": "",
-                        }
-
-                    buffer = tool_call_idx_to_buffer[idx]
-
-                    if tool_call.function:
-                        if tool_call.function.name:
-                            buffer["name"] = tool_call.function.name
-                            delta = f"{buffer['name']}("
-                            buffer["content"] += delta
-
-                        if tool_call.function.arguments:
-                            delta = tool_call.function.arguments
-                            buffer["arguments"] += delta
-                            buffer["content"] += delta
-
-                        yield ChatCompletionResponseStreamChunk(
-                            event=ChatCompletionResponseEvent(
-                                event_type=event_type,
-                                delta=ToolCallDelta(
-                                    tool_call=delta,
-                                    parse_status=ToolCallParseStatus.in_progress,
-                                ),
-                                logprobs=_convert_openai_logprobs(logprobs),
-                            )
-                        )
-        elif choice.delta.content:
-            yield ChatCompletionResponseStreamChunk(
-                event=ChatCompletionResponseEvent(
-                    event_type=event_type,
-                    delta=TextDelta(text=choice.delta.content or ""),
-                    logprobs=_convert_openai_logprobs(logprobs),
-                )
-            )
-
-    for idx, buffer in tool_call_idx_to_buffer.items():
-        logger.debug(f"toolcall_buffer[{idx}]: {buffer}")
-        if buffer["name"]:
-            delta = ")"
-            buffer["content"] += delta
-            yield ChatCompletionResponseStreamChunk(
-                event=ChatCompletionResponseEvent(
-                    event_type=event_type,
-                    delta=ToolCallDelta(
-                        tool_call=delta,
-                        parse_status=ToolCallParseStatus.in_progress,
-                    ),
-                    logprobs=None,
-                )
-            )
-
-            try:
-                tool_call = ToolCall(
-                    call_id=buffer["call_id"],
-                    tool_name=buffer["name"],
-                    arguments=buffer["arguments"],
-                )
-                yield ChatCompletionResponseStreamChunk(
-                    event=ChatCompletionResponseEvent(
-                        event_type=ChatCompletionResponseEventType.progress,
-                        delta=ToolCallDelta(
-                            tool_call=tool_call,
-                            parse_status=ToolCallParseStatus.succeeded,
-                        ),
-                        stop_reason=stop_reason,
-                    )
-                )
-            except json.JSONDecodeError as e:
-                print(f"Failed to parse arguments: {e}")
-                yield ChatCompletionResponseStreamChunk(
-                    event=ChatCompletionResponseEvent(
-                        event_type=ChatCompletionResponseEventType.progress,
-                        delta=ToolCallDelta(
-                            tool_call=buffer["content"],
-                            parse_status=ToolCallParseStatus.failed,
-                        ),
-                        stop_reason=stop_reason,
-                    )
-                )
-
-    yield ChatCompletionResponseStreamChunk(
-        event=ChatCompletionResponseEvent(
-            event_type=ChatCompletionResponseEventType.complete,
-            delta=TextDelta(text=""),
-            stop_reason=stop_reason,
-        )
-    )
-
-
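To make the removed streaming path concrete: a caller would have iterated the generator roughly as sketched here (illustrative only, using the names defined in the deleted code; the drain wrapper is made up):

    async def drain(stream: AsyncStream[OpenAIChatCompletionChunk]) -> None:
        async for chunk in convert_openai_chat_completion_stream(stream, enable_incremental_tool_calls=True):
            # start event first, then progress events carrying TextDelta or ToolCallDelta
            # payloads, then a final complete event with the accumulated stop_reason
            handle(chunk.event)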
 async def prepare_openai_completion_params(**params):
     async def _prepare_value(value: Any) -> Any:
         new_value = value
@@ -1223,157 +210,6 @@ async def prepare_openai_completion_params(**params):
     return completion_params


-class OpenAIChatCompletionToLlamaStackMixin:
-    async def openai_chat_completion(
-        self,
-        model: str,
-        messages: list[OpenAIMessageParam],
-        frequency_penalty: float | None = None,
-        function_call: str | dict[str, Any] | None = None,
-        functions: list[dict[str, Any]] | None = None,
-        logit_bias: dict[str, float] | None = None,
-        logprobs: bool | None = None,
-        max_completion_tokens: int | None = None,
-        max_tokens: int | None = None,
-        n: int | None = None,
-        parallel_tool_calls: bool | None = None,
-        presence_penalty: float | None = None,
-        response_format: OpenAIResponseFormatParam | None = None,
-        seed: int | None = None,
-        stop: str | list[str] | None = None,
-        stream: bool | None = None,
-        stream_options: dict[str, Any] | None = None,
-        temperature: float | None = None,
-        tool_choice: str | dict[str, Any] | None = None,
-        tools: list[dict[str, Any]] | None = None,
-        top_logprobs: int | None = None,
-        top_p: float | None = None,
-        user: str | None = None,
-    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
-        messages = openai_messages_to_messages(messages)
-        response_format = _convert_openai_request_response_format(response_format)
-        sampling_params = _convert_openai_sampling_params(
-            max_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-        )
-        tool_config = _convert_openai_request_tool_config(tool_choice)
-
-        tools = _convert_openai_request_tools(tools)
-        if tool_config.tool_choice == ToolChoice.none:
-            tools = []
-
-        outstanding_responses = []
-        # "n" is the number of completions to generate per prompt
-        n = n or 1
-        for _i in range(0, n):
-            response = self.chat_completion(
-                model_id=model,
-                messages=messages,
-                sampling_params=sampling_params,
-                response_format=response_format,
-                stream=stream,
-                tool_config=tool_config,
-                tools=tools,
-            )
-            outstanding_responses.append(response)
-
-        if stream:
-            return OpenAIChatCompletionToLlamaStackMixin._process_stream_response(self, model, outstanding_responses)
-
-        return await OpenAIChatCompletionToLlamaStackMixin._process_non_stream_response(
-            self, model, outstanding_responses
-        )
-
-    async def _process_stream_response(
-        self,
-        model: str,
-        outstanding_responses: list[Awaitable[AsyncIterator[ChatCompletionResponseStreamChunk]]],
-    ):
-        id = f"chatcmpl-{uuid.uuid4()}"
-        for i, outstanding_response in enumerate(outstanding_responses):
-            response = await outstanding_response
-            async for chunk in response:
-                event = chunk.event
-                finish_reason = _convert_stop_reason_to_openai_finish_reason(event.stop_reason)
-
-                if isinstance(event.delta, TextDelta):
-                    text_delta = event.delta.text
-                    delta = OpenAIChoiceDelta(content=text_delta)
-                    yield OpenAIChatCompletionChunk(
-                        id=id,
-                        choices=[OpenAIChatCompletionChunkChoice(index=i, finish_reason=finish_reason, delta=delta)],
-                        created=int(time.time()),
-                        model=model,
-                        object="chat.completion.chunk",
-                    )
-                elif isinstance(event.delta, ToolCallDelta):
-                    if event.delta.parse_status == ToolCallParseStatus.succeeded:
-                        tool_call = event.delta.tool_call
-
-                        # First chunk includes full structure
-                        openai_tool_call = OpenAIChoiceDeltaToolCall(
-                            index=0,
-                            id=tool_call.call_id,
-                            function=OpenAIChoiceDeltaToolCallFunction(
-                                name=tool_call.tool_name,
-                                arguments="",
-                            ),
-                        )
-                        delta = OpenAIChoiceDelta(tool_calls=[openai_tool_call])
-                        yield OpenAIChatCompletionChunk(
-                            id=id,
-                            choices=[
-                                OpenAIChatCompletionChunkChoice(index=i, finish_reason=finish_reason, delta=delta)
-                            ],
-                            created=int(time.time()),
-                            model=model,
-                            object="chat.completion.chunk",
-                        )
-                        # arguments
-                        openai_tool_call = OpenAIChoiceDeltaToolCall(
-                            index=0,
-                            function=OpenAIChoiceDeltaToolCallFunction(
-                                arguments=tool_call.arguments,
-                            ),
-                        )
-                        delta = OpenAIChoiceDelta(tool_calls=[openai_tool_call])
-                        yield OpenAIChatCompletionChunk(
-                            id=id,
-                            choices=[
-                                OpenAIChatCompletionChunkChoice(index=i, finish_reason=finish_reason, delta=delta)
-                            ],
-                            created=int(time.time()),
-                            model=model,
-                            object="chat.completion.chunk",
-                        )
-
-    async def _process_non_stream_response(
-        self, model: str, outstanding_responses: list[Awaitable[ChatCompletionResponse]]
-    ) -> OpenAIChatCompletion:
-        choices = []
-        for outstanding_response in outstanding_responses:
-            response = await outstanding_response
-            completion_message = response.completion_message
-            message = await convert_message_to_openai_dict_new(completion_message)
-            finish_reason = _convert_stop_reason_to_openai_finish_reason(completion_message.stop_reason)
-
-            choice = OpenAIChatCompletionChoice(
-                index=len(choices),
-                message=message,
-                finish_reason=finish_reason,
-            )
-            choices.append(choice)
-
-        return OpenAIChatCompletion(
-            id=f"chatcmpl-{uuid.uuid4()}",
-            choices=choices,
-            created=int(time.time()),
-            model=model,
-            object="chat.completion",
-        )
-
-
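For context on what the deletion above drops: providers pulled this shim in as a mixin to expose an OpenAI-compatible chat completion surface over the legacy chat_completion API. A hypothetical sketch of the old usage pattern (the adapter class name is made up; the mixin and the Inference protocol are the only names taken from the package):

    class MyLegacyInferenceAdapter(OpenAIChatCompletionToLlamaStackMixin, Inference):
        # inherits openai_chat_completion(), which fanned out n requests to
        # self.chat_completion() and re-shaped the results as OpenAI objects
        ...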
 def prepare_openai_embeddings_params(
     model: str,
     input: str | list[str],
@@ -1399,3 +235,40 @@ def prepare_openai_embeddings_params(
         params["user"] = user

     return params
+
+
+def get_stream_options_for_telemetry(
+    stream_options: dict[str, Any] | None,
+    is_streaming: bool,
+    supports_stream_options: bool = True,
+) -> dict[str, Any] | None:
+    """
+    Inject stream_options when streaming and telemetry is active.
+
+    Active telemetry takes precedence over caller preference to ensure
+    complete and consistent observability metrics.
+
+    Args:
+        stream_options: Existing stream options from the request
+        is_streaming: Whether this is a streaming request
+        supports_stream_options: Whether the provider supports stream_options parameter
+
+    Returns:
+        Updated stream_options with include_usage=True if conditions are met, otherwise original options
+    """
+    if not is_streaming:
+        return stream_options
+
+    if not supports_stream_options:
+        return stream_options
+
+    from opentelemetry import trace
+
+    span = trace.get_current_span()
+    if not span or not span.is_recording():
+        return stream_options
+
+    if stream_options is None:
+        return {"include_usage": True}
+
+    return {**stream_options, "include_usage": True}
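A plausible call-site sketch for the new helper (the surrounding parameter-building code is assumed and not part of this diff):

    params = await prepare_openai_completion_params(model=model, messages=messages, stream=True)
    params["stream_options"] = get_stream_options_for_telemetry(
        params.get("stream_options"), is_streaming=True
    )
    # With a recording OpenTelemetry span this yields {"include_usage": True}, so streaming
    # responses carry token usage for telemetry; otherwise the caller's original
    # stream_options pass through unchanged.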