llama-stack 0.3.5__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (458)
  1. llama_stack/__init__.py +0 -5
  2. llama_stack/cli/llama.py +3 -3
  3. llama_stack/cli/stack/_list_deps.py +12 -23
  4. llama_stack/cli/stack/list_stacks.py +37 -18
  5. llama_stack/cli/stack/run.py +121 -11
  6. llama_stack/cli/stack/utils.py +0 -127
  7. llama_stack/core/access_control/access_control.py +69 -28
  8. llama_stack/core/access_control/conditions.py +15 -5
  9. llama_stack/core/admin.py +267 -0
  10. llama_stack/core/build.py +6 -74
  11. llama_stack/core/client.py +1 -1
  12. llama_stack/core/configure.py +6 -6
  13. llama_stack/core/conversations/conversations.py +28 -25
  14. llama_stack/core/datatypes.py +271 -79
  15. llama_stack/core/distribution.py +15 -16
  16. llama_stack/core/external.py +3 -3
  17. llama_stack/core/inspect.py +98 -15
  18. llama_stack/core/library_client.py +73 -61
  19. llama_stack/core/prompts/prompts.py +12 -11
  20. llama_stack/core/providers.py +17 -11
  21. llama_stack/core/resolver.py +65 -56
  22. llama_stack/core/routers/__init__.py +8 -12
  23. llama_stack/core/routers/datasets.py +1 -4
  24. llama_stack/core/routers/eval_scoring.py +7 -4
  25. llama_stack/core/routers/inference.py +55 -271
  26. llama_stack/core/routers/safety.py +52 -24
  27. llama_stack/core/routers/tool_runtime.py +6 -48
  28. llama_stack/core/routers/vector_io.py +130 -51
  29. llama_stack/core/routing_tables/benchmarks.py +24 -20
  30. llama_stack/core/routing_tables/common.py +1 -4
  31. llama_stack/core/routing_tables/datasets.py +22 -22
  32. llama_stack/core/routing_tables/models.py +119 -6
  33. llama_stack/core/routing_tables/scoring_functions.py +7 -7
  34. llama_stack/core/routing_tables/shields.py +1 -2
  35. llama_stack/core/routing_tables/toolgroups.py +17 -7
  36. llama_stack/core/routing_tables/vector_stores.py +51 -16
  37. llama_stack/core/server/auth.py +5 -3
  38. llama_stack/core/server/auth_providers.py +36 -20
  39. llama_stack/core/server/fastapi_router_registry.py +84 -0
  40. llama_stack/core/server/quota.py +2 -2
  41. llama_stack/core/server/routes.py +79 -27
  42. llama_stack/core/server/server.py +102 -87
  43. llama_stack/core/stack.py +201 -58
  44. llama_stack/core/storage/datatypes.py +26 -3
  45. llama_stack/{providers/utils → core/storage}/kvstore/__init__.py +2 -0
  46. llama_stack/{providers/utils → core/storage}/kvstore/kvstore.py +55 -24
  47. llama_stack/{providers/utils → core/storage}/kvstore/mongodb/mongodb.py +13 -10
  48. llama_stack/{providers/utils → core/storage}/kvstore/postgres/postgres.py +28 -17
  49. llama_stack/{providers/utils → core/storage}/kvstore/redis/redis.py +41 -16
  50. llama_stack/{providers/utils → core/storage}/kvstore/sqlite/sqlite.py +1 -1
  51. llama_stack/core/storage/sqlstore/__init__.py +17 -0
  52. llama_stack/{providers/utils → core/storage}/sqlstore/authorized_sqlstore.py +69 -49
  53. llama_stack/{providers/utils → core/storage}/sqlstore/sqlalchemy_sqlstore.py +47 -17
  54. llama_stack/{providers/utils → core/storage}/sqlstore/sqlstore.py +25 -8
  55. llama_stack/core/store/registry.py +1 -1
  56. llama_stack/core/utils/config.py +8 -2
  57. llama_stack/core/utils/config_resolution.py +32 -29
  58. llama_stack/core/utils/context.py +4 -10
  59. llama_stack/core/utils/exec.py +9 -0
  60. llama_stack/core/utils/type_inspection.py +45 -0
  61. llama_stack/distributions/dell/{run.yaml → config.yaml} +3 -2
  62. llama_stack/distributions/dell/dell.py +2 -2
  63. llama_stack/distributions/dell/run-with-safety.yaml +3 -2
  64. llama_stack/distributions/meta-reference-gpu/{run.yaml → config.yaml} +3 -2
  65. llama_stack/distributions/meta-reference-gpu/meta_reference.py +2 -2
  66. llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +3 -2
  67. llama_stack/distributions/nvidia/{run.yaml → config.yaml} +4 -4
  68. llama_stack/distributions/nvidia/nvidia.py +1 -1
  69. llama_stack/distributions/nvidia/run-with-safety.yaml +4 -4
  70. llama_stack/{apis/datasetio → distributions/oci}/__init__.py +1 -1
  71. llama_stack/distributions/oci/config.yaml +134 -0
  72. llama_stack/distributions/oci/oci.py +108 -0
  73. llama_stack/distributions/open-benchmark/{run.yaml → config.yaml} +5 -4
  74. llama_stack/distributions/open-benchmark/open_benchmark.py +2 -3
  75. llama_stack/distributions/postgres-demo/{run.yaml → config.yaml} +4 -3
  76. llama_stack/distributions/starter/{run.yaml → config.yaml} +64 -13
  77. llama_stack/distributions/starter/run-with-postgres-store.yaml +64 -13
  78. llama_stack/distributions/starter/starter.py +8 -5
  79. llama_stack/distributions/starter-gpu/{run.yaml → config.yaml} +64 -13
  80. llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +64 -13
  81. llama_stack/distributions/template.py +13 -69
  82. llama_stack/distributions/watsonx/{run.yaml → config.yaml} +4 -3
  83. llama_stack/distributions/watsonx/watsonx.py +1 -1
  84. llama_stack/log.py +28 -11
  85. llama_stack/models/llama/checkpoint.py +6 -6
  86. llama_stack/models/llama/hadamard_utils.py +2 -0
  87. llama_stack/models/llama/llama3/generation.py +3 -1
  88. llama_stack/models/llama/llama3/interface.py +2 -5
  89. llama_stack/models/llama/llama3/multimodal/encoder_utils.py +3 -3
  90. llama_stack/models/llama/llama3/multimodal/image_transform.py +6 -6
  91. llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +1 -1
  92. llama_stack/models/llama/llama3/tool_utils.py +2 -1
  93. llama_stack/models/llama/llama4/prompt_templates/system_prompts.py +1 -1
  94. llama_stack/providers/inline/agents/meta_reference/__init__.py +3 -3
  95. llama_stack/providers/inline/agents/meta_reference/agents.py +44 -261
  96. llama_stack/providers/inline/agents/meta_reference/config.py +6 -1
  97. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +207 -57
  98. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +308 -47
  99. llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +162 -96
  100. llama_stack/providers/inline/agents/meta_reference/responses/types.py +23 -8
  101. llama_stack/providers/inline/agents/meta_reference/responses/utils.py +201 -33
  102. llama_stack/providers/inline/agents/meta_reference/safety.py +8 -13
  103. llama_stack/providers/inline/batches/reference/__init__.py +2 -4
  104. llama_stack/providers/inline/batches/reference/batches.py +78 -60
  105. llama_stack/providers/inline/datasetio/localfs/datasetio.py +2 -5
  106. llama_stack/providers/inline/eval/meta_reference/eval.py +16 -61
  107. llama_stack/providers/inline/files/localfs/files.py +37 -28
  108. llama_stack/providers/inline/inference/meta_reference/config.py +2 -2
  109. llama_stack/providers/inline/inference/meta_reference/generators.py +50 -60
  110. llama_stack/providers/inline/inference/meta_reference/inference.py +403 -19
  111. llama_stack/providers/inline/inference/meta_reference/model_parallel.py +7 -26
  112. llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +2 -12
  113. llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +10 -15
  114. llama_stack/providers/inline/post_training/common/validator.py +1 -5
  115. llama_stack/providers/inline/post_training/huggingface/post_training.py +8 -8
  116. llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py +18 -10
  117. llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device_dpo.py +12 -9
  118. llama_stack/providers/inline/post_training/huggingface/utils.py +27 -6
  119. llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py +1 -1
  120. llama_stack/providers/inline/post_training/torchtune/common/utils.py +1 -1
  121. llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py +1 -1
  122. llama_stack/providers/inline/post_training/torchtune/post_training.py +8 -8
  123. llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +16 -16
  124. llama_stack/providers/inline/safety/code_scanner/code_scanner.py +13 -9
  125. llama_stack/providers/inline/safety/llama_guard/llama_guard.py +18 -15
  126. llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +9 -9
  127. llama_stack/providers/inline/scoring/basic/scoring.py +6 -13
  128. llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py +1 -2
  129. llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py +1 -2
  130. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py +2 -2
  131. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py +2 -2
  132. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py +2 -2
  133. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py +2 -2
  134. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py +2 -2
  135. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py +2 -2
  136. llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py +1 -2
  137. llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py +1 -2
  138. llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py +1 -2
  139. llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py +1 -2
  140. llama_stack/providers/inline/scoring/braintrust/braintrust.py +12 -15
  141. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py +2 -2
  142. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py +2 -2
  143. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py +2 -2
  144. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py +2 -2
  145. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py +2 -2
  146. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py +2 -2
  147. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py +2 -2
  148. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py +2 -2
  149. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py +2 -2
  150. llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +7 -14
  151. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py +2 -2
  152. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py +1 -2
  153. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py +1 -3
  154. llama_stack/providers/inline/tool_runtime/rag/__init__.py +1 -1
  155. llama_stack/providers/inline/tool_runtime/rag/config.py +8 -1
  156. llama_stack/providers/inline/tool_runtime/rag/context_retriever.py +7 -6
  157. llama_stack/providers/inline/tool_runtime/rag/memory.py +64 -48
  158. llama_stack/providers/inline/vector_io/chroma/__init__.py +1 -1
  159. llama_stack/providers/inline/vector_io/chroma/config.py +1 -1
  160. llama_stack/providers/inline/vector_io/faiss/__init__.py +1 -1
  161. llama_stack/providers/inline/vector_io/faiss/config.py +1 -1
  162. llama_stack/providers/inline/vector_io/faiss/faiss.py +43 -28
  163. llama_stack/providers/inline/vector_io/milvus/__init__.py +1 -1
  164. llama_stack/providers/inline/vector_io/milvus/config.py +1 -1
  165. llama_stack/providers/inline/vector_io/qdrant/__init__.py +1 -1
  166. llama_stack/providers/inline/vector_io/qdrant/config.py +1 -1
  167. llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py +1 -1
  168. llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +40 -33
  169. llama_stack/providers/registry/agents.py +7 -3
  170. llama_stack/providers/registry/batches.py +1 -1
  171. llama_stack/providers/registry/datasetio.py +1 -1
  172. llama_stack/providers/registry/eval.py +1 -1
  173. llama_stack/{apis/datasets/__init__.py → providers/registry/file_processors.py} +5 -1
  174. llama_stack/providers/registry/files.py +11 -2
  175. llama_stack/providers/registry/inference.py +22 -3
  176. llama_stack/providers/registry/post_training.py +1 -1
  177. llama_stack/providers/registry/safety.py +1 -1
  178. llama_stack/providers/registry/scoring.py +1 -1
  179. llama_stack/providers/registry/tool_runtime.py +2 -2
  180. llama_stack/providers/registry/vector_io.py +7 -7
  181. llama_stack/providers/remote/datasetio/huggingface/huggingface.py +2 -5
  182. llama_stack/providers/remote/datasetio/nvidia/datasetio.py +1 -4
  183. llama_stack/providers/remote/eval/nvidia/eval.py +15 -9
  184. llama_stack/providers/remote/files/openai/__init__.py +19 -0
  185. llama_stack/providers/remote/files/openai/config.py +28 -0
  186. llama_stack/providers/remote/files/openai/files.py +253 -0
  187. llama_stack/providers/remote/files/s3/files.py +52 -30
  188. llama_stack/providers/remote/inference/anthropic/anthropic.py +2 -1
  189. llama_stack/providers/remote/inference/anthropic/config.py +1 -1
  190. llama_stack/providers/remote/inference/azure/azure.py +1 -3
  191. llama_stack/providers/remote/inference/azure/config.py +8 -7
  192. llama_stack/providers/remote/inference/bedrock/__init__.py +1 -1
  193. llama_stack/providers/remote/inference/bedrock/bedrock.py +82 -105
  194. llama_stack/providers/remote/inference/bedrock/config.py +24 -3
  195. llama_stack/providers/remote/inference/cerebras/cerebras.py +5 -5
  196. llama_stack/providers/remote/inference/cerebras/config.py +12 -5
  197. llama_stack/providers/remote/inference/databricks/config.py +13 -6
  198. llama_stack/providers/remote/inference/databricks/databricks.py +16 -6
  199. llama_stack/providers/remote/inference/fireworks/config.py +5 -5
  200. llama_stack/providers/remote/inference/fireworks/fireworks.py +1 -1
  201. llama_stack/providers/remote/inference/gemini/config.py +1 -1
  202. llama_stack/providers/remote/inference/gemini/gemini.py +13 -14
  203. llama_stack/providers/remote/inference/groq/config.py +5 -5
  204. llama_stack/providers/remote/inference/groq/groq.py +1 -1
  205. llama_stack/providers/remote/inference/llama_openai_compat/config.py +5 -5
  206. llama_stack/providers/remote/inference/llama_openai_compat/llama.py +8 -6
  207. llama_stack/providers/remote/inference/nvidia/__init__.py +1 -1
  208. llama_stack/providers/remote/inference/nvidia/config.py +21 -11
  209. llama_stack/providers/remote/inference/nvidia/nvidia.py +115 -3
  210. llama_stack/providers/remote/inference/nvidia/utils.py +1 -1
  211. llama_stack/providers/remote/inference/oci/__init__.py +17 -0
  212. llama_stack/providers/remote/inference/oci/auth.py +79 -0
  213. llama_stack/providers/remote/inference/oci/config.py +75 -0
  214. llama_stack/providers/remote/inference/oci/oci.py +162 -0
  215. llama_stack/providers/remote/inference/ollama/config.py +7 -5
  216. llama_stack/providers/remote/inference/ollama/ollama.py +17 -8
  217. llama_stack/providers/remote/inference/openai/config.py +4 -4
  218. llama_stack/providers/remote/inference/openai/openai.py +1 -1
  219. llama_stack/providers/remote/inference/passthrough/__init__.py +2 -2
  220. llama_stack/providers/remote/inference/passthrough/config.py +5 -10
  221. llama_stack/providers/remote/inference/passthrough/passthrough.py +97 -75
  222. llama_stack/providers/remote/inference/runpod/config.py +12 -5
  223. llama_stack/providers/remote/inference/runpod/runpod.py +2 -20
  224. llama_stack/providers/remote/inference/sambanova/config.py +5 -5
  225. llama_stack/providers/remote/inference/sambanova/sambanova.py +1 -1
  226. llama_stack/providers/remote/inference/tgi/config.py +7 -6
  227. llama_stack/providers/remote/inference/tgi/tgi.py +19 -11
  228. llama_stack/providers/remote/inference/together/config.py +5 -5
  229. llama_stack/providers/remote/inference/together/together.py +15 -12
  230. llama_stack/providers/remote/inference/vertexai/config.py +1 -1
  231. llama_stack/providers/remote/inference/vllm/config.py +5 -5
  232. llama_stack/providers/remote/inference/vllm/vllm.py +13 -14
  233. llama_stack/providers/remote/inference/watsonx/config.py +4 -4
  234. llama_stack/providers/remote/inference/watsonx/watsonx.py +21 -94
  235. llama_stack/providers/remote/post_training/nvidia/post_training.py +4 -4
  236. llama_stack/providers/remote/post_training/nvidia/utils.py +1 -1
  237. llama_stack/providers/remote/safety/bedrock/bedrock.py +6 -6
  238. llama_stack/providers/remote/safety/bedrock/config.py +1 -1
  239. llama_stack/providers/remote/safety/nvidia/config.py +1 -1
  240. llama_stack/providers/remote/safety/nvidia/nvidia.py +11 -5
  241. llama_stack/providers/remote/safety/sambanova/config.py +1 -1
  242. llama_stack/providers/remote/safety/sambanova/sambanova.py +6 -6
  243. llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py +11 -6
  244. llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py +12 -7
  245. llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py +8 -2
  246. llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py +57 -15
  247. llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py +11 -6
  248. llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py +11 -6
  249. llama_stack/providers/remote/vector_io/chroma/__init__.py +1 -1
  250. llama_stack/providers/remote/vector_io/chroma/chroma.py +125 -20
  251. llama_stack/providers/remote/vector_io/chroma/config.py +1 -1
  252. llama_stack/providers/remote/vector_io/milvus/__init__.py +1 -1
  253. llama_stack/providers/remote/vector_io/milvus/config.py +1 -1
  254. llama_stack/providers/remote/vector_io/milvus/milvus.py +27 -21
  255. llama_stack/providers/remote/vector_io/pgvector/__init__.py +1 -1
  256. llama_stack/providers/remote/vector_io/pgvector/config.py +1 -1
  257. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +26 -18
  258. llama_stack/providers/remote/vector_io/qdrant/__init__.py +1 -1
  259. llama_stack/providers/remote/vector_io/qdrant/config.py +1 -1
  260. llama_stack/providers/remote/vector_io/qdrant/qdrant.py +141 -24
  261. llama_stack/providers/remote/vector_io/weaviate/__init__.py +1 -1
  262. llama_stack/providers/remote/vector_io/weaviate/config.py +1 -1
  263. llama_stack/providers/remote/vector_io/weaviate/weaviate.py +26 -21
  264. llama_stack/providers/utils/common/data_schema_validator.py +1 -5
  265. llama_stack/providers/utils/files/form_data.py +1 -1
  266. llama_stack/providers/utils/inference/embedding_mixin.py +1 -1
  267. llama_stack/providers/utils/inference/inference_store.py +7 -8
  268. llama_stack/providers/utils/inference/litellm_openai_mixin.py +79 -79
  269. llama_stack/providers/utils/inference/model_registry.py +1 -3
  270. llama_stack/providers/utils/inference/openai_compat.py +44 -1171
  271. llama_stack/providers/utils/inference/openai_mixin.py +68 -42
  272. llama_stack/providers/utils/inference/prompt_adapter.py +50 -265
  273. llama_stack/providers/utils/inference/stream_utils.py +23 -0
  274. llama_stack/providers/utils/memory/__init__.py +2 -0
  275. llama_stack/providers/utils/memory/file_utils.py +1 -1
  276. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +181 -84
  277. llama_stack/providers/utils/memory/vector_store.py +39 -38
  278. llama_stack/providers/utils/pagination.py +1 -1
  279. llama_stack/providers/utils/responses/responses_store.py +15 -25
  280. llama_stack/providers/utils/scoring/aggregation_utils.py +1 -2
  281. llama_stack/providers/utils/scoring/base_scoring_fn.py +1 -2
  282. llama_stack/providers/utils/tools/mcp.py +93 -11
  283. llama_stack/telemetry/constants.py +27 -0
  284. llama_stack/telemetry/helpers.py +43 -0
  285. llama_stack/testing/api_recorder.py +25 -16
  286. {llama_stack-0.3.5.dist-info → llama_stack-0.4.0.dist-info}/METADATA +56 -54
  287. llama_stack-0.4.0.dist-info/RECORD +588 -0
  288. llama_stack-0.4.0.dist-info/top_level.txt +2 -0
  289. llama_stack_api/__init__.py +945 -0
  290. llama_stack_api/admin/__init__.py +45 -0
  291. llama_stack_api/admin/api.py +72 -0
  292. llama_stack_api/admin/fastapi_routes.py +117 -0
  293. llama_stack_api/admin/models.py +113 -0
  294. llama_stack_api/agents.py +173 -0
  295. llama_stack_api/batches/__init__.py +40 -0
  296. llama_stack_api/batches/api.py +53 -0
  297. llama_stack_api/batches/fastapi_routes.py +113 -0
  298. llama_stack_api/batches/models.py +78 -0
  299. llama_stack_api/benchmarks/__init__.py +43 -0
  300. llama_stack_api/benchmarks/api.py +39 -0
  301. llama_stack_api/benchmarks/fastapi_routes.py +109 -0
  302. llama_stack_api/benchmarks/models.py +109 -0
  303. {llama_stack/apis → llama_stack_api}/common/content_types.py +1 -43
  304. {llama_stack/apis → llama_stack_api}/common/errors.py +0 -8
  305. {llama_stack/apis → llama_stack_api}/common/job_types.py +1 -1
  306. llama_stack_api/common/responses.py +77 -0
  307. {llama_stack/apis → llama_stack_api}/common/training_types.py +1 -1
  308. {llama_stack/apis → llama_stack_api}/common/type_system.py +2 -14
  309. llama_stack_api/connectors.py +146 -0
  310. {llama_stack/apis/conversations → llama_stack_api}/conversations.py +23 -39
  311. {llama_stack/apis/datasetio → llama_stack_api}/datasetio.py +4 -8
  312. llama_stack_api/datasets/__init__.py +61 -0
  313. llama_stack_api/datasets/api.py +35 -0
  314. llama_stack_api/datasets/fastapi_routes.py +104 -0
  315. llama_stack_api/datasets/models.py +152 -0
  316. {llama_stack/providers → llama_stack_api}/datatypes.py +166 -10
  317. {llama_stack/apis/eval → llama_stack_api}/eval.py +8 -40
  318. llama_stack_api/file_processors/__init__.py +27 -0
  319. llama_stack_api/file_processors/api.py +64 -0
  320. llama_stack_api/file_processors/fastapi_routes.py +78 -0
  321. llama_stack_api/file_processors/models.py +42 -0
  322. llama_stack_api/files/__init__.py +35 -0
  323. llama_stack_api/files/api.py +51 -0
  324. llama_stack_api/files/fastapi_routes.py +124 -0
  325. llama_stack_api/files/models.py +107 -0
  326. {llama_stack/apis/inference → llama_stack_api}/inference.py +90 -194
  327. llama_stack_api/inspect_api/__init__.py +37 -0
  328. llama_stack_api/inspect_api/api.py +25 -0
  329. llama_stack_api/inspect_api/fastapi_routes.py +76 -0
  330. llama_stack_api/inspect_api/models.py +28 -0
  331. {llama_stack/apis/agents → llama_stack_api/internal}/__init__.py +3 -1
  332. llama_stack/providers/utils/kvstore/api.py → llama_stack_api/internal/kvstore.py +5 -0
  333. llama_stack_api/internal/sqlstore.py +79 -0
  334. {llama_stack/apis/models → llama_stack_api}/models.py +11 -9
  335. {llama_stack/apis/agents → llama_stack_api}/openai_responses.py +184 -27
  336. {llama_stack/apis/post_training → llama_stack_api}/post_training.py +7 -11
  337. {llama_stack/apis/prompts → llama_stack_api}/prompts.py +3 -4
  338. llama_stack_api/providers/__init__.py +33 -0
  339. llama_stack_api/providers/api.py +16 -0
  340. llama_stack_api/providers/fastapi_routes.py +57 -0
  341. llama_stack_api/providers/models.py +24 -0
  342. {llama_stack/apis/tools → llama_stack_api}/rag_tool.py +2 -52
  343. {llama_stack/apis → llama_stack_api}/resource.py +1 -1
  344. llama_stack_api/router_utils.py +160 -0
  345. {llama_stack/apis/safety → llama_stack_api}/safety.py +6 -9
  346. {llama_stack → llama_stack_api}/schema_utils.py +94 -4
  347. {llama_stack/apis/scoring → llama_stack_api}/scoring.py +3 -3
  348. {llama_stack/apis/scoring_functions → llama_stack_api}/scoring_functions.py +9 -6
  349. {llama_stack/apis/shields → llama_stack_api}/shields.py +6 -7
  350. {llama_stack/apis/tools → llama_stack_api}/tools.py +26 -21
  351. {llama_stack/apis/vector_io → llama_stack_api}/vector_io.py +133 -152
  352. {llama_stack/apis/vector_stores → llama_stack_api}/vector_stores.py +1 -1
  353. llama_stack/apis/agents/agents.py +0 -894
  354. llama_stack/apis/batches/__init__.py +0 -9
  355. llama_stack/apis/batches/batches.py +0 -100
  356. llama_stack/apis/benchmarks/__init__.py +0 -7
  357. llama_stack/apis/benchmarks/benchmarks.py +0 -108
  358. llama_stack/apis/common/responses.py +0 -36
  359. llama_stack/apis/conversations/__init__.py +0 -31
  360. llama_stack/apis/datasets/datasets.py +0 -251
  361. llama_stack/apis/datatypes.py +0 -160
  362. llama_stack/apis/eval/__init__.py +0 -7
  363. llama_stack/apis/files/__init__.py +0 -7
  364. llama_stack/apis/files/files.py +0 -199
  365. llama_stack/apis/inference/__init__.py +0 -7
  366. llama_stack/apis/inference/event_logger.py +0 -43
  367. llama_stack/apis/inspect/__init__.py +0 -7
  368. llama_stack/apis/inspect/inspect.py +0 -94
  369. llama_stack/apis/models/__init__.py +0 -7
  370. llama_stack/apis/post_training/__init__.py +0 -7
  371. llama_stack/apis/prompts/__init__.py +0 -9
  372. llama_stack/apis/providers/__init__.py +0 -7
  373. llama_stack/apis/providers/providers.py +0 -69
  374. llama_stack/apis/safety/__init__.py +0 -7
  375. llama_stack/apis/scoring/__init__.py +0 -7
  376. llama_stack/apis/scoring_functions/__init__.py +0 -7
  377. llama_stack/apis/shields/__init__.py +0 -7
  378. llama_stack/apis/synthetic_data_generation/__init__.py +0 -7
  379. llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py +0 -77
  380. llama_stack/apis/telemetry/__init__.py +0 -7
  381. llama_stack/apis/telemetry/telemetry.py +0 -423
  382. llama_stack/apis/tools/__init__.py +0 -8
  383. llama_stack/apis/vector_io/__init__.py +0 -7
  384. llama_stack/apis/vector_stores/__init__.py +0 -7
  385. llama_stack/core/server/tracing.py +0 -80
  386. llama_stack/core/ui/app.py +0 -55
  387. llama_stack/core/ui/modules/__init__.py +0 -5
  388. llama_stack/core/ui/modules/api.py +0 -32
  389. llama_stack/core/ui/modules/utils.py +0 -42
  390. llama_stack/core/ui/page/__init__.py +0 -5
  391. llama_stack/core/ui/page/distribution/__init__.py +0 -5
  392. llama_stack/core/ui/page/distribution/datasets.py +0 -18
  393. llama_stack/core/ui/page/distribution/eval_tasks.py +0 -20
  394. llama_stack/core/ui/page/distribution/models.py +0 -18
  395. llama_stack/core/ui/page/distribution/providers.py +0 -27
  396. llama_stack/core/ui/page/distribution/resources.py +0 -48
  397. llama_stack/core/ui/page/distribution/scoring_functions.py +0 -18
  398. llama_stack/core/ui/page/distribution/shields.py +0 -19
  399. llama_stack/core/ui/page/evaluations/__init__.py +0 -5
  400. llama_stack/core/ui/page/evaluations/app_eval.py +0 -143
  401. llama_stack/core/ui/page/evaluations/native_eval.py +0 -253
  402. llama_stack/core/ui/page/playground/__init__.py +0 -5
  403. llama_stack/core/ui/page/playground/chat.py +0 -130
  404. llama_stack/core/ui/page/playground/tools.py +0 -352
  405. llama_stack/distributions/dell/build.yaml +0 -33
  406. llama_stack/distributions/meta-reference-gpu/build.yaml +0 -32
  407. llama_stack/distributions/nvidia/build.yaml +0 -29
  408. llama_stack/distributions/open-benchmark/build.yaml +0 -36
  409. llama_stack/distributions/postgres-demo/__init__.py +0 -7
  410. llama_stack/distributions/postgres-demo/build.yaml +0 -23
  411. llama_stack/distributions/postgres-demo/postgres_demo.py +0 -125
  412. llama_stack/distributions/starter/build.yaml +0 -61
  413. llama_stack/distributions/starter-gpu/build.yaml +0 -61
  414. llama_stack/distributions/watsonx/build.yaml +0 -33
  415. llama_stack/providers/inline/agents/meta_reference/agent_instance.py +0 -1024
  416. llama_stack/providers/inline/agents/meta_reference/persistence.py +0 -228
  417. llama_stack/providers/inline/telemetry/__init__.py +0 -5
  418. llama_stack/providers/inline/telemetry/meta_reference/__init__.py +0 -21
  419. llama_stack/providers/inline/telemetry/meta_reference/config.py +0 -47
  420. llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +0 -252
  421. llama_stack/providers/remote/inference/bedrock/models.py +0 -29
  422. llama_stack/providers/utils/kvstore/sqlite/config.py +0 -20
  423. llama_stack/providers/utils/sqlstore/__init__.py +0 -5
  424. llama_stack/providers/utils/sqlstore/api.py +0 -128
  425. llama_stack/providers/utils/telemetry/__init__.py +0 -5
  426. llama_stack/providers/utils/telemetry/trace_protocol.py +0 -142
  427. llama_stack/providers/utils/telemetry/tracing.py +0 -384
  428. llama_stack/strong_typing/__init__.py +0 -19
  429. llama_stack/strong_typing/auxiliary.py +0 -228
  430. llama_stack/strong_typing/classdef.py +0 -440
  431. llama_stack/strong_typing/core.py +0 -46
  432. llama_stack/strong_typing/deserializer.py +0 -877
  433. llama_stack/strong_typing/docstring.py +0 -409
  434. llama_stack/strong_typing/exception.py +0 -23
  435. llama_stack/strong_typing/inspection.py +0 -1085
  436. llama_stack/strong_typing/mapping.py +0 -40
  437. llama_stack/strong_typing/name.py +0 -182
  438. llama_stack/strong_typing/schema.py +0 -792
  439. llama_stack/strong_typing/serialization.py +0 -97
  440. llama_stack/strong_typing/serializer.py +0 -500
  441. llama_stack/strong_typing/slots.py +0 -27
  442. llama_stack/strong_typing/topological.py +0 -89
  443. llama_stack/ui/node_modules/flatted/python/flatted.py +0 -149
  444. llama_stack-0.3.5.dist-info/RECORD +0 -625
  445. llama_stack-0.3.5.dist-info/top_level.txt +0 -1
  446. /llama_stack/{providers/utils → core/storage}/kvstore/config.py +0 -0
  447. /llama_stack/{providers/utils → core/storage}/kvstore/mongodb/__init__.py +0 -0
  448. /llama_stack/{providers/utils → core/storage}/kvstore/postgres/__init__.py +0 -0
  449. /llama_stack/{providers/utils → core/storage}/kvstore/redis/__init__.py +0 -0
  450. /llama_stack/{providers/utils → core/storage}/kvstore/sqlite/__init__.py +0 -0
  451. /llama_stack/{apis → providers/inline/file_processor}/__init__.py +0 -0
  452. /llama_stack/{apis/common → telemetry}/__init__.py +0 -0
  453. {llama_stack-0.3.5.dist-info → llama_stack-0.4.0.dist-info}/WHEEL +0 -0
  454. {llama_stack-0.3.5.dist-info → llama_stack-0.4.0.dist-info}/entry_points.txt +0 -0
  455. {llama_stack-0.3.5.dist-info → llama_stack-0.4.0.dist-info}/licenses/LICENSE +0 -0
  456. {llama_stack/core/ui → llama_stack_api/common}/__init__.py +0 -0
  457. {llama_stack/strong_typing → llama_stack_api}/py.typed +0 -0
  458. {llama_stack/apis → llama_stack_api}/version.py +0 -0
@@ -5,29 +5,26 @@
 # the root directory of this source tree.
 
 import math
-from collections.abc import Generator
 from typing import Optional
 
 import torch
 from lmformatenforcer import JsonSchemaParser, TokenEnforcer, TokenEnforcerTokenizerData
 
-from llama_stack.apis.inference import (
-    GreedySamplingStrategy,
-    JsonSchemaResponseFormat,
-    ResponseFormat,
-    SamplingParams,
-    TopPSamplingStrategy,
-)
-from llama_stack.models.llama.datatypes import QuantizationMode
+from llama_stack.models.llama.datatypes import QuantizationMode, ToolPromptFormat
 from llama_stack.models.llama.llama3.generation import Llama3
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer as Llama3Tokenizer
 from llama_stack.models.llama.llama4.generation import Llama4
 from llama_stack.models.llama.llama4.tokenizer import Tokenizer as Llama4Tokenizer
 from llama_stack.models.llama.sku_types import Model, ModelFamily
-from llama_stack.providers.utils.inference.prompt_adapter import (
-    ChatCompletionRequestWithRawContent,
-    CompletionRequestWithRawContent,
-    get_default_tool_prompt_format,
+from llama_stack_api import (
+    GreedySamplingStrategy,
+    JsonSchemaResponseFormat,
+    OpenAIChatCompletionRequestWithExtraBody,
+    OpenAIResponseFormatJSONSchema,
+    ResponseFormat,
+    ResponseFormatType,
+    SamplingParams,
+    TopPSamplingStrategy,
 )
 
 from .common import model_checkpoint_dir
@@ -106,14 +103,6 @@ def _infer_sampling_params(sampling_params: SamplingParams):
     return temperature, top_p
 
 
-def _infer_tool_prompt_format(request: ChatCompletionRequestWithRawContent):
-    tool_config = request.tool_config
-    if tool_config is not None and tool_config.tool_prompt_format is not None:
-        return tool_config.tool_prompt_format
-    else:
-        return get_default_tool_prompt_format(request.model)
-
-
 class LlamaGenerator:
     def __init__(
         self,
@@ -157,55 +146,56 @@ class LlamaGenerator:
         self.args = self.inner_generator.args
         self.formatter = self.inner_generator.formatter
 
-    def completion(
-        self,
-        request_batch: list[CompletionRequestWithRawContent],
-    ) -> Generator:
-        first_request = request_batch[0]
-        sampling_params = first_request.sampling_params or SamplingParams()
-        max_gen_len = sampling_params.max_tokens
-        if max_gen_len is None or max_gen_len == 0 or max_gen_len >= self.args.max_seq_len:
-            max_gen_len = self.args.max_seq_len - 1
-
-        temperature, top_p = _infer_sampling_params(sampling_params)
-        yield from self.inner_generator.generate(
-            llm_inputs=[self.formatter.encode_content(request.content) for request in request_batch],
-            max_gen_len=max_gen_len,
-            temperature=temperature,
-            top_p=top_p,
-            logprobs=bool(first_request.logprobs),
-            echo=False,
-            logits_processor=get_logits_processor(
-                self.tokenizer,
-                self.args.vocab_size,
-                first_request.response_format,
-            ),
-        )
-
     def chat_completion(
         self,
-        request_batch: list[ChatCompletionRequestWithRawContent],
-    ) -> Generator:
-        first_request = request_batch[0]
-        sampling_params = first_request.sampling_params or SamplingParams()
+        request: OpenAIChatCompletionRequestWithExtraBody,
+        raw_messages: list,
+    ):
+        """Generate chat completion using OpenAI request format.
+
+        Args:
+            request: OpenAI chat completion request
+            raw_messages: Pre-converted list of RawMessage objects
+        """
+
+        # Determine tool prompt format
+        tool_prompt_format = ToolPromptFormat.json if request.tools else ToolPromptFormat.json
+
+        # Prepare sampling params
+        sampling_params = SamplingParams()
+        if request.temperature is not None or request.top_p is not None:
+            sampling_params.strategy = TopPSamplingStrategy(
+                temperature=request.temperature if request.temperature is not None else 1.0,
+                top_p=request.top_p if request.top_p is not None else 1.0,
+            )
+        if request.max_tokens:
+            sampling_params.max_tokens = request.max_tokens
+
         max_gen_len = sampling_params.max_tokens
         if max_gen_len is None or max_gen_len == 0 or max_gen_len >= self.args.max_seq_len:
             max_gen_len = self.args.max_seq_len - 1
 
         temperature, top_p = _infer_sampling_params(sampling_params)
+
+        # Get logits processor for response format
+        logits_processor = None
+        if request.response_format:
+            if isinstance(request.response_format, OpenAIResponseFormatJSONSchema):
+                # Extract the actual schema from OpenAIJSONSchema TypedDict
+                schema_dict = request.response_format.json_schema.get("schema") or {}
+                json_schema_format = JsonSchemaResponseFormat(
+                    type=ResponseFormatType.json_schema,
+                    json_schema=schema_dict,
+                )
+                logits_processor = get_logits_processor(self.tokenizer, self.args.vocab_size, json_schema_format)
+
+        # Generate
         yield from self.inner_generator.generate(
-            llm_inputs=[
-                self.formatter.encode_dialog_prompt(request.messages, _infer_tool_prompt_format(request))
-                for request in request_batch
-            ],
+            llm_inputs=[self.formatter.encode_dialog_prompt(raw_messages, tool_prompt_format)],
             max_gen_len=max_gen_len,
             temperature=temperature,
             top_p=top_p,
-            logprobs=bool(first_request.logprobs),
+            logprobs=False,
             echo=False,
-            logits_processor=get_logits_processor(
-                self.tokenizer,
-                self.args.vocab_size,
-                first_request.response_format,
-            ),
+            logits_processor=logits_processor,
         )
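
The LlamaGenerator hunks above replace the batched completion/chat_completion entry points with a single chat_completion(request, raw_messages) generator that takes an OpenAI-format request. Below is a minimal sketch of driving it directly, assuming an already-initialized LlamaGenerator named generator and a hand-built raw message; the model id and message text are illustrative, everything else uses names from the hunks above.

from llama_stack.models.llama.datatypes import RawMessage, RawTextItem
from llama_stack_api import OpenAIChatCompletionRequestWithExtraBody, OpenAIUserMessageParam

# OpenAI-format request; field names come from the hunks above
request = OpenAIChatCompletionRequestWithExtraBody(
    model="meta-llama/Llama-3.1-8B-Instruct",  # illustrative model id
    messages=[OpenAIUserMessageParam(role="user", content="Hello!")],
    max_tokens=64,
    temperature=0.7,
)

# raw_messages are pre-converted RawMessage objects; in the provider they are
# produced by convert_openai_message_to_raw_message (see the hunks further below)
raw_messages = [RawMessage(role="user", content=[RawTextItem(text="Hello!")])]

# `generator` is an already-built LlamaGenerator (constructed via llama_builder_fn);
# it yields batches of token results, mirroring how the provider consumes it
output = ""
for result_batch in generator.chat_completion(request, raw_messages):
    for result in result_batch:
        if not result.ignore_token and result.source == "output":
            output += result.text
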
@@ -5,27 +5,25 @@
 # the root directory of this source tree.
 
 import asyncio
+import time
+import uuid
 from collections.abc import AsyncIterator
 
-from llama_stack.apis.inference import (
-    InferenceProvider,
-    OpenAIChatCompletionRequestWithExtraBody,
-    OpenAICompletionRequestWithExtraBody,
-)
-from llama_stack.apis.inference.inference import (
-    OpenAIChatCompletion,
-    OpenAIChatCompletionChunk,
-    OpenAICompletion,
-)
-from llama_stack.apis.models import Model, ModelType
 from llama_stack.log import get_logger
+from llama_stack.models.llama.datatypes import RawMessage, RawTextItem, ToolDefinition
 from llama_stack.models.llama.llama3.chat_format import ChatFormat as Llama3ChatFormat
+from llama_stack.models.llama.llama3.prompt_templates import (
+    JsonCustomToolGenerator,
+    SystemDefaultGenerator,
+)
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer as Llama3Tokenizer
 from llama_stack.models.llama.llama4.chat_format import ChatFormat as Llama4ChatFormat
+from llama_stack.models.llama.llama4.prompt_templates.system_prompts import (
+    PythonListCustomToolGenerator as PythonListCustomToolGeneratorLlama4,
+)
 from llama_stack.models.llama.llama4.tokenizer import Tokenizer as Llama4Tokenizer
 from llama_stack.models.llama.sku_list import resolve_model
-from llama_stack.models.llama.sku_types import ModelFamily
-from llama_stack.providers.datatypes import ModelsProtocolPrivate
+from llama_stack.models.llama.sku_types import ModelFamily, is_multimodal
 from llama_stack.providers.utils.inference.embedding_mixin import (
     SentenceTransformerEmbeddingMixin,
 )
@@ -33,6 +31,22 @@ from llama_stack.providers.utils.inference.model_registry import (
     ModelRegistryHelper,
     build_hf_repo_model_entry,
 )
+from llama_stack_api import (
+    InferenceProvider,
+    Model,
+    ModelsProtocolPrivate,
+    ModelType,
+    OpenAIAssistantMessageParam,
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAIChatCompletionRequestWithExtraBody,
+    OpenAIChatCompletionUsage,
+    OpenAIChoice,
+    OpenAICompletion,
+    OpenAICompletionRequestWithExtraBody,
+    OpenAIUserMessageParam,
+    ToolChoice,
+)
 
 from .config import MetaReferenceInferenceConfig
 from .generators import LlamaGenerator
@@ -44,6 +58,170 @@ log = get_logger(__name__, category="inference")
 SEMAPHORE = asyncio.Semaphore(1)
 
 
+def _convert_openai_tool_to_tool_definition(tool) -> ToolDefinition:
+    """Convert OpenAI tool format to ToolDefinition format."""
+    # OpenAI tools have function.name and function.parameters
+    return ToolDefinition(
+        tool_name=tool.function.name,
+        description=tool.function.description or "",
+        parameters=tool.function.parameters or {},
+    )
+
+
+def _get_tool_choice_prompt(tool_choice, tools) -> str:
+    """Generate prompt text for tool_choice behavior."""
+    if not tool_choice or tool_choice == ToolChoice.auto or tool_choice == "auto":
+        return ""
+    elif tool_choice == ToolChoice.required or tool_choice == "required":
+        return "You MUST use one of the provided functions/tools to answer the user query."
+    elif tool_choice == ToolChoice.none or tool_choice == "none":
+        return ""
+    else:
+        # Specific tool specified
+        return f"You MUST use the tool `{tool_choice}` to answer the user query."
+
+
+def _raw_content_as_str(content) -> str:
+    """Convert RawContent to string for system messages."""
+    if isinstance(content, str):
+        return content
+    elif isinstance(content, RawTextItem):
+        return content.text
+    elif isinstance(content, list):
+        return "\n".join(_raw_content_as_str(c) for c in content)
+    else:
+        return "<media>"
+
+
+def _augment_raw_messages_for_tools_llama_3_1(
+    raw_messages: list[RawMessage],
+    tools: list,
+    tool_choice,
+) -> list[RawMessage]:
+    """Augment raw messages with tool definitions for Llama 3.1 style models."""
+    messages = raw_messages.copy()
+    existing_system_message = None
+    if messages and messages[0].role == "system":
+        existing_system_message = messages.pop(0)
+
+    sys_content = ""
+
+    # Add tool definitions first (if present)
+    if tools:
+        # Convert OpenAI tools to ToolDefinitions
+        tool_definitions = [_convert_openai_tool_to_tool_definition(t) for t in tools]
+
+        # For OpenAI format, all tools are custom (have string names)
+        tool_gen = JsonCustomToolGenerator()
+        tool_template = tool_gen.gen(tool_definitions)
+        sys_content += tool_template.render()
+        sys_content += "\n"
+
+    # Add default system prompt
+    default_gen = SystemDefaultGenerator()
+    default_template = default_gen.gen()
+    sys_content += default_template.render()
+
+    # Add existing system message if present
+    if existing_system_message:
+        sys_content += "\n" + _raw_content_as_str(existing_system_message.content)
+
+    # Add tool choice prompt if needed
+    if tool_choice_prompt := _get_tool_choice_prompt(tool_choice, tools):
+        sys_content += "\n" + tool_choice_prompt
+
+    # Create new system message
+    new_system_message = RawMessage(
+        role="system",
+        content=[RawTextItem(text=sys_content.strip())],
+    )
+
+    return [new_system_message] + messages
+
+
+def _augment_raw_messages_for_tools_llama_4(
+    raw_messages: list[RawMessage],
+    tools: list,
+    tool_choice,
+) -> list[RawMessage]:
+    """Augment raw messages with tool definitions for Llama 4/3.2/3.3 style models."""
+    messages = raw_messages.copy()
+    existing_system_message = None
+    if messages and messages[0].role == "system":
+        existing_system_message = messages.pop(0)
+
+    sys_content = ""
+
+    # Add tool definitions if present
+    if tools:
+        # Convert OpenAI tools to ToolDefinitions
+        tool_definitions = [_convert_openai_tool_to_tool_definition(t) for t in tools]
+
+        # Use python_list format for Llama 4
+        tool_gen = PythonListCustomToolGeneratorLlama4()
+        system_prompt = None
+        if existing_system_message:
+            system_prompt = _raw_content_as_str(existing_system_message.content)
+
+        tool_template = tool_gen.gen(tool_definitions, system_prompt)
+        sys_content = tool_template.render()
+    elif existing_system_message:
+        # No tools, just use existing system message
+        sys_content = _raw_content_as_str(existing_system_message.content)
+
+    # Add tool choice prompt if needed
+    if tool_choice_prompt := _get_tool_choice_prompt(tool_choice, tools):
+        sys_content += "\n" + tool_choice_prompt
+
+    if sys_content:
+        new_system_message = RawMessage(
+            role="system",
+            content=[RawTextItem(text=sys_content.strip())],
+        )
+        return [new_system_message] + messages
+
+    return messages
+
+
+def augment_raw_messages_for_tools(
+    raw_messages: list[RawMessage],
+    params: OpenAIChatCompletionRequestWithExtraBody,
+    llama_model,
+) -> list[RawMessage]:
+    """Augment raw messages with tool definitions based on model family."""
+    if not params.tools:
+        return raw_messages
+
+    # Determine augmentation strategy based on model family
+    if llama_model.model_family == ModelFamily.llama3_1 or (
+        llama_model.model_family == ModelFamily.llama3_2 and is_multimodal(llama_model.core_model_id)
+    ):
+        # Llama 3.1 and Llama 3.2 multimodal use JSON format
+        return _augment_raw_messages_for_tools_llama_3_1(
+            raw_messages,
+            params.tools,
+            params.tool_choice,
+        )
+    elif llama_model.model_family in (
+        ModelFamily.llama3_2,
+        ModelFamily.llama3_3,
+        ModelFamily.llama4,
+    ):
+        # Llama 3.2/3.3/4 use python_list format
+        return _augment_raw_messages_for_tools_llama_4(
+            raw_messages,
+            params.tools,
+            params.tool_choice,
+        )
+    else:
+        # Default to Llama 3.1 style
+        return _augment_raw_messages_for_tools_llama_3_1(
+            raw_messages,
+            params.tools,
+            params.tool_choice,
+        )
+
+
 def llama_builder_fn(config: MetaReferenceInferenceConfig, model_id: str, llama_model: Model) -> LlamaGenerator:
     return LlamaGenerator(config, model_id, llama_model)
 
@@ -68,7 +246,7 @@ class MetaReferenceInferenceImpl(
     async def openai_completion(
         self,
         params: OpenAICompletionRequestWithExtraBody,
-    ) -> OpenAICompletion:
+    ) -> OpenAICompletion | AsyncIterator[OpenAICompletion]:
         raise NotImplementedError("OpenAI completion not supported by meta reference provider")
 
     async def should_refresh_models(self) -> bool:
@@ -136,17 +314,20 @@
         self.llama_model = llama_model
 
         log.info("Warming up...")
+
         await self.openai_chat_completion(
-            model=model_id,
-            messages=[{"role": "user", "content": "Hi how are you?"}],
-            max_tokens=20,
+            params=OpenAIChatCompletionRequestWithExtraBody(
+                model=model_id,
+                messages=[OpenAIUserMessageParam(role="user", content="Hi how are you?")],
+                max_tokens=20,
+            )
         )
         log.info("Warmed up!")
 
     def check_model(self, request) -> None:
         if self.model_id is None or self.llama_model is None:
             raise RuntimeError(
-                "No avaible model yet, please register your requested model or add your model in the resouces first"
+                "No available model yet, please register your requested model or add your model in the resources first"
             )
         elif request.model != self.model_id:
             raise RuntimeError(f"Model mismatch: request model: {request.model} != loaded model: {self.model_id}")
@@ -155,4 +336,207 @@
         self,
         params: OpenAIChatCompletionRequestWithExtraBody,
     ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
-        raise NotImplementedError("OpenAI chat completion not supported by meta-reference inference provider")
+        self.check_model(params)
+
+        # Convert OpenAI messages to RawMessages
+        from llama_stack.models.llama.datatypes import StopReason
+        from llama_stack.providers.utils.inference.prompt_adapter import (
+            convert_openai_message_to_raw_message,
+            decode_assistant_message,
+        )
+
+        raw_messages = [await convert_openai_message_to_raw_message(msg) for msg in params.messages]
+
+        # Augment messages with tool definitions if tools are present
+        raw_messages = augment_raw_messages_for_tools(raw_messages, params, self.llama_model)
+
+        # Call generator's chat_completion method (works for both single-GPU and model-parallel)
+        if isinstance(self.generator, LlamaGenerator):
+            generator = self.generator.chat_completion(params, raw_messages)
+        else:
+            # Model parallel: submit task to process group
+            generator = self.generator.group.run_inference(("chat_completion", [params, raw_messages]))
+
+        # Check if streaming is requested
+        if params.stream:
+            return self._stream_chat_completion(generator, params)
+
+        # Non-streaming: collect all generated text
+        generated_text = ""
+        for result_batch in generator:
+            for result in result_batch:
+                if not result.ignore_token and result.source == "output":
+                    generated_text += result.text
+
+        # Decode assistant message to extract tool calls and determine stop_reason
+        # Default to end_of_turn if generation completed normally
+        decoded_message = decode_assistant_message(generated_text, StopReason.end_of_turn)
+
+        # Convert tool calls to OpenAI format
+        openai_tool_calls = None
+        if decoded_message.tool_calls:
+            from llama_stack_api import (
+                OpenAIChatCompletionToolCall,
+                OpenAIChatCompletionToolCallFunction,
+            )
+
+            openai_tool_calls = [
+                OpenAIChatCompletionToolCall(
+                    # generate a uuid for the call id. This is the only inline provider that does this, so need to get creative.
+                    id=f"call_{uuid.uuid4().hex[:24]}",
+                    type="function",
+                    function=OpenAIChatCompletionToolCallFunction(
+                        name=str(tc.tool_name),
+                        arguments=tc.arguments,
+                    ),
+                )
+                for tc in decoded_message.tool_calls
+            ]
+
+        # Determine finish_reason based on whether tool calls are present
+        finish_reason = "tool_calls" if openai_tool_calls else "stop"
+
+        # Extract content from decoded message
+        content = ""
+        if isinstance(decoded_message.content, str):
+            content = decoded_message.content
+        elif isinstance(decoded_message.content, list):
+            for item in decoded_message.content:
+                if isinstance(item, RawTextItem):
+                    content += item.text
+
+        # Create OpenAI response
+        # generate a uuid for the call id. This is the only inline provider that does this, so need to get creative.
+        response_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
+        created = int(time.time())
+
+        return OpenAIChatCompletion(
+            id=response_id,
+            object="chat.completion",
+            created=created,
+            model=params.model,
+            choices=[
+                OpenAIChoice(
+                    index=0,
+                    message=OpenAIAssistantMessageParam(
+                        role="assistant",
+                        content=content,
+                        tool_calls=openai_tool_calls,
+                    ),
+                    finish_reason=finish_reason,
+                    logprobs=None,
+                )
+            ],
+            usage=OpenAIChatCompletionUsage(
+                prompt_tokens=0,  # TODO: calculate properly
+                completion_tokens=0,  # TODO: calculate properly
+                total_tokens=0,  # TODO: calculate properly
+            ),
+        )
+
+    async def _stream_chat_completion(
+        self,
+        generator,
+        params: OpenAIChatCompletionRequestWithExtraBody,
+    ) -> AsyncIterator[OpenAIChatCompletionChunk]:
+        """Stream chat completion chunks as they're generated."""
+        from llama_stack.models.llama.datatypes import StopReason
+        from llama_stack.providers.utils.inference.prompt_adapter import decode_assistant_message
+        from llama_stack_api import (
+            OpenAIChatCompletionChunk,
+            OpenAIChatCompletionToolCall,
+            OpenAIChatCompletionToolCallFunction,
+            OpenAIChoiceDelta,
+            OpenAIChunkChoice,
+        )
+
+        response_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
+        created = int(time.time())
+        generated_text = ""
+
+        # Yield chunks as tokens are generated
+        for result_batch in generator:
+            for result in result_batch:
+                if result.ignore_token or result.source != "output":
+                    continue
+
+                generated_text += result.text
+
+                # Yield delta chunk with the new text
+                chunk = OpenAIChatCompletionChunk(
+                    id=response_id,
+                    object="chat.completion.chunk",
+                    created=created,
+                    model=params.model,
+                    choices=[
+                        OpenAIChunkChoice(
+                            index=0,
+                            delta=OpenAIChoiceDelta(
+                                role="assistant",
+                                content=result.text,
+                            ),
+                            finish_reason="",
+                            logprobs=None,
+                        )
+                    ],
+                )
+                yield chunk
+
+        # After generation completes, decode the full message to extract tool calls
+        decoded_message = decode_assistant_message(generated_text, StopReason.end_of_turn)
+
+        # If tool calls are present, yield a final chunk with tool_calls
+        if decoded_message.tool_calls:
+            openai_tool_calls = [
+                OpenAIChatCompletionToolCall(
+                    # generate a uuid for the call id. This is the only inline provider that does this, so need to get creative.
+                    id=f"call_{uuid.uuid4().hex[:24]}",
+                    type="function",
+                    function=OpenAIChatCompletionToolCallFunction(
+                        name=str(tc.tool_name),
+                        arguments=tc.arguments,
+                    ),
+                )
+                for tc in decoded_message.tool_calls
+            ]
+
+            # Yield chunk with tool_calls
+            chunk = OpenAIChatCompletionChunk(
+                id=response_id,
+                object="chat.completion.chunk",
+                created=created,
+                model=params.model,
+                choices=[
+                    OpenAIChunkChoice(
+                        index=0,
+                        delta=OpenAIChoiceDelta(
+                            role="assistant",
+                            tool_calls=openai_tool_calls,
+                        ),
+                        finish_reason="",
+                        logprobs=None,
+                    )
+                ],
+            )
+            yield chunk
+
+            finish_reason = "tool_calls"
+        else:
+            finish_reason = "stop"
+
+        # Yield final chunk with finish_reason
+        final_chunk = OpenAIChatCompletionChunk(
+            id=response_id,
+            object="chat.completion.chunk",
+            created=created,
+            model=params.model,
+            choices=[
+                OpenAIChunkChoice(
+                    index=0,
+                    delta=OpenAIChoiceDelta(),
+                    finish_reason=finish_reason,
+                    logprobs=None,
+                )
+            ],
+        )
+        yield final_chunk
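
The MetaReferenceInferenceImpl hunks above wire the meta-reference provider into the OpenAI-style openai_chat_completion entry point, with both non-streaming and streaming paths. A minimal usage sketch follows, assuming an already-initialized provider instance (called impl here, which is an assumption) and an illustrative model id that must match the loaded model, as enforced by check_model.

from llama_stack_api import (
    OpenAIChatCompletionRequestWithExtraBody,
    OpenAIUserMessageParam,
)


async def demo(impl):
    # impl is assumed to be an initialized MetaReferenceInferenceImpl with a model loaded
    # Non-streaming: openai_chat_completion returns an OpenAIChatCompletion
    params = OpenAIChatCompletionRequestWithExtraBody(
        model="meta-llama/Llama-3.1-8B-Instruct",  # must equal the loaded model id
        messages=[OpenAIUserMessageParam(role="user", content="Hi how are you?")],
        max_tokens=20,
    )
    completion = await impl.openai_chat_completion(params=params)
    print(completion.choices[0].message.content)

    # Streaming: with stream=True it returns an async iterator of OpenAIChatCompletionChunk
    stream_params = OpenAIChatCompletionRequestWithExtraBody(
        model="meta-llama/Llama-3.1-8B-Instruct",
        messages=[OpenAIUserMessageParam(role="user", content="Tell me a short story.")],
        max_tokens=100,
        stream=True,
    )
    async for chunk in await impl.openai_chat_completion(params=stream_params):
        delta = chunk.choices[0].delta
        if delta.content:
            print(delta.content, end="")
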
@@ -4,17 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from collections.abc import Callable, Generator
-from copy import deepcopy
+from collections.abc import Callable
 from functools import partial
 from typing import Any
 
 from llama_stack.models.llama.llama3.chat_format import ChatFormat as Llama3ChatFormat
 from llama_stack.models.llama.llama4.chat_format import ChatFormat as Llama4ChatFormat
-from llama_stack.providers.utils.inference.prompt_adapter import (
-    ChatCompletionRequestWithRawContent,
-    CompletionRequestWithRawContent,
-)
 
 from .parallel_utils import ModelParallelProcessGroup
 
@@ -23,12 +18,14 @@ class ModelRunner:
     def __init__(self, llama):
         self.llama = llama
 
-    # the `task` object is the same that is sent to `ModelParallelProcessGroup.run_inference()`
     def __call__(self, task: Any):
-        if task[0] == "chat_completion":
-            return self.llama.chat_completion(task[1])
+        task_type = task[0]
+        if task_type == "chat_completion":
+            # task[1] is [params, raw_messages]
+            params, raw_messages = task[1]
+            return self.llama.chat_completion(params, raw_messages)
         else:
-            raise ValueError(f"Unexpected task type {task[0]}")
+            raise ValueError(f"Unexpected task type {task_type}")
 
 
 def init_model_cb(
@@ -78,19 +75,3 @@ class LlamaModelParallelGenerator:
 
     def __exit__(self, exc_type, exc_value, exc_traceback):
        self.group.stop()
-
-    def completion(
-        self,
-        request_batch: list[CompletionRequestWithRawContent],
-    ) -> Generator:
-        req_obj = deepcopy(request_batch)
-        gen = self.group.run_inference(("completion", req_obj))
-        yield from gen
-
-    def chat_completion(
-        self,
-        request_batch: list[ChatCompletionRequestWithRawContent],
-    ) -> Generator:
-        req_obj = deepcopy(request_batch)
-        gen = self.group.run_inference(("chat_completion", req_obj))
-        yield from gen