llama-stack 0.3.5__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (460)
  1. llama_stack/__init__.py +0 -5
  2. llama_stack/cli/llama.py +3 -3
  3. llama_stack/cli/stack/_list_deps.py +12 -23
  4. llama_stack/cli/stack/list_stacks.py +37 -18
  5. llama_stack/cli/stack/run.py +121 -11
  6. llama_stack/cli/stack/utils.py +0 -127
  7. llama_stack/core/access_control/access_control.py +69 -28
  8. llama_stack/core/access_control/conditions.py +15 -5
  9. llama_stack/core/admin.py +267 -0
  10. llama_stack/core/build.py +6 -74
  11. llama_stack/core/client.py +1 -1
  12. llama_stack/core/configure.py +6 -6
  13. llama_stack/core/conversations/conversations.py +28 -25
  14. llama_stack/core/datatypes.py +271 -79
  15. llama_stack/core/distribution.py +15 -16
  16. llama_stack/core/external.py +3 -3
  17. llama_stack/core/inspect.py +98 -15
  18. llama_stack/core/library_client.py +73 -61
  19. llama_stack/core/prompts/prompts.py +12 -11
  20. llama_stack/core/providers.py +17 -11
  21. llama_stack/core/resolver.py +65 -56
  22. llama_stack/core/routers/__init__.py +8 -12
  23. llama_stack/core/routers/datasets.py +1 -4
  24. llama_stack/core/routers/eval_scoring.py +7 -4
  25. llama_stack/core/routers/inference.py +55 -271
  26. llama_stack/core/routers/safety.py +52 -24
  27. llama_stack/core/routers/tool_runtime.py +6 -48
  28. llama_stack/core/routers/vector_io.py +130 -51
  29. llama_stack/core/routing_tables/benchmarks.py +24 -20
  30. llama_stack/core/routing_tables/common.py +1 -4
  31. llama_stack/core/routing_tables/datasets.py +22 -22
  32. llama_stack/core/routing_tables/models.py +119 -6
  33. llama_stack/core/routing_tables/scoring_functions.py +7 -7
  34. llama_stack/core/routing_tables/shields.py +1 -2
  35. llama_stack/core/routing_tables/toolgroups.py +17 -7
  36. llama_stack/core/routing_tables/vector_stores.py +51 -16
  37. llama_stack/core/server/auth.py +5 -3
  38. llama_stack/core/server/auth_providers.py +36 -20
  39. llama_stack/core/server/fastapi_router_registry.py +84 -0
  40. llama_stack/core/server/quota.py +2 -2
  41. llama_stack/core/server/routes.py +79 -27
  42. llama_stack/core/server/server.py +102 -87
  43. llama_stack/core/stack.py +235 -62
  44. llama_stack/core/storage/datatypes.py +26 -3
  45. llama_stack/{providers/utils → core/storage}/kvstore/__init__.py +2 -0
  46. llama_stack/{providers/utils → core/storage}/kvstore/kvstore.py +55 -24
  47. llama_stack/{providers/utils → core/storage}/kvstore/mongodb/mongodb.py +13 -10
  48. llama_stack/{providers/utils → core/storage}/kvstore/postgres/postgres.py +28 -17
  49. llama_stack/{providers/utils → core/storage}/kvstore/redis/redis.py +41 -16
  50. llama_stack/{providers/utils → core/storage}/kvstore/sqlite/sqlite.py +1 -1
  51. llama_stack/core/storage/sqlstore/__init__.py +17 -0
  52. llama_stack/{providers/utils → core/storage}/sqlstore/authorized_sqlstore.py +69 -49
  53. llama_stack/{providers/utils → core/storage}/sqlstore/sqlalchemy_sqlstore.py +47 -17
  54. llama_stack/{providers/utils → core/storage}/sqlstore/sqlstore.py +25 -8
  55. llama_stack/core/store/registry.py +1 -1
  56. llama_stack/core/utils/config.py +8 -2
  57. llama_stack/core/utils/config_resolution.py +32 -29
  58. llama_stack/core/utils/context.py +4 -10
  59. llama_stack/core/utils/exec.py +9 -0
  60. llama_stack/core/utils/type_inspection.py +45 -0
  61. llama_stack/distributions/dell/{run.yaml → config.yaml} +3 -2
  62. llama_stack/distributions/dell/dell.py +2 -2
  63. llama_stack/distributions/dell/run-with-safety.yaml +3 -2
  64. llama_stack/distributions/meta-reference-gpu/{run.yaml → config.yaml} +3 -2
  65. llama_stack/distributions/meta-reference-gpu/meta_reference.py +2 -2
  66. llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +3 -2
  67. llama_stack/distributions/nvidia/{run.yaml → config.yaml} +4 -4
  68. llama_stack/distributions/nvidia/nvidia.py +1 -1
  69. llama_stack/distributions/nvidia/run-with-safety.yaml +4 -4
  70. llama_stack/{apis/datasetio → distributions/oci}/__init__.py +1 -1
  71. llama_stack/distributions/oci/config.yaml +134 -0
  72. llama_stack/distributions/oci/oci.py +108 -0
  73. llama_stack/distributions/open-benchmark/{run.yaml → config.yaml} +5 -4
  74. llama_stack/distributions/open-benchmark/open_benchmark.py +2 -3
  75. llama_stack/distributions/postgres-demo/{run.yaml → config.yaml} +4 -3
  76. llama_stack/distributions/starter/{run.yaml → config.yaml} +64 -13
  77. llama_stack/distributions/starter/run-with-postgres-store.yaml +64 -13
  78. llama_stack/distributions/starter/starter.py +8 -5
  79. llama_stack/distributions/starter-gpu/{run.yaml → config.yaml} +64 -13
  80. llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +64 -13
  81. llama_stack/distributions/template.py +13 -69
  82. llama_stack/distributions/watsonx/{run.yaml → config.yaml} +4 -3
  83. llama_stack/distributions/watsonx/watsonx.py +1 -1
  84. llama_stack/log.py +28 -11
  85. llama_stack/models/llama/checkpoint.py +6 -6
  86. llama_stack/models/llama/hadamard_utils.py +2 -0
  87. llama_stack/models/llama/llama3/generation.py +3 -1
  88. llama_stack/models/llama/llama3/interface.py +2 -5
  89. llama_stack/models/llama/llama3/multimodal/encoder_utils.py +3 -3
  90. llama_stack/models/llama/llama3/multimodal/image_transform.py +6 -6
  91. llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +1 -1
  92. llama_stack/models/llama/llama3/tool_utils.py +2 -1
  93. llama_stack/models/llama/llama4/prompt_templates/system_prompts.py +1 -1
  94. llama_stack/providers/inline/agents/meta_reference/__init__.py +3 -3
  95. llama_stack/providers/inline/agents/meta_reference/agents.py +44 -261
  96. llama_stack/providers/inline/agents/meta_reference/config.py +6 -1
  97. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +207 -57
  98. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +308 -47
  99. llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +162 -96
  100. llama_stack/providers/inline/agents/meta_reference/responses/types.py +23 -8
  101. llama_stack/providers/inline/agents/meta_reference/responses/utils.py +201 -33
  102. llama_stack/providers/inline/agents/meta_reference/safety.py +8 -13
  103. llama_stack/providers/inline/batches/reference/__init__.py +2 -4
  104. llama_stack/providers/inline/batches/reference/batches.py +78 -60
  105. llama_stack/providers/inline/datasetio/localfs/datasetio.py +2 -5
  106. llama_stack/providers/inline/eval/meta_reference/eval.py +16 -61
  107. llama_stack/providers/inline/files/localfs/files.py +37 -28
  108. llama_stack/providers/inline/inference/meta_reference/config.py +2 -2
  109. llama_stack/providers/inline/inference/meta_reference/generators.py +50 -60
  110. llama_stack/providers/inline/inference/meta_reference/inference.py +403 -19
  111. llama_stack/providers/inline/inference/meta_reference/model_parallel.py +7 -26
  112. llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +2 -12
  113. llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +10 -15
  114. llama_stack/providers/inline/post_training/common/validator.py +1 -5
  115. llama_stack/providers/inline/post_training/huggingface/post_training.py +8 -8
  116. llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py +18 -10
  117. llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device_dpo.py +12 -9
  118. llama_stack/providers/inline/post_training/huggingface/utils.py +27 -6
  119. llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py +1 -1
  120. llama_stack/providers/inline/post_training/torchtune/common/utils.py +1 -1
  121. llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py +1 -1
  122. llama_stack/providers/inline/post_training/torchtune/post_training.py +8 -8
  123. llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +16 -16
  124. llama_stack/providers/inline/safety/code_scanner/code_scanner.py +13 -9
  125. llama_stack/providers/inline/safety/llama_guard/llama_guard.py +18 -15
  126. llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +9 -9
  127. llama_stack/providers/inline/scoring/basic/scoring.py +6 -13
  128. llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py +1 -2
  129. llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py +1 -2
  130. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py +2 -2
  131. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py +2 -2
  132. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py +2 -2
  133. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py +2 -2
  134. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py +2 -2
  135. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py +2 -2
  136. llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py +1 -2
  137. llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py +1 -2
  138. llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py +1 -2
  139. llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py +1 -2
  140. llama_stack/providers/inline/scoring/braintrust/braintrust.py +12 -15
  141. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py +2 -2
  142. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py +2 -2
  143. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py +2 -2
  144. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py +2 -2
  145. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py +2 -2
  146. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py +2 -2
  147. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py +2 -2
  148. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py +2 -2
  149. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py +2 -2
  150. llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +7 -14
  151. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py +2 -2
  152. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py +1 -2
  153. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py +1 -3
  154. llama_stack/providers/inline/tool_runtime/rag/__init__.py +1 -1
  155. llama_stack/providers/inline/tool_runtime/rag/config.py +8 -1
  156. llama_stack/providers/inline/tool_runtime/rag/context_retriever.py +7 -6
  157. llama_stack/providers/inline/tool_runtime/rag/memory.py +64 -48
  158. llama_stack/providers/inline/vector_io/chroma/__init__.py +1 -1
  159. llama_stack/providers/inline/vector_io/chroma/config.py +1 -1
  160. llama_stack/providers/inline/vector_io/faiss/__init__.py +1 -1
  161. llama_stack/providers/inline/vector_io/faiss/config.py +1 -1
  162. llama_stack/providers/inline/vector_io/faiss/faiss.py +46 -28
  163. llama_stack/providers/inline/vector_io/milvus/__init__.py +1 -1
  164. llama_stack/providers/inline/vector_io/milvus/config.py +1 -1
  165. llama_stack/providers/inline/vector_io/qdrant/__init__.py +1 -1
  166. llama_stack/providers/inline/vector_io/qdrant/config.py +1 -1
  167. llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py +1 -1
  168. llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +44 -33
  169. llama_stack/providers/registry/agents.py +8 -3
  170. llama_stack/providers/registry/batches.py +1 -1
  171. llama_stack/providers/registry/datasetio.py +1 -1
  172. llama_stack/providers/registry/eval.py +1 -1
  173. llama_stack/{apis/datasets/__init__.py → providers/registry/file_processors.py} +5 -1
  174. llama_stack/providers/registry/files.py +11 -2
  175. llama_stack/providers/registry/inference.py +22 -3
  176. llama_stack/providers/registry/post_training.py +1 -1
  177. llama_stack/providers/registry/safety.py +1 -1
  178. llama_stack/providers/registry/scoring.py +1 -1
  179. llama_stack/providers/registry/tool_runtime.py +2 -2
  180. llama_stack/providers/registry/vector_io.py +7 -7
  181. llama_stack/providers/remote/datasetio/huggingface/huggingface.py +2 -5
  182. llama_stack/providers/remote/datasetio/nvidia/datasetio.py +1 -4
  183. llama_stack/providers/remote/eval/nvidia/eval.py +15 -9
  184. llama_stack/providers/remote/files/openai/__init__.py +19 -0
  185. llama_stack/providers/remote/files/openai/config.py +28 -0
  186. llama_stack/providers/remote/files/openai/files.py +253 -0
  187. llama_stack/providers/remote/files/s3/files.py +52 -30
  188. llama_stack/providers/remote/inference/anthropic/anthropic.py +2 -1
  189. llama_stack/providers/remote/inference/anthropic/config.py +1 -1
  190. llama_stack/providers/remote/inference/azure/azure.py +1 -3
  191. llama_stack/providers/remote/inference/azure/config.py +8 -7
  192. llama_stack/providers/remote/inference/bedrock/__init__.py +1 -1
  193. llama_stack/providers/remote/inference/bedrock/bedrock.py +82 -105
  194. llama_stack/providers/remote/inference/bedrock/config.py +24 -3
  195. llama_stack/providers/remote/inference/cerebras/cerebras.py +5 -5
  196. llama_stack/providers/remote/inference/cerebras/config.py +12 -5
  197. llama_stack/providers/remote/inference/databricks/config.py +13 -6
  198. llama_stack/providers/remote/inference/databricks/databricks.py +16 -6
  199. llama_stack/providers/remote/inference/fireworks/config.py +5 -5
  200. llama_stack/providers/remote/inference/fireworks/fireworks.py +1 -1
  201. llama_stack/providers/remote/inference/gemini/config.py +1 -1
  202. llama_stack/providers/remote/inference/gemini/gemini.py +13 -14
  203. llama_stack/providers/remote/inference/groq/config.py +5 -5
  204. llama_stack/providers/remote/inference/groq/groq.py +1 -1
  205. llama_stack/providers/remote/inference/llama_openai_compat/config.py +5 -5
  206. llama_stack/providers/remote/inference/llama_openai_compat/llama.py +8 -6
  207. llama_stack/providers/remote/inference/nvidia/__init__.py +1 -1
  208. llama_stack/providers/remote/inference/nvidia/config.py +21 -11
  209. llama_stack/providers/remote/inference/nvidia/nvidia.py +115 -3
  210. llama_stack/providers/remote/inference/nvidia/utils.py +1 -1
  211. llama_stack/providers/remote/inference/oci/__init__.py +17 -0
  212. llama_stack/providers/remote/inference/oci/auth.py +79 -0
  213. llama_stack/providers/remote/inference/oci/config.py +75 -0
  214. llama_stack/providers/remote/inference/oci/oci.py +162 -0
  215. llama_stack/providers/remote/inference/ollama/config.py +7 -5
  216. llama_stack/providers/remote/inference/ollama/ollama.py +17 -8
  217. llama_stack/providers/remote/inference/openai/config.py +4 -4
  218. llama_stack/providers/remote/inference/openai/openai.py +1 -1
  219. llama_stack/providers/remote/inference/passthrough/__init__.py +2 -2
  220. llama_stack/providers/remote/inference/passthrough/config.py +5 -10
  221. llama_stack/providers/remote/inference/passthrough/passthrough.py +97 -75
  222. llama_stack/providers/remote/inference/runpod/config.py +12 -5
  223. llama_stack/providers/remote/inference/runpod/runpod.py +2 -20
  224. llama_stack/providers/remote/inference/sambanova/config.py +5 -5
  225. llama_stack/providers/remote/inference/sambanova/sambanova.py +1 -1
  226. llama_stack/providers/remote/inference/tgi/config.py +7 -6
  227. llama_stack/providers/remote/inference/tgi/tgi.py +19 -11
  228. llama_stack/providers/remote/inference/together/config.py +5 -5
  229. llama_stack/providers/remote/inference/together/together.py +15 -12
  230. llama_stack/providers/remote/inference/vertexai/config.py +1 -1
  231. llama_stack/providers/remote/inference/vllm/config.py +5 -5
  232. llama_stack/providers/remote/inference/vllm/vllm.py +13 -14
  233. llama_stack/providers/remote/inference/watsonx/config.py +4 -4
  234. llama_stack/providers/remote/inference/watsonx/watsonx.py +21 -94
  235. llama_stack/providers/remote/post_training/nvidia/post_training.py +4 -4
  236. llama_stack/providers/remote/post_training/nvidia/utils.py +1 -1
  237. llama_stack/providers/remote/safety/bedrock/bedrock.py +6 -6
  238. llama_stack/providers/remote/safety/bedrock/config.py +1 -1
  239. llama_stack/providers/remote/safety/nvidia/config.py +1 -1
  240. llama_stack/providers/remote/safety/nvidia/nvidia.py +11 -5
  241. llama_stack/providers/remote/safety/sambanova/config.py +1 -1
  242. llama_stack/providers/remote/safety/sambanova/sambanova.py +6 -6
  243. llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py +11 -6
  244. llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py +12 -7
  245. llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py +8 -2
  246. llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py +57 -15
  247. llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py +11 -6
  248. llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py +11 -6
  249. llama_stack/providers/remote/vector_io/chroma/__init__.py +1 -1
  250. llama_stack/providers/remote/vector_io/chroma/chroma.py +131 -23
  251. llama_stack/providers/remote/vector_io/chroma/config.py +1 -1
  252. llama_stack/providers/remote/vector_io/milvus/__init__.py +1 -1
  253. llama_stack/providers/remote/vector_io/milvus/config.py +1 -1
  254. llama_stack/providers/remote/vector_io/milvus/milvus.py +37 -28
  255. llama_stack/providers/remote/vector_io/pgvector/__init__.py +1 -1
  256. llama_stack/providers/remote/vector_io/pgvector/config.py +1 -1
  257. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +37 -25
  258. llama_stack/providers/remote/vector_io/qdrant/__init__.py +1 -1
  259. llama_stack/providers/remote/vector_io/qdrant/config.py +1 -1
  260. llama_stack/providers/remote/vector_io/qdrant/qdrant.py +147 -30
  261. llama_stack/providers/remote/vector_io/weaviate/__init__.py +1 -1
  262. llama_stack/providers/remote/vector_io/weaviate/config.py +1 -1
  263. llama_stack/providers/remote/vector_io/weaviate/weaviate.py +31 -26
  264. llama_stack/providers/utils/common/data_schema_validator.py +1 -5
  265. llama_stack/providers/utils/files/form_data.py +1 -1
  266. llama_stack/providers/utils/inference/embedding_mixin.py +1 -1
  267. llama_stack/providers/utils/inference/inference_store.py +7 -8
  268. llama_stack/providers/utils/inference/litellm_openai_mixin.py +79 -79
  269. llama_stack/providers/utils/inference/model_registry.py +1 -3
  270. llama_stack/providers/utils/inference/openai_compat.py +44 -1171
  271. llama_stack/providers/utils/inference/openai_mixin.py +68 -42
  272. llama_stack/providers/utils/inference/prompt_adapter.py +50 -265
  273. llama_stack/providers/utils/inference/stream_utils.py +23 -0
  274. llama_stack/providers/utils/memory/__init__.py +2 -0
  275. llama_stack/providers/utils/memory/file_utils.py +1 -1
  276. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +181 -84
  277. llama_stack/providers/utils/memory/vector_store.py +39 -38
  278. llama_stack/providers/utils/pagination.py +1 -1
  279. llama_stack/providers/utils/responses/responses_store.py +15 -25
  280. llama_stack/providers/utils/scoring/aggregation_utils.py +1 -2
  281. llama_stack/providers/utils/scoring/base_scoring_fn.py +1 -2
  282. llama_stack/providers/utils/tools/mcp.py +93 -11
  283. llama_stack/providers/utils/vector_io/__init__.py +16 -0
  284. llama_stack/providers/utils/vector_io/vector_utils.py +36 -0
  285. llama_stack/telemetry/constants.py +27 -0
  286. llama_stack/telemetry/helpers.py +43 -0
  287. llama_stack/testing/api_recorder.py +25 -16
  288. {llama_stack-0.3.5.dist-info → llama_stack-0.4.1.dist-info}/METADATA +57 -55
  289. llama_stack-0.4.1.dist-info/RECORD +588 -0
  290. llama_stack-0.4.1.dist-info/top_level.txt +2 -0
  291. llama_stack_api/__init__.py +945 -0
  292. llama_stack_api/admin/__init__.py +45 -0
  293. llama_stack_api/admin/api.py +72 -0
  294. llama_stack_api/admin/fastapi_routes.py +117 -0
  295. llama_stack_api/admin/models.py +113 -0
  296. llama_stack_api/agents.py +173 -0
  297. llama_stack_api/batches/__init__.py +40 -0
  298. llama_stack_api/batches/api.py +53 -0
  299. llama_stack_api/batches/fastapi_routes.py +113 -0
  300. llama_stack_api/batches/models.py +78 -0
  301. llama_stack_api/benchmarks/__init__.py +43 -0
  302. llama_stack_api/benchmarks/api.py +39 -0
  303. llama_stack_api/benchmarks/fastapi_routes.py +109 -0
  304. llama_stack_api/benchmarks/models.py +109 -0
  305. {llama_stack/apis → llama_stack_api}/common/content_types.py +1 -43
  306. {llama_stack/apis → llama_stack_api}/common/errors.py +0 -8
  307. {llama_stack/apis → llama_stack_api}/common/job_types.py +1 -1
  308. llama_stack_api/common/responses.py +77 -0
  309. {llama_stack/apis → llama_stack_api}/common/training_types.py +1 -1
  310. {llama_stack/apis → llama_stack_api}/common/type_system.py +2 -14
  311. llama_stack_api/connectors.py +146 -0
  312. {llama_stack/apis/conversations → llama_stack_api}/conversations.py +23 -39
  313. {llama_stack/apis/datasetio → llama_stack_api}/datasetio.py +4 -8
  314. llama_stack_api/datasets/__init__.py +61 -0
  315. llama_stack_api/datasets/api.py +35 -0
  316. llama_stack_api/datasets/fastapi_routes.py +104 -0
  317. llama_stack_api/datasets/models.py +152 -0
  318. {llama_stack/providers → llama_stack_api}/datatypes.py +166 -10
  319. {llama_stack/apis/eval → llama_stack_api}/eval.py +8 -40
  320. llama_stack_api/file_processors/__init__.py +27 -0
  321. llama_stack_api/file_processors/api.py +64 -0
  322. llama_stack_api/file_processors/fastapi_routes.py +78 -0
  323. llama_stack_api/file_processors/models.py +42 -0
  324. llama_stack_api/files/__init__.py +35 -0
  325. llama_stack_api/files/api.py +51 -0
  326. llama_stack_api/files/fastapi_routes.py +124 -0
  327. llama_stack_api/files/models.py +107 -0
  328. {llama_stack/apis/inference → llama_stack_api}/inference.py +90 -194
  329. llama_stack_api/inspect_api/__init__.py +37 -0
  330. llama_stack_api/inspect_api/api.py +25 -0
  331. llama_stack_api/inspect_api/fastapi_routes.py +76 -0
  332. llama_stack_api/inspect_api/models.py +28 -0
  333. {llama_stack/apis/agents → llama_stack_api/internal}/__init__.py +3 -1
  334. llama_stack/providers/utils/kvstore/api.py → llama_stack_api/internal/kvstore.py +5 -0
  335. llama_stack_api/internal/sqlstore.py +79 -0
  336. {llama_stack/apis/models → llama_stack_api}/models.py +11 -9
  337. {llama_stack/apis/agents → llama_stack_api}/openai_responses.py +184 -27
  338. {llama_stack/apis/post_training → llama_stack_api}/post_training.py +7 -11
  339. {llama_stack/apis/prompts → llama_stack_api}/prompts.py +3 -4
  340. llama_stack_api/providers/__init__.py +33 -0
  341. llama_stack_api/providers/api.py +16 -0
  342. llama_stack_api/providers/fastapi_routes.py +57 -0
  343. llama_stack_api/providers/models.py +24 -0
  344. {llama_stack/apis/tools → llama_stack_api}/rag_tool.py +2 -52
  345. {llama_stack/apis → llama_stack_api}/resource.py +1 -1
  346. llama_stack_api/router_utils.py +160 -0
  347. {llama_stack/apis/safety → llama_stack_api}/safety.py +6 -9
  348. {llama_stack → llama_stack_api}/schema_utils.py +94 -4
  349. {llama_stack/apis/scoring → llama_stack_api}/scoring.py +3 -3
  350. {llama_stack/apis/scoring_functions → llama_stack_api}/scoring_functions.py +9 -6
  351. {llama_stack/apis/shields → llama_stack_api}/shields.py +6 -7
  352. {llama_stack/apis/tools → llama_stack_api}/tools.py +26 -21
  353. {llama_stack/apis/vector_io → llama_stack_api}/vector_io.py +133 -152
  354. {llama_stack/apis/vector_stores → llama_stack_api}/vector_stores.py +1 -1
  355. llama_stack/apis/agents/agents.py +0 -894
  356. llama_stack/apis/batches/__init__.py +0 -9
  357. llama_stack/apis/batches/batches.py +0 -100
  358. llama_stack/apis/benchmarks/__init__.py +0 -7
  359. llama_stack/apis/benchmarks/benchmarks.py +0 -108
  360. llama_stack/apis/common/responses.py +0 -36
  361. llama_stack/apis/conversations/__init__.py +0 -31
  362. llama_stack/apis/datasets/datasets.py +0 -251
  363. llama_stack/apis/datatypes.py +0 -160
  364. llama_stack/apis/eval/__init__.py +0 -7
  365. llama_stack/apis/files/__init__.py +0 -7
  366. llama_stack/apis/files/files.py +0 -199
  367. llama_stack/apis/inference/__init__.py +0 -7
  368. llama_stack/apis/inference/event_logger.py +0 -43
  369. llama_stack/apis/inspect/__init__.py +0 -7
  370. llama_stack/apis/inspect/inspect.py +0 -94
  371. llama_stack/apis/models/__init__.py +0 -7
  372. llama_stack/apis/post_training/__init__.py +0 -7
  373. llama_stack/apis/prompts/__init__.py +0 -9
  374. llama_stack/apis/providers/__init__.py +0 -7
  375. llama_stack/apis/providers/providers.py +0 -69
  376. llama_stack/apis/safety/__init__.py +0 -7
  377. llama_stack/apis/scoring/__init__.py +0 -7
  378. llama_stack/apis/scoring_functions/__init__.py +0 -7
  379. llama_stack/apis/shields/__init__.py +0 -7
  380. llama_stack/apis/synthetic_data_generation/__init__.py +0 -7
  381. llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py +0 -77
  382. llama_stack/apis/telemetry/__init__.py +0 -7
  383. llama_stack/apis/telemetry/telemetry.py +0 -423
  384. llama_stack/apis/tools/__init__.py +0 -8
  385. llama_stack/apis/vector_io/__init__.py +0 -7
  386. llama_stack/apis/vector_stores/__init__.py +0 -7
  387. llama_stack/core/server/tracing.py +0 -80
  388. llama_stack/core/ui/app.py +0 -55
  389. llama_stack/core/ui/modules/__init__.py +0 -5
  390. llama_stack/core/ui/modules/api.py +0 -32
  391. llama_stack/core/ui/modules/utils.py +0 -42
  392. llama_stack/core/ui/page/__init__.py +0 -5
  393. llama_stack/core/ui/page/distribution/__init__.py +0 -5
  394. llama_stack/core/ui/page/distribution/datasets.py +0 -18
  395. llama_stack/core/ui/page/distribution/eval_tasks.py +0 -20
  396. llama_stack/core/ui/page/distribution/models.py +0 -18
  397. llama_stack/core/ui/page/distribution/providers.py +0 -27
  398. llama_stack/core/ui/page/distribution/resources.py +0 -48
  399. llama_stack/core/ui/page/distribution/scoring_functions.py +0 -18
  400. llama_stack/core/ui/page/distribution/shields.py +0 -19
  401. llama_stack/core/ui/page/evaluations/__init__.py +0 -5
  402. llama_stack/core/ui/page/evaluations/app_eval.py +0 -143
  403. llama_stack/core/ui/page/evaluations/native_eval.py +0 -253
  404. llama_stack/core/ui/page/playground/__init__.py +0 -5
  405. llama_stack/core/ui/page/playground/chat.py +0 -130
  406. llama_stack/core/ui/page/playground/tools.py +0 -352
  407. llama_stack/distributions/dell/build.yaml +0 -33
  408. llama_stack/distributions/meta-reference-gpu/build.yaml +0 -32
  409. llama_stack/distributions/nvidia/build.yaml +0 -29
  410. llama_stack/distributions/open-benchmark/build.yaml +0 -36
  411. llama_stack/distributions/postgres-demo/__init__.py +0 -7
  412. llama_stack/distributions/postgres-demo/build.yaml +0 -23
  413. llama_stack/distributions/postgres-demo/postgres_demo.py +0 -125
  414. llama_stack/distributions/starter/build.yaml +0 -61
  415. llama_stack/distributions/starter-gpu/build.yaml +0 -61
  416. llama_stack/distributions/watsonx/build.yaml +0 -33
  417. llama_stack/providers/inline/agents/meta_reference/agent_instance.py +0 -1024
  418. llama_stack/providers/inline/agents/meta_reference/persistence.py +0 -228
  419. llama_stack/providers/inline/telemetry/__init__.py +0 -5
  420. llama_stack/providers/inline/telemetry/meta_reference/__init__.py +0 -21
  421. llama_stack/providers/inline/telemetry/meta_reference/config.py +0 -47
  422. llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +0 -252
  423. llama_stack/providers/remote/inference/bedrock/models.py +0 -29
  424. llama_stack/providers/utils/kvstore/sqlite/config.py +0 -20
  425. llama_stack/providers/utils/sqlstore/__init__.py +0 -5
  426. llama_stack/providers/utils/sqlstore/api.py +0 -128
  427. llama_stack/providers/utils/telemetry/__init__.py +0 -5
  428. llama_stack/providers/utils/telemetry/trace_protocol.py +0 -142
  429. llama_stack/providers/utils/telemetry/tracing.py +0 -384
  430. llama_stack/strong_typing/__init__.py +0 -19
  431. llama_stack/strong_typing/auxiliary.py +0 -228
  432. llama_stack/strong_typing/classdef.py +0 -440
  433. llama_stack/strong_typing/core.py +0 -46
  434. llama_stack/strong_typing/deserializer.py +0 -877
  435. llama_stack/strong_typing/docstring.py +0 -409
  436. llama_stack/strong_typing/exception.py +0 -23
  437. llama_stack/strong_typing/inspection.py +0 -1085
  438. llama_stack/strong_typing/mapping.py +0 -40
  439. llama_stack/strong_typing/name.py +0 -182
  440. llama_stack/strong_typing/schema.py +0 -792
  441. llama_stack/strong_typing/serialization.py +0 -97
  442. llama_stack/strong_typing/serializer.py +0 -500
  443. llama_stack/strong_typing/slots.py +0 -27
  444. llama_stack/strong_typing/topological.py +0 -89
  445. llama_stack/ui/node_modules/flatted/python/flatted.py +0 -149
  446. llama_stack-0.3.5.dist-info/RECORD +0 -625
  447. llama_stack-0.3.5.dist-info/top_level.txt +0 -1
  448. /llama_stack/{providers/utils → core/storage}/kvstore/config.py +0 -0
  449. /llama_stack/{providers/utils → core/storage}/kvstore/mongodb/__init__.py +0 -0
  450. /llama_stack/{providers/utils → core/storage}/kvstore/postgres/__init__.py +0 -0
  451. /llama_stack/{providers/utils → core/storage}/kvstore/redis/__init__.py +0 -0
  452. /llama_stack/{providers/utils → core/storage}/kvstore/sqlite/__init__.py +0 -0
  453. /llama_stack/{apis → providers/inline/file_processor}/__init__.py +0 -0
  454. /llama_stack/{apis/common → telemetry}/__init__.py +0 -0
  455. {llama_stack-0.3.5.dist-info → llama_stack-0.4.1.dist-info}/WHEEL +0 -0
  456. {llama_stack-0.3.5.dist-info → llama_stack-0.4.1.dist-info}/entry_points.txt +0 -0
  457. {llama_stack-0.3.5.dist-info → llama_stack-0.4.1.dist-info}/licenses/LICENSE +0 -0
  458. {llama_stack/core/ui → llama_stack_api/common}/__init__.py +0 -0
  459. {llama_stack/strong_typing → llama_stack_api}/py.typed +0 -0
  460. {llama_stack/apis → llama_stack_api}/version.py +0 -0
llama_stack/core/routers/inference.py
@@ -6,8 +6,7 @@
 
 import asyncio
 import time
-from collections.abc import AsyncGenerator, AsyncIterator
-from datetime import UTC, datetime
+from collections.abc import AsyncIterator
 from typing import Annotated, Any
 
 from fastapi import Body
@@ -15,23 +14,24 @@ from openai.types.chat import ChatCompletionToolChoiceOptionParam as OpenAIChatC
 from openai.types.chat import ChatCompletionToolParam as OpenAIChatCompletionToolParam
 from pydantic import TypeAdapter
 
-from llama_stack.apis.common.content_types import (
-    InterleavedContent,
-)
-from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError
-from llama_stack.apis.inference import (
-    ChatCompletionResponse,
-    ChatCompletionResponseEventType,
-    ChatCompletionResponseStreamChunk,
-    CompletionMessage,
-    CompletionResponse,
-    CompletionResponseStreamChunk,
+from llama_stack.core.access_control.access_control import is_action_allowed
+from llama_stack.core.datatypes import ModelWithOwner
+from llama_stack.core.request_headers import get_authenticated_user
+from llama_stack.log import get_logger
+from llama_stack.providers.utils.inference.inference_store import InferenceStore
+from llama_stack_api import (
+    HealthResponse,
+    HealthStatus,
     Inference,
     ListOpenAIChatCompletionResponse,
-    Message,
+    ModelNotFoundError,
+    ModelType,
+    ModelTypeError,
     OpenAIAssistantMessageParam,
     OpenAIChatCompletion,
     OpenAIChatCompletionChunk,
+    OpenAIChatCompletionContentPartImageParam,
+    OpenAIChatCompletionContentPartTextParam,
     OpenAIChatCompletionRequestWithExtraBody,
     OpenAIChatCompletionToolCall,
     OpenAIChatCompletionToolCallFunction,
@@ -43,25 +43,12 @@ from llama_stack.apis.inference import (
     OpenAIEmbeddingsRequestWithExtraBody,
     OpenAIEmbeddingsResponse,
     OpenAIMessageParam,
+    OpenAITokenLogProb,
+    OpenAITopLogProb,
     Order,
-    StopReason,
-    ToolPromptFormat,
-)
-from llama_stack.apis.models import Model, ModelType
-from llama_stack.apis.telemetry import MetricEvent, MetricInResponse, Telemetry
-from llama_stack.core.access_control.access_control import is_action_allowed
-from llama_stack.core.datatypes import ModelWithOwner
-from llama_stack.core.request_headers import get_authenticated_user
-from llama_stack.log import get_logger
-from llama_stack.models.llama.llama3.chat_format import ChatFormat
-from llama_stack.models.llama.llama3.tokenizer import Tokenizer
-from llama_stack.providers.datatypes import (
-    HealthResponse,
-    HealthStatus,
+    RerankResponse,
     RoutingTable,
 )
-from llama_stack.providers.utils.inference.inference_store import InferenceStore
-from llama_stack.providers.utils.telemetry.tracing import enqueue_event, get_current_span
 
 logger = get_logger(name=__name__, category="core::routers")
 
@@ -72,16 +59,11 @@ class InferenceRouter(Inference):
     def __init__(
         self,
         routing_table: RoutingTable,
-        telemetry: Telemetry | None = None,
         store: InferenceStore | None = None,
     ) -> None:
         logger.debug("Initializing InferenceRouter")
         self.routing_table = routing_table
-        self.telemetry = telemetry
         self.store = store
-        if self.telemetry:
-            self.tokenizer = Tokenizer.get_instance()
-            self.formatter = ChatFormat(self.tokenizer)
 
     async def initialize(self) -> None:
         logger.debug("InferenceRouter.initialize")
@@ -107,83 +89,6 @@ class InferenceRouter(Inference):
         )
         await self.routing_table.register_model(model_id, provider_model_id, provider_id, metadata, model_type)
 
-    def _construct_metrics(
-        self,
-        prompt_tokens: int,
-        completion_tokens: int,
-        total_tokens: int,
-        fully_qualified_model_id: str,
-        provider_id: str,
-    ) -> list[MetricEvent]:
-        """Constructs a list of MetricEvent objects containing token usage metrics.
-
-        Args:
-            prompt_tokens: Number of tokens in the prompt
-            completion_tokens: Number of tokens in the completion
-            total_tokens: Total number of tokens used
-            fully_qualified_model_id:
-            provider_id: The provider identifier
-
-        Returns:
-            List of MetricEvent objects with token usage metrics
-        """
-        span = get_current_span()
-        if span is None:
-            logger.warning("No span found for token usage metrics")
-            return []
-
-        metrics = [
-            ("prompt_tokens", prompt_tokens),
-            ("completion_tokens", completion_tokens),
-            ("total_tokens", total_tokens),
-        ]
-        metric_events = []
-        for metric_name, value in metrics:
-            metric_events.append(
-                MetricEvent(
-                    trace_id=span.trace_id,
-                    span_id=span.span_id,
-                    metric=metric_name,
-                    value=value,
-                    timestamp=datetime.now(UTC),
-                    unit="tokens",
-                    attributes={
-                        "model_id": fully_qualified_model_id,
-                        "provider_id": provider_id,
-                    },
-                )
-            )
-        return metric_events
-
-    async def _compute_and_log_token_usage(
-        self,
-        prompt_tokens: int,
-        completion_tokens: int,
-        total_tokens: int,
-        model: Model,
-    ) -> list[MetricInResponse]:
-        metrics = self._construct_metrics(
-            prompt_tokens, completion_tokens, total_tokens, model.model_id, model.provider_id
-        )
-        if self.telemetry:
-            for metric in metrics:
-                enqueue_event(metric)
-        return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics]
-
-    async def _count_tokens(
-        self,
-        messages: list[Message] | InterleavedContent,
-        tool_prompt_format: ToolPromptFormat | None = None,
-    ) -> int | None:
-        if not hasattr(self, "formatter") or self.formatter is None:
-            return None
-
-        if isinstance(messages, list):
-            encoded = self.formatter.encode_dialog_prompt(messages, tool_prompt_format)
-        else:
-            encoded = self.formatter.encode_content(messages)
-        return len(encoded.tokens) if encoded and encoded.tokens else 0
-
     async def _get_model_provider(self, model_id: str, expected_model_type: str) -> tuple[Inference, str]:
         model = await self.routing_table.get_object_by_identifier("model", model_id)
         if model:
@@ -230,6 +135,17 @@
 
         return self.routing_table.impls_by_provider_id[provider_id], provider_resource_id
 
+    async def rerank(
+        self,
+        model: str,
+        query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
+        items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
+        max_num_results: int | None = None,
+    ) -> RerankResponse:
+        logger.debug(f"InferenceRouter.rerank: {model}")
+        provider, provider_resource_id = await self._get_model_provider(model, ModelType.rerank)
+        return await provider.rerank(provider_resource_id, query, items, max_num_results)
+
     async def openai_completion(
         self,
         params: Annotated[OpenAICompletionRequestWithExtraBody, Body(...)],
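The new rerank route in the hunk above follows the router's standard delegation pattern: resolve the model to a provider through the routing table (requiring ModelType.rerank), then forward the call with the provider-scoped resource id. A minimal usage sketch against a router instance; the model id and passages are illustrative, not taken from this diff:

    # Hedged sketch: calling InferenceRouter.rerank as defined above.
    # Assumes a model registered with ModelType.rerank under the id "my-reranker".
    response = await inference_router.rerank(
        model="my-reranker",  # illustrative model id
        query="What is the capital of France?",
        items=[
            "Paris is the capital of France.",
            "Berlin is the capital of Germany.",
        ],
        max_num_results=1,
    )
    # response is a RerankResponse; its exact fields are defined in llama_stack_api.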
@@ -243,26 +159,9 @@
 
         if params.stream:
             return await provider.openai_completion(params)
-        # TODO: Metrics do NOT work with openai_completion stream=True due to the fact
-        # that we do not return an AsyncIterator, our tests expect a stream of chunks we cannot intercept currently.
 
         response = await provider.openai_completion(params)
         response.model = request_model_id
-        if self.telemetry:
-            metrics = self._construct_metrics(
-                prompt_tokens=response.usage.prompt_tokens,
-                completion_tokens=response.usage.completion_tokens,
-                total_tokens=response.usage.total_tokens,
-                fully_qualified_model_id=request_model_id,
-                provider_id=provider.__provider_id__,
-            )
-            for metric in metrics:
-                enqueue_event(metric)
-
-            # these metrics will show up in the client response.
-            response.metrics = (
-                metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
-            )
         return response
 
     async def openai_chat_completion(
@@ -311,20 +210,6 @@
         if self.store:
             asyncio.create_task(self.store.store_chat_completion(response, params.messages))
 
-        if self.telemetry:
-            metrics = self._construct_metrics(
-                prompt_tokens=response.usage.prompt_tokens,
-                completion_tokens=response.usage.completion_tokens,
-                total_tokens=response.usage.total_tokens,
-                fully_qualified_model_id=request_model_id,
-                provider_id=provider.__provider_id__,
-            )
-            for metric in metrics:
-                enqueue_event(metric)
-            # these metrics will show up in the client response.
-            response.metrics = (
-                metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
-            )
         return response
 
     async def openai_embeddings(
@@ -392,121 +277,6 @@
             )
         return health_statuses
 
-    async def stream_tokens_and_compute_metrics(
-        self,
-        response,
-        prompt_tokens,
-        fully_qualified_model_id: str,
-        provider_id: str,
-        tool_prompt_format: ToolPromptFormat | None = None,
-    ) -> AsyncGenerator[ChatCompletionResponseStreamChunk, None] | AsyncGenerator[CompletionResponseStreamChunk, None]:
-        completion_text = ""
-        async for chunk in response:
-            complete = False
-            if hasattr(chunk, "event"):  # only ChatCompletions have .event
-                if chunk.event.event_type == ChatCompletionResponseEventType.progress:
-                    if chunk.event.delta.type == "text":
-                        completion_text += chunk.event.delta.text
-                if chunk.event.event_type == ChatCompletionResponseEventType.complete:
-                    complete = True
-                    completion_tokens = await self._count_tokens(
-                        [
-                            CompletionMessage(
-                                content=completion_text,
-                                stop_reason=StopReason.end_of_turn,
-                            )
-                        ],
-                        tool_prompt_format=tool_prompt_format,
-                    )
-            else:
-                if hasattr(chunk, "delta"):
-                    completion_text += chunk.delta
-                if hasattr(chunk, "stop_reason") and chunk.stop_reason and self.telemetry:
-                    complete = True
-                    completion_tokens = await self._count_tokens(completion_text)
-            # if we are done receiving tokens
-            if complete:
-                total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
-
-                # Create a separate span for streaming completion metrics
-                if self.telemetry:
-                    # Log metrics in the new span context
-                    completion_metrics = self._construct_metrics(
-                        prompt_tokens=prompt_tokens,
-                        completion_tokens=completion_tokens,
-                        total_tokens=total_tokens,
-                        fully_qualified_model_id=fully_qualified_model_id,
-                        provider_id=provider_id,
-                    )
-                    for metric in completion_metrics:
-                        if metric.metric in [
-                            "completion_tokens",
-                            "total_tokens",
-                        ]:  # Only log completion and total tokens
-                            enqueue_event(metric)
-
-                    # Return metrics in response
-                    async_metrics = [
-                        MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics
-                    ]
-                    chunk.metrics = async_metrics if chunk.metrics is None else chunk.metrics + async_metrics
-                else:
-                    # Fallback if no telemetry
-                    completion_metrics = self._construct_metrics(
-                        prompt_tokens or 0,
-                        completion_tokens or 0,
-                        total_tokens,
-                        fully_qualified_model_id=fully_qualified_model_id,
-                        provider_id=provider_id,
-                    )
-                    async_metrics = [
-                        MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics
-                    ]
-                    chunk.metrics = async_metrics if chunk.metrics is None else chunk.metrics + async_metrics
-            yield chunk
-
-    async def count_tokens_and_compute_metrics(
-        self,
-        response: ChatCompletionResponse | CompletionResponse,
-        prompt_tokens,
-        fully_qualified_model_id: str,
-        provider_id: str,
-        tool_prompt_format: ToolPromptFormat | None = None,
-    ):
-        if isinstance(response, ChatCompletionResponse):
-            content = [response.completion_message]
-        else:
-            content = response.content
-        completion_tokens = await self._count_tokens(messages=content, tool_prompt_format=tool_prompt_format)
-        total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
-
-        # Create a separate span for completion metrics
-        if self.telemetry:
-            # Log metrics in the new span context
-            completion_metrics = self._construct_metrics(
-                prompt_tokens=prompt_tokens,
-                completion_tokens=completion_tokens,
-                total_tokens=total_tokens,
-                fully_qualified_model_id=fully_qualified_model_id,
-                provider_id=provider_id,
-            )
-            for metric in completion_metrics:
-                if metric.metric in ["completion_tokens", "total_tokens"]:  # Only log completion and total tokens
-                    enqueue_event(metric)
-
-            # Return metrics in response
-            return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics]
-
-        # Fallback if no telemetry
-        metrics = self._construct_metrics(
-            prompt_tokens or 0,
-            completion_tokens or 0,
-            total_tokens,
-            fully_qualified_model_id=fully_qualified_model_id,
-            provider_id=provider_id,
-        )
-        return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics]
-
     async def stream_tokens_and_compute_metrics_openai_chat(
         self,
         response: AsyncIterator[OpenAIChatCompletionChunk],
@@ -574,8 +344,34 @@
                            )
                        if choice_delta.finish_reason:
                            current_choice_data["finish_reason"] = choice_delta.finish_reason
+
+                        # Convert logprobs from chat completion format to responses format
+                        # Chat completion returns list of ChatCompletionTokenLogprob, but
+                        # expecting list of OpenAITokenLogProb in OpenAIChoice
                        if choice_delta.logprobs and choice_delta.logprobs.content:
-                            current_choice_data["logprobs_content_parts"].extend(choice_delta.logprobs.content)
+                            converted_logprobs = []
+                            for token_logprob in choice_delta.logprobs.content:
+                                top_logprobs = None
+                                if token_logprob.top_logprobs:
+                                    top_logprobs = [
+                                        OpenAITopLogProb(
+                                            token=tlp.token,
+                                            bytes=tlp.bytes,
+                                            logprob=tlp.logprob,
+                                        )
+                                        for tlp in token_logprob.top_logprobs
+                                    ]
+                                converted_logprobs.append(
+                                    OpenAITokenLogProb(
+                                        token=token_logprob.token,
+                                        bytes=token_logprob.bytes,
+                                        logprob=token_logprob.logprob,
+                                        top_logprobs=top_logprobs,
+                                    )
+                                )
+                            # Update choice delta with the newly formatted logprobs object
+                            choice_delta.logprobs.content = converted_logprobs
+                            current_choice_data["logprobs_content_parts"].extend(converted_logprobs)
 
                 # Compute metrics on final chunk
                 if chunk.choices and chunk.choices[0].finish_reason:
@@ -583,18 +379,6 @@
                    for choice_data in choices_data.values():
                        completion_text += "".join(choice_data["content_parts"])
 
-                    # Add metrics to the chunk
-                    if self.telemetry and hasattr(chunk, "usage") and chunk.usage:
-                        metrics = self._construct_metrics(
-                            prompt_tokens=chunk.usage.prompt_tokens,
-                            completion_tokens=chunk.usage.completion_tokens,
-                            total_tokens=chunk.usage.total_tokens,
-                            model_id=fully_qualified_model_id,
-                            provider_id=provider_id,
-                        )
-                        for metric in metrics:
-                            enqueue_event(metric)
-
                yield chunk
        finally:
            # Store the final assembled completion
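The logprobs handling added in the @@ -574,8 +344,34 @@ hunk above is a field-by-field copy between two structurally identical shapes. Factored out of the router, the same transformation reads as follows; the helper name convert_token_logprobs is invented for illustration, while OpenAITokenLogProb and OpenAITopLogProb come from llama_stack_api as in the diff:

    from llama_stack_api import OpenAITokenLogProb, OpenAITopLogProb

    def convert_token_logprobs(content) -> list[OpenAITokenLogProb]:
        # Map each ChatCompletionTokenLogprob-shaped entry onto the
        # llama_stack_api models, preserving token, bytes, logprob and
        # the nested top_logprobs list.
        converted = []
        for token_logprob in content:
            top = None
            if token_logprob.top_logprobs:
                top = [
                    OpenAITopLogProb(token=t.token, bytes=t.bytes, logprob=t.logprob)
                    for t in token_logprob.top_logprobs
                ]
            converted.append(
                OpenAITokenLogProb(
                    token=token_logprob.token,
                    bytes=token_logprob.bytes,
                    logprob=token_logprob.logprob,
                    top_logprobs=top,
                )
            )
        return converted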
llama_stack/core/routers/safety.py
@@ -6,23 +6,26 @@
 
 from typing import Any
 
-from llama_stack.apis.inference import Message
-from llama_stack.apis.safety import RunShieldResponse, Safety
-from llama_stack.apis.safety.safety import ModerationObject
-from llama_stack.apis.shields import Shield
+from opentelemetry import trace
+
+from llama_stack.core.datatypes import SafetyConfig
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import RoutingTable
+from llama_stack.telemetry.helpers import safety_request_span_attributes, safety_span_name
+from llama_stack_api import ModerationObject, OpenAIMessageParam, RoutingTable, RunShieldResponse, Safety, Shield
 
 logger = get_logger(name=__name__, category="core::routers")
+tracer = trace.get_tracer(__name__)
 
 
 class SafetyRouter(Safety):
     def __init__(
         self,
         routing_table: RoutingTable,
+        safety_config: SafetyConfig | None = None,
     ) -> None:
         logger.debug("Initializing SafetyRouter")
         self.routing_table = routing_table
+        self.safety_config = safety_config
 
     async def initialize(self) -> None:
         logger.debug("SafetyRouter.initialize")
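With the constructor change above, a stack that wants run_moderation to work without an explicit model can hand the router a SafetyConfig carrying a default shield id. A construction sketch; only the default_shield_id field is visible in this diff, so the rest of SafetyConfig's shape and the shield id itself are assumptions:

    from llama_stack.core.datatypes import SafetyConfig

    safety_router = SafetyRouter(
        routing_table=routing_table,  # an existing RoutingTable with shields registered
        safety_config=SafetyConfig(default_shield_id="llama-guard"),  # illustrative shield id
    )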
@@ -49,37 +52,62 @@
     async def run_shield(
         self,
         shield_id: str,
-        messages: list[Message],
+        messages: list[OpenAIMessageParam],
         params: dict[str, Any] = None,
     ) -> RunShieldResponse:
-        logger.debug(f"SafetyRouter.run_shield: {shield_id}")
-        provider = await self.routing_table.get_provider_impl(shield_id)
-        return await provider.run_shield(
-            shield_id=shield_id,
-            messages=messages,
-            params=params,
-        )
+        with tracer.start_as_current_span(name=safety_span_name(shield_id)):
+            logger.debug(f"SafetyRouter.run_shield: {shield_id}")
+            provider = await self.routing_table.get_provider_impl(shield_id)
+            response = await provider.run_shield(
+                shield_id=shield_id,
+                messages=messages,
+                params=params,
+            )
+
+            safety_request_span_attributes(shield_id, messages, response)
+            return response
 
-    async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
-        async def get_shield_id(self, model: str) -> str:
-            """Get Shield id from model (provider_resource_id) of shield."""
-            list_shields_response = await self.routing_table.list_shields()
+    async def run_moderation(self, input: str | list[str], model: str | None = None) -> ModerationObject:
+        list_shields_response = await self.routing_table.list_shields()
+        shields = list_shields_response.data
 
-            matches = [s.identifier for s in list_shields_response.data if model == s.provider_resource_id]
+        selected_shield: Shield | None = None
+        provider_model: str | None = model
 
+        if model:
+            matches: list[Shield] = [s for s in shields if model == s.provider_resource_id]
             if not matches:
-                raise ValueError(f"No shield associated with provider_resource id {model}")
+                raise ValueError(
+                    f"No shield associated with provider_resource id {model}: choose from {[s.provider_resource_id for s in shields]}"
+                )
             if len(matches) > 1:
-                raise ValueError(f"Multiple shields associated with provider_resource id {model}")
-            return matches[0]
-
-        shield_id = await get_shield_id(self, model)
+                raise ValueError(
+                    f"Multiple shields associated with provider_resource id {model}: matched shields {[s.identifier for s in matches]}"
+                )
+            selected_shield = matches[0]
+        else:
+            default_shield_id = self.safety_config.default_shield_id if self.safety_config else None
+            if not default_shield_id:
+                raise ValueError(
+                    "No moderation model specified and no default_shield_id configured in safety config: select model "
+                    f"from {[s.provider_resource_id or s.identifier for s in shields]}"
+                )
+
+            selected_shield = next((s for s in shields if s.identifier == default_shield_id), None)
+            if selected_shield is None:
+                raise ValueError(
+                    f"Default moderation model not found. Choose from {[s.provider_resource_id or s.identifier for s in shields]}."
+                )
+
+            provider_model = selected_shield.provider_resource_id
+
+        shield_id = selected_shield.identifier
         logger.debug(f"SafetyRouter.run_moderation: {shield_id}")
         provider = await self.routing_table.get_provider_impl(shield_id)
 
         response = await provider.run_moderation(
             input=input,
-            model=model,
+            model=provider_model,
         )
 
         return response
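Under the new resolution logic, run_moderation accepts either an explicit model (matched against each shield's provider_resource_id) or no model at all, in which case the configured default shield is used and a ValueError lists the available choices when nothing is configured. A usage sketch with illustrative inputs:

    # Explicit model: resolved against each shield's provider_resource_id.
    result = await safety_router.run_moderation(
        input="Is this message safe?",
        model="meta-llama/Llama-Guard-3-8B",  # illustrative provider_resource_id
    )

    # No model: falls back to safety_config.default_shield_id; raises
    # ValueError if no default is configured or the default is not found.
    result = await safety_router.run_moderation(input="Is this message safe?")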
llama_stack/core/routers/tool_runtime.py
@@ -6,19 +6,12 @@
 
 from typing import Any
 
-from llama_stack.apis.common.content_types import (
+from llama_stack.log import get_logger
+from llama_stack_api import (
     URL,
-    InterleavedContent,
-)
-from llama_stack.apis.tools import (
     ListToolDefsResponse,
-    RAGDocument,
-    RAGQueryConfig,
-    RAGQueryResult,
-    RAGToolRuntime,
     ToolRuntime,
 )
-from llama_stack.log import get_logger
 
 from ..routing_tables.toolgroups import ToolGroupsRoutingTable
 
@@ -26,36 +19,6 @@ logger = get_logger(name=__name__, category="core::routers")
 
 
 class ToolRuntimeRouter(ToolRuntime):
-    class RagToolImpl(RAGToolRuntime):
-        def __init__(
-            self,
-            routing_table: ToolGroupsRoutingTable,
-        ) -> None:
-            logger.debug("Initializing ToolRuntimeRouter.RagToolImpl")
-            self.routing_table = routing_table
-
-        async def query(
-            self,
-            content: InterleavedContent,
-            vector_store_ids: list[str],
-            query_config: RAGQueryConfig | None = None,
-        ) -> RAGQueryResult:
-            logger.debug(f"ToolRuntimeRouter.RagToolImpl.query: {vector_store_ids}")
-            provider = await self.routing_table.get_provider_impl("knowledge_search")
-            return await provider.query(content, vector_store_ids, query_config)
-
-        async def insert(
-            self,
-            documents: list[RAGDocument],
-            vector_store_id: str,
-            chunk_size_in_tokens: int = 512,
-        ) -> None:
-            logger.debug(
-                f"ToolRuntimeRouter.RagToolImpl.insert: {vector_store_id}, {len(documents)} documents, chunk_size={chunk_size_in_tokens}"
-            )
-            provider = await self.routing_table.get_provider_impl("insert_into_memory")
-            return await provider.insert(documents, vector_store_id, chunk_size_in_tokens)
-
     def __init__(
         self,
         routing_table: ToolGroupsRoutingTable,
@@ -63,11 +26,6 @@ class ToolRuntimeRouter(ToolRuntime):
         logger.debug("Initializing ToolRuntimeRouter")
         self.routing_table = routing_table
 
-        # HACK ALERT this should be in sync with "get_all_api_endpoints()"
-        self.rag_tool = self.RagToolImpl(routing_table)
-        for method in ("query", "insert"):
-            setattr(self, f"rag_tool.{method}", getattr(self.rag_tool, method))
-
     async def initialize(self) -> None:
         logger.debug("ToolRuntimeRouter.initialize")
         pass
@@ -76,16 +34,16 @@ class ToolRuntimeRouter(ToolRuntime):
         logger.debug("ToolRuntimeRouter.shutdown")
         pass
 
-    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> Any:
+    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any], authorization: str | None = None) -> Any:
         logger.debug(f"ToolRuntimeRouter.invoke_tool: {tool_name}")
         provider = await self.routing_table.get_provider_impl(tool_name)
         return await provider.invoke_tool(
             tool_name=tool_name,
             kwargs=kwargs,
+            authorization=authorization,
         )
 
     async def list_runtime_tools(
-        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
+        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None, authorization: str | None = None
     ) -> ListToolDefsResponse:
-        logger.debug(f"ToolRuntimeRouter.list_runtime_tools: {tool_group_id}")
-        return await self.routing_table.list_tools(tool_group_id)
+        return await self.routing_table.list_tools(tool_group_id, authorization=authorization)
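Both tool-runtime entry points now thread an optional authorization string through to the provider, so per-request credentials (for example an MCP bearer token) can reach the tool backend without living in server configuration. A call sketch; the tool name, group id, and token are illustrative, and the expected format of the credential string is provider-defined:

    result = await tool_runtime_router.invoke_tool(
        tool_name="web_search",  # illustrative tool name
        kwargs={"query": "llama stack 0.4.1"},
        authorization="<token>",  # opaque credential forwarded to the provider
    )

    tools = await tool_runtime_router.list_runtime_tools(
        tool_group_id="mcp::my-tools",  # illustrative toolgroup id
        authorization="<token>",
    )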