llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (738)
  1. llama_stack/__init__.py +5 -0
  2. llama_stack/apis/agents/__init__.py +1 -1
  3. llama_stack/apis/agents/agents.py +700 -281
  4. llama_stack/apis/agents/openai_responses.py +1311 -0
  5. llama_stack/{providers/adapters/memory/sample/config.py → apis/batches/__init__.py} +2 -5
  6. llama_stack/apis/batches/batches.py +100 -0
  7. llama_stack/apis/benchmarks/__init__.py +7 -0
  8. llama_stack/apis/benchmarks/benchmarks.py +108 -0
  9. llama_stack/apis/common/content_types.py +143 -0
  10. llama_stack/apis/common/errors.py +103 -0
  11. llama_stack/apis/common/job_types.py +38 -0
  12. llama_stack/apis/common/responses.py +36 -0
  13. llama_stack/apis/common/training_types.py +36 -5
  14. llama_stack/apis/common/type_system.py +158 -0
  15. llama_stack/apis/conversations/__init__.py +31 -0
  16. llama_stack/apis/conversations/conversations.py +286 -0
  17. llama_stack/apis/datasetio/__init__.py +7 -0
  18. llama_stack/apis/datasetio/datasetio.py +59 -0
  19. llama_stack/apis/datasets/__init__.py +7 -0
  20. llama_stack/apis/datasets/datasets.py +251 -0
  21. llama_stack/apis/datatypes.py +160 -0
  22. llama_stack/apis/eval/__init__.py +7 -0
  23. llama_stack/apis/eval/eval.py +169 -0
  24. llama_stack/apis/files/__init__.py +7 -0
  25. llama_stack/apis/files/files.py +199 -0
  26. llama_stack/apis/inference/__init__.py +1 -1
  27. llama_stack/apis/inference/inference.py +1169 -113
  28. llama_stack/apis/inspect/__init__.py +1 -1
  29. llama_stack/apis/inspect/inspect.py +69 -16
  30. llama_stack/apis/models/__init__.py +1 -1
  31. llama_stack/apis/models/models.py +148 -21
  32. llama_stack/apis/post_training/__init__.py +1 -1
  33. llama_stack/apis/post_training/post_training.py +265 -120
  34. llama_stack/{providers/adapters/agents/sample/config.py → apis/prompts/__init__.py} +2 -5
  35. llama_stack/apis/prompts/prompts.py +204 -0
  36. llama_stack/apis/providers/__init__.py +7 -0
  37. llama_stack/apis/providers/providers.py +69 -0
  38. llama_stack/apis/resource.py +37 -0
  39. llama_stack/apis/safety/__init__.py +1 -1
  40. llama_stack/apis/safety/safety.py +95 -12
  41. llama_stack/apis/scoring/__init__.py +7 -0
  42. llama_stack/apis/scoring/scoring.py +93 -0
  43. llama_stack/apis/scoring_functions/__init__.py +7 -0
  44. llama_stack/apis/scoring_functions/scoring_functions.py +208 -0
  45. llama_stack/apis/shields/__init__.py +1 -1
  46. llama_stack/apis/shields/shields.py +76 -33
  47. llama_stack/apis/synthetic_data_generation/__init__.py +1 -1
  48. llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py +40 -17
  49. llama_stack/apis/telemetry/__init__.py +1 -1
  50. llama_stack/apis/telemetry/telemetry.py +322 -31
  51. llama_stack/apis/{dataset → tools}/__init__.py +2 -1
  52. llama_stack/apis/tools/rag_tool.py +218 -0
  53. llama_stack/apis/tools/tools.py +221 -0
  54. llama_stack/apis/vector_io/__init__.py +7 -0
  55. llama_stack/apis/vector_io/vector_io.py +960 -0
  56. llama_stack/apis/vector_stores/__init__.py +7 -0
  57. llama_stack/apis/vector_stores/vector_stores.py +51 -0
  58. llama_stack/apis/version.py +9 -0
  59. llama_stack/cli/llama.py +13 -5
  60. llama_stack/cli/stack/_list_deps.py +182 -0
  61. llama_stack/cli/stack/list_apis.py +1 -1
  62. llama_stack/cli/stack/list_deps.py +55 -0
  63. llama_stack/cli/stack/list_providers.py +24 -10
  64. llama_stack/cli/stack/list_stacks.py +56 -0
  65. llama_stack/cli/stack/remove.py +115 -0
  66. llama_stack/cli/stack/run.py +169 -56
  67. llama_stack/cli/stack/stack.py +18 -4
  68. llama_stack/cli/stack/utils.py +151 -0
  69. llama_stack/cli/table.py +23 -61
  70. llama_stack/cli/utils.py +29 -0
  71. llama_stack/core/access_control/access_control.py +131 -0
  72. llama_stack/core/access_control/conditions.py +129 -0
  73. llama_stack/core/access_control/datatypes.py +107 -0
  74. llama_stack/core/build.py +164 -0
  75. llama_stack/core/client.py +205 -0
  76. llama_stack/core/common.sh +37 -0
  77. llama_stack/{distribution → core}/configure.py +74 -55
  78. llama_stack/core/conversations/conversations.py +309 -0
  79. llama_stack/core/datatypes.py +625 -0
  80. llama_stack/core/distribution.py +276 -0
  81. llama_stack/core/external.py +54 -0
  82. llama_stack/core/id_generation.py +42 -0
  83. llama_stack/core/inspect.py +86 -0
  84. llama_stack/core/library_client.py +539 -0
  85. llama_stack/core/prompts/prompts.py +234 -0
  86. llama_stack/core/providers.py +137 -0
  87. llama_stack/core/request_headers.py +115 -0
  88. llama_stack/core/resolver.py +506 -0
  89. llama_stack/core/routers/__init__.py +101 -0
  90. llama_stack/core/routers/datasets.py +73 -0
  91. llama_stack/core/routers/eval_scoring.py +155 -0
  92. llama_stack/core/routers/inference.py +645 -0
  93. llama_stack/core/routers/safety.py +85 -0
  94. llama_stack/core/routers/tool_runtime.py +91 -0
  95. llama_stack/core/routers/vector_io.py +442 -0
  96. llama_stack/core/routing_tables/benchmarks.py +62 -0
  97. llama_stack/core/routing_tables/common.py +254 -0
  98. llama_stack/core/routing_tables/datasets.py +91 -0
  99. llama_stack/core/routing_tables/models.py +163 -0
  100. llama_stack/core/routing_tables/scoring_functions.py +66 -0
  101. llama_stack/core/routing_tables/shields.py +61 -0
  102. llama_stack/core/routing_tables/toolgroups.py +129 -0
  103. llama_stack/core/routing_tables/vector_stores.py +292 -0
  104. llama_stack/core/server/auth.py +187 -0
  105. llama_stack/core/server/auth_providers.py +494 -0
  106. llama_stack/core/server/quota.py +110 -0
  107. llama_stack/core/server/routes.py +141 -0
  108. llama_stack/core/server/server.py +542 -0
  109. llama_stack/core/server/tracing.py +80 -0
  110. llama_stack/core/stack.py +546 -0
  111. llama_stack/core/start_stack.sh +117 -0
  112. llama_stack/core/storage/datatypes.py +283 -0
  113. llama_stack/{cli/model → core/store}/__init__.py +1 -1
  114. llama_stack/core/store/registry.py +199 -0
  115. llama_stack/core/testing_context.py +49 -0
  116. llama_stack/core/ui/app.py +55 -0
  117. llama_stack/core/ui/modules/api.py +32 -0
  118. llama_stack/core/ui/modules/utils.py +42 -0
  119. llama_stack/core/ui/page/distribution/datasets.py +18 -0
  120. llama_stack/core/ui/page/distribution/eval_tasks.py +20 -0
  121. llama_stack/core/ui/page/distribution/models.py +18 -0
  122. llama_stack/core/ui/page/distribution/providers.py +27 -0
  123. llama_stack/core/ui/page/distribution/resources.py +48 -0
  124. llama_stack/core/ui/page/distribution/scoring_functions.py +18 -0
  125. llama_stack/core/ui/page/distribution/shields.py +19 -0
  126. llama_stack/core/ui/page/evaluations/app_eval.py +143 -0
  127. llama_stack/core/ui/page/evaluations/native_eval.py +253 -0
  128. llama_stack/core/ui/page/playground/chat.py +130 -0
  129. llama_stack/core/ui/page/playground/tools.py +352 -0
  130. llama_stack/core/utils/config.py +30 -0
  131. llama_stack/{distribution → core}/utils/config_dirs.py +3 -6
  132. llama_stack/core/utils/config_resolution.py +125 -0
  133. llama_stack/core/utils/context.py +84 -0
  134. llama_stack/core/utils/exec.py +96 -0
  135. llama_stack/{providers/impls/meta_reference/codeshield/config.py → core/utils/image_types.py} +4 -3
  136. llama_stack/{distribution → core}/utils/model_utils.py +2 -2
  137. llama_stack/{distribution → core}/utils/prompt_for_config.py +30 -63
  138. llama_stack/{apis/batch_inference → distributions/dell}/__init__.py +1 -1
  139. llama_stack/distributions/dell/build.yaml +33 -0
  140. llama_stack/distributions/dell/dell.py +158 -0
  141. llama_stack/distributions/dell/run-with-safety.yaml +141 -0
  142. llama_stack/distributions/dell/run.yaml +132 -0
  143. llama_stack/distributions/meta-reference-gpu/__init__.py +7 -0
  144. llama_stack/distributions/meta-reference-gpu/build.yaml +32 -0
  145. llama_stack/distributions/meta-reference-gpu/meta_reference.py +163 -0
  146. llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +154 -0
  147. llama_stack/distributions/meta-reference-gpu/run.yaml +139 -0
  148. llama_stack/{apis/evals → distributions/nvidia}/__init__.py +1 -1
  149. llama_stack/distributions/nvidia/build.yaml +29 -0
  150. llama_stack/distributions/nvidia/nvidia.py +154 -0
  151. llama_stack/distributions/nvidia/run-with-safety.yaml +137 -0
  152. llama_stack/distributions/nvidia/run.yaml +116 -0
  153. llama_stack/distributions/open-benchmark/__init__.py +7 -0
  154. llama_stack/distributions/open-benchmark/build.yaml +36 -0
  155. llama_stack/distributions/open-benchmark/open_benchmark.py +303 -0
  156. llama_stack/distributions/open-benchmark/run.yaml +252 -0
  157. llama_stack/distributions/postgres-demo/__init__.py +7 -0
  158. llama_stack/distributions/postgres-demo/build.yaml +23 -0
  159. llama_stack/distributions/postgres-demo/postgres_demo.py +125 -0
  160. llama_stack/distributions/postgres-demo/run.yaml +115 -0
  161. llama_stack/{apis/memory → distributions/starter}/__init__.py +1 -1
  162. llama_stack/distributions/starter/build.yaml +61 -0
  163. llama_stack/distributions/starter/run-with-postgres-store.yaml +285 -0
  164. llama_stack/distributions/starter/run.yaml +276 -0
  165. llama_stack/distributions/starter/starter.py +345 -0
  166. llama_stack/distributions/starter-gpu/__init__.py +7 -0
  167. llama_stack/distributions/starter-gpu/build.yaml +61 -0
  168. llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +288 -0
  169. llama_stack/distributions/starter-gpu/run.yaml +279 -0
  170. llama_stack/distributions/starter-gpu/starter_gpu.py +20 -0
  171. llama_stack/distributions/template.py +456 -0
  172. llama_stack/distributions/watsonx/__init__.py +7 -0
  173. llama_stack/distributions/watsonx/build.yaml +33 -0
  174. llama_stack/distributions/watsonx/run.yaml +133 -0
  175. llama_stack/distributions/watsonx/watsonx.py +95 -0
  176. llama_stack/env.py +24 -0
  177. llama_stack/log.py +314 -0
  178. llama_stack/models/llama/checkpoint.py +164 -0
  179. llama_stack/models/llama/datatypes.py +164 -0
  180. llama_stack/models/llama/hadamard_utils.py +86 -0
  181. llama_stack/models/llama/llama3/args.py +74 -0
  182. llama_stack/models/llama/llama3/chat_format.py +286 -0
  183. llama_stack/models/llama/llama3/generation.py +376 -0
  184. llama_stack/models/llama/llama3/interface.py +255 -0
  185. llama_stack/models/llama/llama3/model.py +304 -0
  186. llama_stack/models/llama/llama3/multimodal/__init__.py +12 -0
  187. llama_stack/models/llama/llama3/multimodal/encoder_utils.py +180 -0
  188. llama_stack/models/llama/llama3/multimodal/image_transform.py +409 -0
  189. llama_stack/models/llama/llama3/multimodal/model.py +1430 -0
  190. llama_stack/models/llama/llama3/multimodal/utils.py +26 -0
  191. llama_stack/models/llama/llama3/prompt_templates/__init__.py +22 -0
  192. llama_stack/models/llama/llama3/prompt_templates/base.py +39 -0
  193. llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +319 -0
  194. llama_stack/models/llama/llama3/prompt_templates/tool_response.py +62 -0
  195. llama_stack/models/llama/llama3/quantization/loader.py +316 -0
  196. llama_stack/models/llama/llama3/template_data.py +116 -0
  197. llama_stack/models/llama/llama3/tokenizer.model +128000 -0
  198. llama_stack/models/llama/llama3/tokenizer.py +198 -0
  199. llama_stack/models/llama/llama3/tool_utils.py +266 -0
  200. llama_stack/models/llama/llama3_1/__init__.py +12 -0
  201. llama_stack/models/llama/llama3_1/prompt_format.md +358 -0
  202. llama_stack/models/llama/llama3_1/prompts.py +258 -0
  203. llama_stack/models/llama/llama3_2/prompts_text.py +229 -0
  204. llama_stack/models/llama/llama3_2/prompts_vision.py +126 -0
  205. llama_stack/models/llama/llama3_2/text_prompt_format.md +286 -0
  206. llama_stack/models/llama/llama3_2/vision_prompt_format.md +141 -0
  207. llama_stack/models/llama/llama3_3/prompts.py +259 -0
  208. llama_stack/models/llama/llama4/args.py +107 -0
  209. llama_stack/models/llama/llama4/chat_format.py +317 -0
  210. llama_stack/models/llama/llama4/datatypes.py +56 -0
  211. llama_stack/models/llama/llama4/ffn.py +58 -0
  212. llama_stack/models/llama/llama4/generation.py +313 -0
  213. llama_stack/models/llama/llama4/model.py +437 -0
  214. llama_stack/models/llama/llama4/moe.py +214 -0
  215. llama_stack/models/llama/llama4/preprocess.py +435 -0
  216. llama_stack/models/llama/llama4/prompt_format.md +304 -0
  217. llama_stack/models/llama/llama4/prompt_templates/system_prompts.py +136 -0
  218. llama_stack/models/llama/llama4/prompts.py +279 -0
  219. llama_stack/models/llama/llama4/quantization/__init__.py +5 -0
  220. llama_stack/models/llama/llama4/quantization/loader.py +226 -0
  221. llama_stack/models/llama/llama4/tokenizer.model +200000 -0
  222. llama_stack/models/llama/llama4/tokenizer.py +263 -0
  223. llama_stack/models/llama/llama4/vision/__init__.py +5 -0
  224. llama_stack/models/llama/llama4/vision/embedding.py +210 -0
  225. llama_stack/models/llama/llama4/vision/encoder.py +412 -0
  226. llama_stack/models/llama/prompt_format.py +191 -0
  227. llama_stack/models/llama/quantize_impls.py +316 -0
  228. llama_stack/models/llama/sku_list.py +1029 -0
  229. llama_stack/models/llama/sku_types.py +233 -0
  230. llama_stack/models/llama/tokenizer_utils.py +40 -0
  231. llama_stack/providers/datatypes.py +136 -107
  232. llama_stack/providers/inline/__init__.py +5 -0
  233. llama_stack/providers/inline/agents/__init__.py +5 -0
  234. llama_stack/providers/{impls/meta_reference/agents → inline/agents/meta_reference}/__init__.py +12 -5
  235. llama_stack/providers/inline/agents/meta_reference/agent_instance.py +1024 -0
  236. llama_stack/providers/inline/agents/meta_reference/agents.py +383 -0
  237. llama_stack/providers/inline/agents/meta_reference/config.py +37 -0
  238. llama_stack/providers/inline/agents/meta_reference/persistence.py +228 -0
  239. llama_stack/providers/inline/agents/meta_reference/responses/__init__.py +5 -0
  240. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +423 -0
  241. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +1226 -0
  242. llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +449 -0
  243. llama_stack/providers/inline/agents/meta_reference/responses/types.py +194 -0
  244. llama_stack/providers/inline/agents/meta_reference/responses/utils.py +365 -0
  245. llama_stack/providers/inline/agents/meta_reference/safety.py +52 -0
  246. llama_stack/providers/inline/batches/__init__.py +5 -0
  247. llama_stack/providers/inline/batches/reference/__init__.py +36 -0
  248. llama_stack/providers/inline/batches/reference/batches.py +679 -0
  249. llama_stack/providers/inline/batches/reference/config.py +40 -0
  250. llama_stack/providers/inline/datasetio/__init__.py +5 -0
  251. llama_stack/providers/inline/datasetio/localfs/__init__.py +20 -0
  252. llama_stack/providers/inline/datasetio/localfs/config.py +23 -0
  253. llama_stack/providers/inline/datasetio/localfs/datasetio.py +113 -0
  254. llama_stack/providers/inline/eval/__init__.py +5 -0
  255. llama_stack/providers/inline/eval/meta_reference/__init__.py +28 -0
  256. llama_stack/providers/inline/eval/meta_reference/config.py +23 -0
  257. llama_stack/providers/inline/eval/meta_reference/eval.py +259 -0
  258. llama_stack/providers/inline/files/localfs/__init__.py +20 -0
  259. llama_stack/providers/inline/files/localfs/config.py +31 -0
  260. llama_stack/providers/inline/files/localfs/files.py +219 -0
  261. llama_stack/providers/inline/inference/__init__.py +5 -0
  262. llama_stack/providers/{impls/meta_reference/inference → inline/inference/meta_reference}/__init__.py +4 -4
  263. llama_stack/providers/inline/inference/meta_reference/common.py +24 -0
  264. llama_stack/providers/inline/inference/meta_reference/config.py +68 -0
  265. llama_stack/providers/inline/inference/meta_reference/generators.py +211 -0
  266. llama_stack/providers/inline/inference/meta_reference/inference.py +158 -0
  267. llama_stack/providers/inline/inference/meta_reference/model_parallel.py +96 -0
  268. llama_stack/providers/{impls/meta_reference/inference → inline/inference/meta_reference}/parallel_utils.py +56 -73
  269. llama_stack/providers/inline/inference/sentence_transformers/__init__.py +22 -0
  270. llama_stack/providers/{impls/meta_reference/agents → inline/inference/sentence_transformers}/config.py +6 -4
  271. llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +83 -0
  272. llama_stack/providers/inline/post_training/__init__.py +5 -0
  273. llama_stack/providers/inline/post_training/common/__init__.py +5 -0
  274. llama_stack/providers/inline/post_training/common/utils.py +35 -0
  275. llama_stack/providers/inline/post_training/common/validator.py +36 -0
  276. llama_stack/providers/inline/post_training/huggingface/__init__.py +27 -0
  277. llama_stack/providers/inline/post_training/huggingface/config.py +83 -0
  278. llama_stack/providers/inline/post_training/huggingface/post_training.py +208 -0
  279. llama_stack/providers/inline/post_training/huggingface/recipes/__init__.py +5 -0
  280. llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py +519 -0
  281. llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device_dpo.py +485 -0
  282. llama_stack/providers/inline/post_training/huggingface/utils.py +269 -0
  283. llama_stack/providers/inline/post_training/torchtune/__init__.py +27 -0
  284. llama_stack/providers/inline/post_training/torchtune/common/__init__.py +5 -0
  285. llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py +240 -0
  286. llama_stack/providers/inline/post_training/torchtune/common/utils.py +99 -0
  287. llama_stack/providers/inline/post_training/torchtune/config.py +20 -0
  288. llama_stack/providers/inline/post_training/torchtune/datasets/__init__.py +5 -0
  289. llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py +57 -0
  290. llama_stack/providers/inline/post_training/torchtune/datasets/sft.py +78 -0
  291. llama_stack/providers/inline/post_training/torchtune/post_training.py +178 -0
  292. llama_stack/providers/inline/post_training/torchtune/recipes/__init__.py +5 -0
  293. llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +588 -0
  294. llama_stack/providers/inline/safety/__init__.py +5 -0
  295. llama_stack/providers/{impls/meta_reference/codeshield → inline/safety/code_scanner}/__init__.py +4 -2
  296. llama_stack/providers/inline/safety/code_scanner/code_scanner.py +128 -0
  297. llama_stack/providers/{impls/meta_reference/memory → inline/safety/code_scanner}/config.py +5 -3
  298. llama_stack/providers/inline/safety/llama_guard/__init__.py +19 -0
  299. llama_stack/providers/inline/safety/llama_guard/config.py +19 -0
  300. llama_stack/providers/inline/safety/llama_guard/llama_guard.py +489 -0
  301. llama_stack/providers/{adapters/memory/sample → inline/safety/prompt_guard}/__init__.py +4 -4
  302. llama_stack/providers/inline/safety/prompt_guard/config.py +32 -0
  303. llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +131 -0
  304. llama_stack/providers/inline/scoring/__init__.py +5 -0
  305. llama_stack/providers/inline/scoring/basic/__init__.py +25 -0
  306. llama_stack/providers/{adapters/memory/weaviate → inline/scoring/basic}/config.py +5 -7
  307. llama_stack/providers/inline/scoring/basic/scoring.py +126 -0
  308. llama_stack/providers/inline/scoring/basic/scoring_fn/__init__.py +5 -0
  309. llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py +240 -0
  310. llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py +41 -0
  311. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/__init__.py +5 -0
  312. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py +21 -0
  313. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py +21 -0
  314. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py +23 -0
  315. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py +27 -0
  316. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py +71 -0
  317. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py +21 -0
  318. llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py +80 -0
  319. llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py +66 -0
  320. llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py +58 -0
  321. llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py +38 -0
  322. llama_stack/providers/inline/scoring/basic/utils/__init__.py +5 -0
  323. llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py +3319 -0
  324. llama_stack/providers/inline/scoring/basic/utils/math_utils.py +330 -0
  325. llama_stack/providers/inline/scoring/braintrust/__init__.py +27 -0
  326. llama_stack/providers/inline/scoring/braintrust/braintrust.py +230 -0
  327. llama_stack/providers/inline/scoring/braintrust/config.py +21 -0
  328. llama_stack/providers/inline/scoring/braintrust/scoring_fn/__init__.py +5 -0
  329. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/__init__.py +5 -0
  330. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py +24 -0
  331. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py +24 -0
  332. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py +24 -0
  333. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py +24 -0
  334. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py +24 -0
  335. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py +24 -0
  336. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py +23 -0
  337. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py +24 -0
  338. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py +24 -0
  339. llama_stack/providers/inline/scoring/llm_as_judge/__init__.py +21 -0
  340. llama_stack/providers/inline/scoring/llm_as_judge/config.py +14 -0
  341. llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +113 -0
  342. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/__init__.py +5 -0
  343. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/__init__.py +5 -0
  344. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py +96 -0
  345. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py +20 -0
  346. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py +81 -0
  347. llama_stack/providers/inline/telemetry/__init__.py +5 -0
  348. llama_stack/providers/inline/telemetry/meta_reference/__init__.py +21 -0
  349. llama_stack/providers/inline/telemetry/meta_reference/config.py +47 -0
  350. llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +252 -0
  351. llama_stack/providers/inline/tool_runtime/__init__.py +5 -0
  352. llama_stack/providers/inline/tool_runtime/rag/__init__.py +19 -0
  353. llama_stack/providers/{impls/meta_reference/telemetry → inline/tool_runtime/rag}/config.py +5 -3
  354. llama_stack/providers/inline/tool_runtime/rag/context_retriever.py +77 -0
  355. llama_stack/providers/inline/tool_runtime/rag/memory.py +332 -0
  356. llama_stack/providers/inline/vector_io/__init__.py +5 -0
  357. llama_stack/providers/inline/vector_io/chroma/__init__.py +19 -0
  358. llama_stack/providers/inline/vector_io/chroma/config.py +30 -0
  359. llama_stack/providers/inline/vector_io/faiss/__init__.py +21 -0
  360. llama_stack/providers/inline/vector_io/faiss/config.py +26 -0
  361. llama_stack/providers/inline/vector_io/faiss/faiss.py +293 -0
  362. llama_stack/providers/inline/vector_io/milvus/__init__.py +19 -0
  363. llama_stack/providers/inline/vector_io/milvus/config.py +29 -0
  364. llama_stack/providers/inline/vector_io/qdrant/__init__.py +20 -0
  365. llama_stack/providers/inline/vector_io/qdrant/config.py +29 -0
  366. llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py +20 -0
  367. llama_stack/providers/inline/vector_io/sqlite_vec/config.py +26 -0
  368. llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +483 -0
  369. llama_stack/providers/registry/agents.py +16 -18
  370. llama_stack/providers/registry/batches.py +26 -0
  371. llama_stack/providers/registry/datasetio.py +49 -0
  372. llama_stack/providers/registry/eval.py +46 -0
  373. llama_stack/providers/registry/files.py +31 -0
  374. llama_stack/providers/registry/inference.py +273 -118
  375. llama_stack/providers/registry/post_training.py +69 -0
  376. llama_stack/providers/registry/safety.py +46 -41
  377. llama_stack/providers/registry/scoring.py +51 -0
  378. llama_stack/providers/registry/tool_runtime.py +87 -0
  379. llama_stack/providers/registry/vector_io.py +828 -0
  380. llama_stack/providers/remote/__init__.py +5 -0
  381. llama_stack/providers/remote/agents/__init__.py +5 -0
  382. llama_stack/providers/remote/datasetio/__init__.py +5 -0
  383. llama_stack/providers/{adapters/memory/chroma → remote/datasetio/huggingface}/__init__.py +7 -4
  384. llama_stack/providers/remote/datasetio/huggingface/config.py +23 -0
  385. llama_stack/providers/remote/datasetio/huggingface/huggingface.py +99 -0
  386. llama_stack/providers/remote/datasetio/nvidia/__init__.py +23 -0
  387. llama_stack/providers/remote/datasetio/nvidia/config.py +61 -0
  388. llama_stack/providers/remote/datasetio/nvidia/datasetio.py +116 -0
  389. llama_stack/providers/remote/eval/__init__.py +5 -0
  390. llama_stack/providers/remote/eval/nvidia/__init__.py +31 -0
  391. llama_stack/providers/remote/eval/nvidia/config.py +29 -0
  392. llama_stack/providers/remote/eval/nvidia/eval.py +162 -0
  393. llama_stack/providers/remote/files/s3/__init__.py +19 -0
  394. llama_stack/providers/remote/files/s3/config.py +42 -0
  395. llama_stack/providers/remote/files/s3/files.py +313 -0
  396. llama_stack/providers/remote/inference/__init__.py +5 -0
  397. llama_stack/providers/{adapters/safety/sample → remote/inference/anthropic}/__init__.py +4 -6
  398. llama_stack/providers/remote/inference/anthropic/anthropic.py +36 -0
  399. llama_stack/providers/remote/inference/anthropic/config.py +28 -0
  400. llama_stack/providers/{impls/meta_reference/telemetry → remote/inference/azure}/__init__.py +4 -4
  401. llama_stack/providers/remote/inference/azure/azure.py +25 -0
  402. llama_stack/providers/remote/inference/azure/config.py +61 -0
  403. llama_stack/providers/{adapters → remote}/inference/bedrock/__init__.py +18 -17
  404. llama_stack/providers/remote/inference/bedrock/bedrock.py +142 -0
  405. llama_stack/providers/{adapters/inference/sample → remote/inference/bedrock}/config.py +3 -4
  406. llama_stack/providers/remote/inference/bedrock/models.py +29 -0
  407. llama_stack/providers/remote/inference/cerebras/__init__.py +19 -0
  408. llama_stack/providers/remote/inference/cerebras/cerebras.py +28 -0
  409. llama_stack/providers/remote/inference/cerebras/config.py +30 -0
  410. llama_stack/providers/{adapters → remote}/inference/databricks/__init__.py +4 -5
  411. llama_stack/providers/remote/inference/databricks/config.py +37 -0
  412. llama_stack/providers/remote/inference/databricks/databricks.py +44 -0
  413. llama_stack/providers/{adapters → remote}/inference/fireworks/__init__.py +8 -4
  414. llama_stack/providers/remote/inference/fireworks/config.py +27 -0
  415. llama_stack/providers/remote/inference/fireworks/fireworks.py +27 -0
  416. llama_stack/providers/{adapters/memory/pgvector → remote/inference/gemini}/__init__.py +4 -4
  417. llama_stack/providers/remote/inference/gemini/config.py +28 -0
  418. llama_stack/providers/remote/inference/gemini/gemini.py +82 -0
  419. llama_stack/providers/remote/inference/groq/__init__.py +15 -0
  420. llama_stack/providers/remote/inference/groq/config.py +34 -0
  421. llama_stack/providers/remote/inference/groq/groq.py +18 -0
  422. llama_stack/providers/remote/inference/llama_openai_compat/__init__.py +15 -0
  423. llama_stack/providers/remote/inference/llama_openai_compat/config.py +34 -0
  424. llama_stack/providers/remote/inference/llama_openai_compat/llama.py +46 -0
  425. llama_stack/providers/remote/inference/nvidia/__init__.py +23 -0
  426. llama_stack/providers/remote/inference/nvidia/config.py +64 -0
  427. llama_stack/providers/remote/inference/nvidia/nvidia.py +61 -0
  428. llama_stack/providers/{adapters/safety/sample/config.py → remote/inference/nvidia/utils.py} +3 -4
  429. llama_stack/providers/{impls/vllm → remote/inference/ollama}/__init__.py +4 -6
  430. llama_stack/providers/remote/inference/ollama/config.py +25 -0
  431. llama_stack/providers/remote/inference/ollama/ollama.py +102 -0
  432. llama_stack/providers/{adapters/telemetry/opentelemetry → remote/inference/openai}/__init__.py +4 -4
  433. llama_stack/providers/remote/inference/openai/config.py +39 -0
  434. llama_stack/providers/remote/inference/openai/openai.py +38 -0
  435. llama_stack/providers/remote/inference/passthrough/__init__.py +23 -0
  436. llama_stack/providers/remote/inference/passthrough/config.py +34 -0
  437. llama_stack/providers/remote/inference/passthrough/passthrough.py +122 -0
  438. llama_stack/providers/remote/inference/runpod/__init__.py +16 -0
  439. llama_stack/providers/remote/inference/runpod/config.py +32 -0
  440. llama_stack/providers/remote/inference/runpod/runpod.py +42 -0
  441. llama_stack/providers/remote/inference/sambanova/__init__.py +16 -0
  442. llama_stack/providers/remote/inference/sambanova/config.py +34 -0
  443. llama_stack/providers/remote/inference/sambanova/sambanova.py +28 -0
  444. llama_stack/providers/{adapters → remote}/inference/tgi/__init__.py +3 -4
  445. llama_stack/providers/remote/inference/tgi/config.py +76 -0
  446. llama_stack/providers/remote/inference/tgi/tgi.py +85 -0
  447. llama_stack/providers/{adapters → remote}/inference/together/__init__.py +8 -4
  448. llama_stack/providers/remote/inference/together/config.py +27 -0
  449. llama_stack/providers/remote/inference/together/together.py +102 -0
  450. llama_stack/providers/remote/inference/vertexai/__init__.py +15 -0
  451. llama_stack/providers/remote/inference/vertexai/config.py +48 -0
  452. llama_stack/providers/remote/inference/vertexai/vertexai.py +54 -0
  453. llama_stack/providers/remote/inference/vllm/__init__.py +22 -0
  454. llama_stack/providers/remote/inference/vllm/config.py +59 -0
  455. llama_stack/providers/remote/inference/vllm/vllm.py +111 -0
  456. llama_stack/providers/remote/inference/watsonx/__init__.py +15 -0
  457. llama_stack/providers/remote/inference/watsonx/config.py +45 -0
  458. llama_stack/providers/remote/inference/watsonx/watsonx.py +336 -0
  459. llama_stack/providers/remote/post_training/__init__.py +5 -0
  460. llama_stack/providers/remote/post_training/nvidia/__init__.py +23 -0
  461. llama_stack/providers/remote/post_training/nvidia/config.py +113 -0
  462. llama_stack/providers/remote/post_training/nvidia/models.py +27 -0
  463. llama_stack/providers/remote/post_training/nvidia/post_training.py +430 -0
  464. llama_stack/providers/remote/post_training/nvidia/utils.py +63 -0
  465. llama_stack/providers/remote/safety/__init__.py +5 -0
  466. llama_stack/providers/remote/safety/bedrock/bedrock.py +111 -0
  467. llama_stack/providers/remote/safety/bedrock/config.py +14 -0
  468. llama_stack/providers/{adapters/inference/sample → remote/safety/nvidia}/__init__.py +5 -4
  469. llama_stack/providers/remote/safety/nvidia/config.py +40 -0
  470. llama_stack/providers/remote/safety/nvidia/nvidia.py +161 -0
  471. llama_stack/providers/{adapters/agents/sample → remote/safety/sambanova}/__init__.py +5 -4
  472. llama_stack/providers/remote/safety/sambanova/config.py +37 -0
  473. llama_stack/providers/remote/safety/sambanova/sambanova.py +98 -0
  474. llama_stack/providers/remote/tool_runtime/__init__.py +5 -0
  475. llama_stack/providers/remote/tool_runtime/bing_search/__init__.py +21 -0
  476. llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py +112 -0
  477. llama_stack/providers/remote/tool_runtime/bing_search/config.py +22 -0
  478. llama_stack/providers/remote/tool_runtime/brave_search/__init__.py +20 -0
  479. llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py +148 -0
  480. llama_stack/providers/remote/tool_runtime/brave_search/config.py +27 -0
  481. llama_stack/providers/remote/tool_runtime/model_context_protocol/__init__.py +15 -0
  482. llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py +20 -0
  483. llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py +73 -0
  484. llama_stack/providers/remote/tool_runtime/tavily_search/__init__.py +20 -0
  485. llama_stack/providers/remote/tool_runtime/tavily_search/config.py +27 -0
  486. llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py +84 -0
  487. llama_stack/providers/remote/tool_runtime/wolfram_alpha/__init__.py +22 -0
  488. llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py +21 -0
  489. llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py +140 -0
  490. llama_stack/providers/remote/vector_io/__init__.py +5 -0
  491. llama_stack/providers/remote/vector_io/chroma/__init__.py +17 -0
  492. llama_stack/providers/remote/vector_io/chroma/chroma.py +215 -0
  493. llama_stack/providers/remote/vector_io/chroma/config.py +28 -0
  494. llama_stack/providers/remote/vector_io/milvus/__init__.py +18 -0
  495. llama_stack/providers/remote/vector_io/milvus/config.py +35 -0
  496. llama_stack/providers/remote/vector_io/milvus/milvus.py +375 -0
  497. llama_stack/providers/remote/vector_io/pgvector/__init__.py +17 -0
  498. llama_stack/providers/remote/vector_io/pgvector/config.py +47 -0
  499. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +460 -0
  500. llama_stack/providers/remote/vector_io/qdrant/__init__.py +17 -0
  501. llama_stack/providers/remote/vector_io/qdrant/config.py +37 -0
  502. llama_stack/providers/remote/vector_io/qdrant/qdrant.py +265 -0
  503. llama_stack/providers/remote/vector_io/weaviate/__init__.py +17 -0
  504. llama_stack/providers/remote/vector_io/weaviate/config.py +32 -0
  505. llama_stack/providers/remote/vector_io/weaviate/weaviate.py +393 -0
  506. llama_stack/providers/utils/bedrock/__init__.py +5 -0
  507. llama_stack/providers/utils/bedrock/client.py +74 -0
  508. llama_stack/providers/utils/bedrock/config.py +64 -0
  509. llama_stack/providers/utils/bedrock/refreshable_boto_session.py +112 -0
  510. llama_stack/providers/utils/common/__init__.py +5 -0
  511. llama_stack/providers/utils/common/data_schema_validator.py +103 -0
  512. llama_stack/providers/utils/datasetio/__init__.py +5 -0
  513. llama_stack/providers/utils/datasetio/url_utils.py +47 -0
  514. llama_stack/providers/utils/files/__init__.py +5 -0
  515. llama_stack/providers/utils/files/form_data.py +69 -0
  516. llama_stack/providers/utils/inference/__init__.py +8 -7
  517. llama_stack/providers/utils/inference/embedding_mixin.py +101 -0
  518. llama_stack/providers/utils/inference/inference_store.py +264 -0
  519. llama_stack/providers/utils/inference/litellm_openai_mixin.py +336 -0
  520. llama_stack/providers/utils/inference/model_registry.py +173 -23
  521. llama_stack/providers/utils/inference/openai_compat.py +1261 -49
  522. llama_stack/providers/utils/inference/openai_mixin.py +506 -0
  523. llama_stack/providers/utils/inference/prompt_adapter.py +365 -67
  524. llama_stack/providers/utils/kvstore/api.py +6 -6
  525. llama_stack/providers/utils/kvstore/config.py +28 -48
  526. llama_stack/providers/utils/kvstore/kvstore.py +61 -15
  527. llama_stack/providers/utils/kvstore/mongodb/__init__.py +9 -0
  528. llama_stack/providers/utils/kvstore/mongodb/mongodb.py +82 -0
  529. llama_stack/providers/utils/kvstore/postgres/__init__.py +7 -0
  530. llama_stack/providers/utils/kvstore/postgres/postgres.py +114 -0
  531. llama_stack/providers/utils/kvstore/redis/redis.py +33 -9
  532. llama_stack/providers/utils/kvstore/sqlite/config.py +2 -1
  533. llama_stack/providers/utils/kvstore/sqlite/sqlite.py +123 -22
  534. llama_stack/providers/utils/memory/file_utils.py +1 -1
  535. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +1304 -0
  536. llama_stack/providers/utils/memory/vector_store.py +220 -82
  537. llama_stack/providers/utils/pagination.py +43 -0
  538. llama_stack/providers/utils/responses/__init__.py +5 -0
  539. llama_stack/providers/utils/responses/responses_store.py +292 -0
  540. llama_stack/providers/utils/scheduler.py +270 -0
  541. llama_stack/providers/utils/scoring/__init__.py +5 -0
  542. llama_stack/providers/utils/scoring/aggregation_utils.py +75 -0
  543. llama_stack/providers/utils/scoring/base_scoring_fn.py +114 -0
  544. llama_stack/providers/utils/scoring/basic_scoring_utils.py +26 -0
  545. llama_stack/providers/utils/sqlstore/__init__.py +5 -0
  546. llama_stack/providers/utils/sqlstore/api.py +128 -0
  547. llama_stack/providers/utils/sqlstore/authorized_sqlstore.py +319 -0
  548. llama_stack/providers/utils/sqlstore/sqlalchemy_sqlstore.py +343 -0
  549. llama_stack/providers/utils/sqlstore/sqlstore.py +70 -0
  550. llama_stack/providers/utils/telemetry/trace_protocol.py +142 -0
  551. llama_stack/providers/utils/telemetry/tracing.py +192 -53
  552. llama_stack/providers/utils/tools/__init__.py +5 -0
  553. llama_stack/providers/utils/tools/mcp.py +148 -0
  554. llama_stack/providers/utils/tools/ttl_dict.py +70 -0
  555. llama_stack/providers/utils/vector_io/__init__.py +5 -0
  556. llama_stack/providers/utils/vector_io/vector_utils.py +156 -0
  557. llama_stack/schema_utils.py +118 -0
  558. llama_stack/strong_typing/__init__.py +19 -0
  559. llama_stack/strong_typing/auxiliary.py +228 -0
  560. llama_stack/strong_typing/classdef.py +440 -0
  561. llama_stack/strong_typing/core.py +46 -0
  562. llama_stack/strong_typing/deserializer.py +877 -0
  563. llama_stack/strong_typing/docstring.py +409 -0
  564. llama_stack/strong_typing/exception.py +23 -0
  565. llama_stack/strong_typing/inspection.py +1085 -0
  566. llama_stack/strong_typing/mapping.py +40 -0
  567. llama_stack/strong_typing/name.py +182 -0
  568. llama_stack/strong_typing/py.typed +0 -0
  569. llama_stack/strong_typing/schema.py +792 -0
  570. llama_stack/strong_typing/serialization.py +97 -0
  571. llama_stack/strong_typing/serializer.py +500 -0
  572. llama_stack/strong_typing/slots.py +27 -0
  573. llama_stack/strong_typing/topological.py +89 -0
  574. llama_stack/testing/__init__.py +5 -0
  575. llama_stack/testing/api_recorder.py +956 -0
  576. llama_stack/ui/node_modules/flatted/python/flatted.py +149 -0
  577. llama_stack-0.3.4.dist-info/METADATA +261 -0
  578. llama_stack-0.3.4.dist-info/RECORD +625 -0
  579. {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/WHEEL +1 -1
  580. llama_stack/apis/agents/client.py +0 -292
  581. llama_stack/apis/agents/event_logger.py +0 -184
  582. llama_stack/apis/batch_inference/batch_inference.py +0 -72
  583. llama_stack/apis/common/deployment_types.py +0 -31
  584. llama_stack/apis/dataset/dataset.py +0 -63
  585. llama_stack/apis/evals/evals.py +0 -122
  586. llama_stack/apis/inference/client.py +0 -197
  587. llama_stack/apis/inspect/client.py +0 -82
  588. llama_stack/apis/memory/client.py +0 -155
  589. llama_stack/apis/memory/memory.py +0 -65
  590. llama_stack/apis/memory_banks/__init__.py +0 -7
  591. llama_stack/apis/memory_banks/client.py +0 -101
  592. llama_stack/apis/memory_banks/memory_banks.py +0 -78
  593. llama_stack/apis/models/client.py +0 -83
  594. llama_stack/apis/reward_scoring/__init__.py +0 -7
  595. llama_stack/apis/reward_scoring/reward_scoring.py +0 -55
  596. llama_stack/apis/safety/client.py +0 -105
  597. llama_stack/apis/shields/client.py +0 -79
  598. llama_stack/cli/download.py +0 -340
  599. llama_stack/cli/model/describe.py +0 -82
  600. llama_stack/cli/model/download.py +0 -24
  601. llama_stack/cli/model/list.py +0 -62
  602. llama_stack/cli/model/model.py +0 -34
  603. llama_stack/cli/model/prompt_format.py +0 -112
  604. llama_stack/cli/model/safety_models.py +0 -52
  605. llama_stack/cli/stack/build.py +0 -299
  606. llama_stack/cli/stack/configure.py +0 -178
  607. llama_stack/distribution/build.py +0 -123
  608. llama_stack/distribution/build_conda_env.sh +0 -136
  609. llama_stack/distribution/build_container.sh +0 -142
  610. llama_stack/distribution/common.sh +0 -40
  611. llama_stack/distribution/configure_container.sh +0 -47
  612. llama_stack/distribution/datatypes.py +0 -139
  613. llama_stack/distribution/distribution.py +0 -58
  614. llama_stack/distribution/inspect.py +0 -67
  615. llama_stack/distribution/request_headers.py +0 -57
  616. llama_stack/distribution/resolver.py +0 -323
  617. llama_stack/distribution/routers/__init__.py +0 -48
  618. llama_stack/distribution/routers/routers.py +0 -158
  619. llama_stack/distribution/routers/routing_tables.py +0 -173
  620. llama_stack/distribution/server/endpoints.py +0 -48
  621. llama_stack/distribution/server/server.py +0 -343
  622. llama_stack/distribution/start_conda_env.sh +0 -42
  623. llama_stack/distribution/start_container.sh +0 -64
  624. llama_stack/distribution/templates/local-bedrock-conda-example-build.yaml +0 -10
  625. llama_stack/distribution/templates/local-build.yaml +0 -10
  626. llama_stack/distribution/templates/local-databricks-build.yaml +0 -10
  627. llama_stack/distribution/templates/local-fireworks-build.yaml +0 -10
  628. llama_stack/distribution/templates/local-hf-endpoint-build.yaml +0 -10
  629. llama_stack/distribution/templates/local-hf-serverless-build.yaml +0 -10
  630. llama_stack/distribution/templates/local-ollama-build.yaml +0 -10
  631. llama_stack/distribution/templates/local-tgi-build.yaml +0 -10
  632. llama_stack/distribution/templates/local-together-build.yaml +0 -10
  633. llama_stack/distribution/templates/local-vllm-build.yaml +0 -10
  634. llama_stack/distribution/utils/exec.py +0 -105
  635. llama_stack/providers/adapters/agents/sample/sample.py +0 -18
  636. llama_stack/providers/adapters/inference/bedrock/bedrock.py +0 -451
  637. llama_stack/providers/adapters/inference/bedrock/config.py +0 -55
  638. llama_stack/providers/adapters/inference/databricks/config.py +0 -21
  639. llama_stack/providers/adapters/inference/databricks/databricks.py +0 -125
  640. llama_stack/providers/adapters/inference/fireworks/config.py +0 -20
  641. llama_stack/providers/adapters/inference/fireworks/fireworks.py +0 -130
  642. llama_stack/providers/adapters/inference/ollama/__init__.py +0 -19
  643. llama_stack/providers/adapters/inference/ollama/ollama.py +0 -175
  644. llama_stack/providers/adapters/inference/sample/sample.py +0 -23
  645. llama_stack/providers/adapters/inference/tgi/config.py +0 -43
  646. llama_stack/providers/adapters/inference/tgi/tgi.py +0 -200
  647. llama_stack/providers/adapters/inference/together/config.py +0 -22
  648. llama_stack/providers/adapters/inference/together/together.py +0 -143
  649. llama_stack/providers/adapters/memory/chroma/chroma.py +0 -157
  650. llama_stack/providers/adapters/memory/pgvector/config.py +0 -17
  651. llama_stack/providers/adapters/memory/pgvector/pgvector.py +0 -211
  652. llama_stack/providers/adapters/memory/sample/sample.py +0 -23
  653. llama_stack/providers/adapters/memory/weaviate/__init__.py +0 -15
  654. llama_stack/providers/adapters/memory/weaviate/weaviate.py +0 -190
  655. llama_stack/providers/adapters/safety/bedrock/bedrock.py +0 -113
  656. llama_stack/providers/adapters/safety/bedrock/config.py +0 -16
  657. llama_stack/providers/adapters/safety/sample/sample.py +0 -23
  658. llama_stack/providers/adapters/safety/together/__init__.py +0 -18
  659. llama_stack/providers/adapters/safety/together/config.py +0 -26
  660. llama_stack/providers/adapters/safety/together/together.py +0 -101
  661. llama_stack/providers/adapters/telemetry/opentelemetry/config.py +0 -12
  662. llama_stack/providers/adapters/telemetry/opentelemetry/opentelemetry.py +0 -201
  663. llama_stack/providers/adapters/telemetry/sample/__init__.py +0 -17
  664. llama_stack/providers/adapters/telemetry/sample/config.py +0 -12
  665. llama_stack/providers/adapters/telemetry/sample/sample.py +0 -18
  666. llama_stack/providers/impls/meta_reference/agents/agent_instance.py +0 -844
  667. llama_stack/providers/impls/meta_reference/agents/agents.py +0 -161
  668. llama_stack/providers/impls/meta_reference/agents/persistence.py +0 -84
  669. llama_stack/providers/impls/meta_reference/agents/rag/context_retriever.py +0 -74
  670. llama_stack/providers/impls/meta_reference/agents/safety.py +0 -57
  671. llama_stack/providers/impls/meta_reference/agents/tests/code_execution.py +0 -93
  672. llama_stack/providers/impls/meta_reference/agents/tests/test_chat_agent.py +0 -305
  673. llama_stack/providers/impls/meta_reference/agents/tools/base.py +0 -20
  674. llama_stack/providers/impls/meta_reference/agents/tools/builtin.py +0 -375
  675. llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/code_env_prefix.py +0 -133
  676. llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/code_execution.py +0 -256
  677. llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/matplotlib_custom_backend.py +0 -87
  678. llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/utils.py +0 -21
  679. llama_stack/providers/impls/meta_reference/agents/tools/safety.py +0 -43
  680. llama_stack/providers/impls/meta_reference/codeshield/code_scanner.py +0 -58
  681. llama_stack/providers/impls/meta_reference/inference/config.py +0 -45
  682. llama_stack/providers/impls/meta_reference/inference/generation.py +0 -376
  683. llama_stack/providers/impls/meta_reference/inference/inference.py +0 -280
  684. llama_stack/providers/impls/meta_reference/inference/model_parallel.py +0 -99
  685. llama_stack/providers/impls/meta_reference/inference/quantization/fp8_impls.py +0 -184
  686. llama_stack/providers/impls/meta_reference/inference/quantization/fp8_txest_disabled.py +0 -76
  687. llama_stack/providers/impls/meta_reference/inference/quantization/loader.py +0 -97
  688. llama_stack/providers/impls/meta_reference/inference/quantization/scripts/quantize_checkpoint.py +0 -161
  689. llama_stack/providers/impls/meta_reference/memory/__init__.py +0 -19
  690. llama_stack/providers/impls/meta_reference/memory/faiss.py +0 -113
  691. llama_stack/providers/impls/meta_reference/safety/__init__.py +0 -17
  692. llama_stack/providers/impls/meta_reference/safety/base.py +0 -57
  693. llama_stack/providers/impls/meta_reference/safety/config.py +0 -48
  694. llama_stack/providers/impls/meta_reference/safety/llama_guard.py +0 -268
  695. llama_stack/providers/impls/meta_reference/safety/prompt_guard.py +0 -145
  696. llama_stack/providers/impls/meta_reference/safety/safety.py +0 -112
  697. llama_stack/providers/impls/meta_reference/telemetry/console.py +0 -89
  698. llama_stack/providers/impls/vllm/config.py +0 -35
  699. llama_stack/providers/impls/vllm/vllm.py +0 -241
  700. llama_stack/providers/registry/memory.py +0 -78
  701. llama_stack/providers/registry/telemetry.py +0 -44
  702. llama_stack/providers/tests/agents/test_agents.py +0 -210
  703. llama_stack/providers/tests/inference/test_inference.py +0 -257
  704. llama_stack/providers/tests/inference/test_prompt_adapter.py +0 -126
  705. llama_stack/providers/tests/memory/test_memory.py +0 -136
  706. llama_stack/providers/tests/resolver.py +0 -100
  707. llama_stack/providers/tests/safety/test_safety.py +0 -77
  708. llama_stack-0.0.42.dist-info/METADATA +0 -137
  709. llama_stack-0.0.42.dist-info/RECORD +0 -256
  710. /llama_stack/{distribution → core}/__init__.py +0 -0
  711. /llama_stack/{distribution/server → core/access_control}/__init__.py +0 -0
  712. /llama_stack/{distribution/utils → core/conversations}/__init__.py +0 -0
  713. /llama_stack/{providers/adapters → core/prompts}/__init__.py +0 -0
  714. /llama_stack/{providers/adapters/agents → core/routing_tables}/__init__.py +0 -0
  715. /llama_stack/{providers/adapters/inference → core/server}/__init__.py +0 -0
  716. /llama_stack/{providers/adapters/memory → core/storage}/__init__.py +0 -0
  717. /llama_stack/{providers/adapters/safety → core/ui}/__init__.py +0 -0
  718. /llama_stack/{providers/adapters/telemetry → core/ui/modules}/__init__.py +0 -0
  719. /llama_stack/{providers/impls → core/ui/page}/__init__.py +0 -0
  720. /llama_stack/{providers/impls/meta_reference → core/ui/page/distribution}/__init__.py +0 -0
  721. /llama_stack/{providers/impls/meta_reference/agents/rag → core/ui/page/evaluations}/__init__.py +0 -0
  722. /llama_stack/{providers/impls/meta_reference/agents/tests → core/ui/page/playground}/__init__.py +0 -0
  723. /llama_stack/{providers/impls/meta_reference/agents/tools → core/utils}/__init__.py +0 -0
  724. /llama_stack/{distribution → core}/utils/dynamic.py +0 -0
  725. /llama_stack/{distribution → core}/utils/serialize.py +0 -0
  726. /llama_stack/{providers/impls/meta_reference/agents/tools/ipython_tool → distributions}/__init__.py +0 -0
  727. /llama_stack/{providers/impls/meta_reference/inference/quantization → models}/__init__.py +0 -0
  728. /llama_stack/{providers/impls/meta_reference/inference/quantization/scripts → models/llama}/__init__.py +0 -0
  729. /llama_stack/{providers/tests → models/llama/llama3}/__init__.py +0 -0
  730. /llama_stack/{providers/tests/agents → models/llama/llama3/quantization}/__init__.py +0 -0
  731. /llama_stack/{providers/tests/inference → models/llama/llama3_2}/__init__.py +0 -0
  732. /llama_stack/{providers/tests/memory → models/llama/llama3_3}/__init__.py +0 -0
  733. /llama_stack/{providers/tests/safety → models/llama/llama4}/__init__.py +0 -0
  734. /llama_stack/{scripts → models/llama/llama4/prompt_templates}/__init__.py +0 -0
  735. /llama_stack/providers/{adapters → remote}/safety/bedrock/__init__.py +0 -0
  736. {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/entry_points.txt +0 -0
  737. {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info/licenses}/LICENSE +0 -0
  738. {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/top_level.txt +0 -0
@@ -4,214 +4,1270 @@
4
4
  # This source code is licensed under the terms described in the LICENSE file in
5
5
  # the root directory of this source tree.
6
6
 
7
+ from collections.abc import AsyncIterator
7
8
  from enum import Enum
9
+ from typing import (
10
+ Annotated,
11
+ Any,
12
+ Literal,
13
+ Protocol,
14
+ runtime_checkable,
15
+ )
8
16
 
9
- from typing import List, Literal, Optional, Protocol, runtime_checkable, Union
17
+ from fastapi import Body
18
+ from pydantic import BaseModel, Field, field_validator
19
+ from typing_extensions import TypedDict
10
20
 
11
- from llama_models.schema_utils import json_schema_type, webmethod
21
+ from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent
22
+ from llama_stack.apis.common.responses import Order
23
+ from llama_stack.apis.models import Model
24
+ from llama_stack.apis.telemetry import MetricResponseMixin
25
+ from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
26
+ from llama_stack.models.llama.datatypes import (
27
+ BuiltinTool,
28
+ StopReason,
29
+ ToolCall,
30
+ ToolDefinition,
31
+ ToolPromptFormat,
32
+ )
33
+ from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
34
+ from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
12
35
 
13
- from pydantic import BaseModel, Field
14
- from typing_extensions import Annotated
36
+ register_schema(ToolCall)
37
+ register_schema(ToolDefinition)
15
38
 
16
- from llama_models.llama3.api.datatypes import * # noqa: F403
17
- from llama_stack.apis.models import * # noqa: F403
39
+ from enum import StrEnum
18
40
 
19
41
 
20
- class LogProbConfig(BaseModel):
21
- top_k: Optional[int] = 0
42
+ @json_schema_type
43
+ class GreedySamplingStrategy(BaseModel):
44
+ """Greedy sampling strategy that selects the highest probability token at each step.
45
+
46
+ :param type: Must be "greedy" to identify this sampling strategy
47
+ """
48
+
49
+ type: Literal["greedy"] = "greedy"
50
+
51
+
52
+ @json_schema_type
53
+ class TopPSamplingStrategy(BaseModel):
54
+ """Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
55
+
56
+ :param type: Must be "top_p" to identify this sampling strategy
57
+ :param temperature: Controls randomness in sampling. Higher values increase randomness
58
+ :param top_p: Cumulative probability threshold for nucleus sampling. Defaults to 0.95
59
+ """
60
+
61
+ type: Literal["top_p"] = "top_p"
62
+ temperature: float | None = Field(..., gt=0.0)
63
+ top_p: float | None = 0.95
22
64
 
23
65
 
24
66
  @json_schema_type
67
+ class TopKSamplingStrategy(BaseModel):
68
+ """Top-k sampling strategy that restricts sampling to the k most likely tokens.
69
+
70
+ :param type: Must be "top_k" to identify this sampling strategy
71
+ :param top_k: Number of top tokens to consider for sampling. Must be at least 1
72
+ """
73
+
74
+ type: Literal["top_k"] = "top_k"
75
+ top_k: int = Field(..., ge=1)
76
+
77
+
78
+ SamplingStrategy = Annotated[
79
+ GreedySamplingStrategy | TopPSamplingStrategy | TopKSamplingStrategy,
80
+ Field(discriminator="type"),
81
+ ]
82
+ register_schema(SamplingStrategy, name="SamplingStrategy")
83
+
84
+
85
+ @json_schema_type
86
+ class SamplingParams(BaseModel):
87
+ """Sampling parameters.
88
+
89
+ :param strategy: The sampling strategy.
90
+ :param max_tokens: The maximum number of tokens that can be generated in the completion. The token count of
91
+ your prompt plus max_tokens cannot exceed the model's context length.
92
+ :param repetition_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens
93
+ based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
94
+ :param stop: Up to 4 sequences where the API will stop generating further tokens.
95
+ The returned text will not contain the stop sequence.
96
+ """
97
+
98
+ strategy: SamplingStrategy = Field(default_factory=GreedySamplingStrategy)
99
+
100
+ max_tokens: int | None = 0
101
+ repetition_penalty: float | None = 1.0
102
+ stop: list[str] | None = None
103
+
104
+
105
+ class LogProbConfig(BaseModel):
106
+ """
107
+
108
+ :param top_k: How many tokens (for each position) to return log probabilities for.
109
+ """
110
+
111
+ top_k: int | None = 0
112
+
113
+
25
114
  class QuantizationType(Enum):
115
+ """Type of model quantization to run inference with.
116
+
117
+ :cvar bf16: BFloat16 typically this means _no_ quantization
118
+ :cvar fp8_mixed: 8-bit floating point quantization with mixed precision
119
+ :cvar int4_mixed: 4-bit integer quantization with mixed precision
120
+ """
121
+
26
122
  bf16 = "bf16"
27
- fp8 = "fp8"
123
+ fp8_mixed = "fp8_mixed"
124
+ int4_mixed = "int4_mixed"
28
125
 
29
126
 
30
127
  @json_schema_type
31
128
  class Fp8QuantizationConfig(BaseModel):
32
- type: Literal[QuantizationType.fp8.value] = QuantizationType.fp8.value
129
+ """Configuration for 8-bit floating point quantization.
130
+
131
+ :param type: Must be "fp8_mixed" to identify this quantization type
132
+ """
133
+
134
+ type: Literal["fp8_mixed"] = "fp8_mixed"
33
135
 
34
136
 
35
137
  @json_schema_type
36
138
  class Bf16QuantizationConfig(BaseModel):
37
- type: Literal[QuantizationType.bf16.value] = QuantizationType.bf16.value
139
+ """Configuration for BFloat16 precision (typically no quantization).
140
+
141
+ :param type: Must be "bf16" to identify this quantization type
142
+ """
143
+
144
+ type: Literal["bf16"] = "bf16"
145
+
146
+
147
+ @json_schema_type
148
+ class Int4QuantizationConfig(BaseModel):
149
+ """Configuration for 4-bit integer quantization.
150
+
151
+ :param type: Must be "int4_mixed" to identify this quantization type
152
+ :param scheme: Quantization scheme to use. Defaults to "int4_weight_int8_dynamic_activation"
153
+ """
154
+
155
+ type: Literal["int4_mixed"] = "int4_mixed"
156
+ scheme: str | None = "int4_weight_int8_dynamic_activation"
38
157
 
39
158
 
40
159
  QuantizationConfig = Annotated[
41
- Union[Bf16QuantizationConfig, Fp8QuantizationConfig],
160
+ Bf16QuantizationConfig | Fp8QuantizationConfig | Int4QuantizationConfig,
42
161
  Field(discriminator="type"),
43
162
  ]
44
163
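Because QuantizationConfig is discriminated on the "type" field, a provider can parse a quantization setting without knowing the variant up front. Illustrative sketch only, assuming the classes above are in scope:

    from pydantic import TypeAdapter

    cfg = TypeAdapter(QuantizationConfig).validate_python({"type": "int4_mixed"})
    assert isinstance(cfg, Int4QuantizationConfig)
    assert cfg.scheme == "int4_weight_int8_dynamic_activation"  # default scheme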
 
45
164
 
46
165
  @json_schema_type
166
+ class UserMessage(BaseModel):
167
+ """A message from the user in a chat conversation.
168
+
169
+ :param role: Must be "user" to identify this as a user message
170
+ :param content: The content of the message, which can include text and other media
171
+ :param context: (Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future.
172
+ """
173
+
174
+ role: Literal["user"] = "user"
175
+ content: InterleavedContent
176
+ context: InterleavedContent | None = None
177
+
178
+
179
+ @json_schema_type
180
+ class SystemMessage(BaseModel):
181
+ """A system message providing instructions or context to the model.
182
+
183
+ :param role: Must be "system" to identify this as a system message
184
+ :param content: The content of the "system prompt". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions).
185
+ """
186
+
187
+ role: Literal["system"] = "system"
188
+ content: InterleavedContent
189
+
190
+
191
+ @json_schema_type
192
+ class ToolResponseMessage(BaseModel):
193
+ """A message representing the result of a tool invocation.
194
+
195
+ :param role: Must be "tool" to identify this as a tool response
196
+ :param call_id: Unique identifier for the tool call this response is for
197
+ :param content: The response content from the tool
198
+ """
199
+
200
+ role: Literal["tool"] = "tool"
201
+ call_id: str
202
+ content: InterleavedContent
203
+
204
+
205
+ @json_schema_type
206
+ class CompletionMessage(BaseModel):
207
+ """A message containing the model's (assistant) response in a chat conversation.
208
+
209
+ :param role: Must be "assistant" to identify this as the model's response
210
+ :param content: The content of the model's response
211
+ :param stop_reason: Reason why the model stopped generating. Options are:
212
+ - `StopReason.end_of_turn`: The model finished generating the entire response.
213
+ - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response.
214
+ - `StopReason.out_of_tokens`: The model ran out of token budget.
215
+ :param tool_calls: List of tool calls. Each tool call is a ToolCall object.
216
+ """
217
+
218
+ role: Literal["assistant"] = "assistant"
219
+ content: InterleavedContent
220
+ stop_reason: StopReason
221
+ tool_calls: list[ToolCall] | None = Field(default_factory=lambda: [])
222
+
223
+
224
+ Message = Annotated[
225
+ UserMessage | SystemMessage | ToolResponseMessage | CompletionMessage,
226
+ Field(discriminator="role"),
227
+ ]
228
+ register_schema(Message, name="Message")
229
+
230
+
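With the role-discriminated Message union registered above, a chat history can freely mix the four message types. Illustrative sketch, assuming the message classes and StopReason (imported elsewhere in this module) are in scope:

    history: list[Message] = [
        SystemMessage(content="You are a terse assistant."),
        UserMessage(content="What is the capital of France?"),
        CompletionMessage(content="Paris.", stop_reason=StopReason.end_of_turn),
    ]
    # The serialized form carries the "role" discriminator for each entry:
    assert history[0].model_dump()["role"] == "system"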
231
+ @json_schema_type
232
+ class ToolResponse(BaseModel):
233
+ """Response from a tool invocation.
234
+
235
+ :param call_id: Unique identifier for the tool call this response is for
236
+ :param tool_name: Name of the tool that was invoked
237
+ :param content: The response content from the tool
238
+ :param metadata: (Optional) Additional metadata about the tool response
239
+ """
240
+
241
+ call_id: str
242
+ tool_name: BuiltinTool | str
243
+ content: InterleavedContent
244
+ metadata: dict[str, Any] | None = None
245
+
246
+ @field_validator("tool_name", mode="before")
247
+ @classmethod
248
+ def validate_field(cls, v):
249
+ if isinstance(v, str):
250
+ try:
251
+ return BuiltinTool(v)
252
+ except ValueError:
253
+ return v
254
+ return v
255
+
256
+
257
+ class ToolChoice(Enum):
258
+ """Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model.
259
+
260
+ :cvar auto: The model may use tools if it determines that is appropriate.
261
+ :cvar required: The model must use tools.
262
+ :cvar none: The model must not use tools.
263
+ """
264
+
265
+ auto = "auto"
266
+ required = "required"
267
+ none = "none"
268
+
269
+
270
+ @json_schema_type
271
+ class TokenLogProbs(BaseModel):
272
+ """Log probabilities for generated tokens.
273
+
274
+ :param logprobs_by_token: Dictionary mapping tokens to their log probabilities
275
+ """
276
+
277
+ logprobs_by_token: dict[str, float]
278
+
279
+
47
280
  class ChatCompletionResponseEventType(Enum):
281
+ """Types of events that can occur during chat completion.
282
+
283
+ :cvar start: Inference has started
284
+ :cvar complete: Inference is complete and a full response is available
285
+ :cvar progress: Inference is in progress and a partial response is available
286
+ """
287
+
48
288
  start = "start"
49
289
  complete = "complete"
50
290
  progress = "progress"
51
291
 
52
292
 
53
293
  @json_schema_type
54
- class ToolCallParseStatus(Enum):
55
- started = "started"
56
- in_progress = "in_progress"
57
- failure = "failure"
58
- success = "success"
294
+ class ChatCompletionResponseEvent(BaseModel):
295
+ """An event during chat completion generation.
59
296
 
297
+ :param event_type: Type of the event
298
+ :param delta: Content generated since last event. This can be one or more tokens, or a tool call.
299
+ :param logprobs: Optional log probabilities for generated tokens
300
+ :param stop_reason: Optional reason why generation stopped, if complete
301
+ """
60
302
 
61
- @json_schema_type
62
- class ToolCallDelta(BaseModel):
63
- content: Union[str, ToolCall]
64
- parse_status: ToolCallParseStatus
303
+ event_type: ChatCompletionResponseEventType
304
+ delta: ContentDelta
305
+ logprobs: list[TokenLogProbs] | None = None
306
+ stop_reason: StopReason | None = None
307
+
308
+
309
+ class ResponseFormatType(StrEnum):
310
+ """Types of formats for structured (guided) decoding.
311
+
312
+ :cvar json_schema: Response should conform to a JSON schema. In a Python SDK, this is often a `pydantic` model.
313
+ :cvar grammar: Response should conform to a BNF grammar
314
+ """
315
+
316
+ json_schema = "json_schema"
317
+ grammar = "grammar"
65
318
 
66
319
 
67
320
  @json_schema_type
68
- class ChatCompletionResponseEvent(BaseModel):
69
- """Chat completion response event."""
321
+ class JsonSchemaResponseFormat(BaseModel):
322
+ """Configuration for JSON schema-guided response generation.
70
323
 
71
- event_type: ChatCompletionResponseEventType
72
- delta: Union[str, ToolCallDelta]
73
- logprobs: Optional[List[TokenLogProbs]] = None
74
- stop_reason: Optional[StopReason] = None
324
+ :param type: Must be "json_schema" to identify this format type
325
+ :param json_schema: The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model.
326
+ """
327
+
328
+ type: Literal[ResponseFormatType.json_schema] = ResponseFormatType.json_schema
329
+ json_schema: dict[str, Any]
75
330
 
76
331
 
77
332
  @json_schema_type
333
+ class GrammarResponseFormat(BaseModel):
334
+ """Configuration for grammar-guided response generation.
335
+
336
+ :param type: Must be "grammar" to identify this format type
337
+ :param bnf: The BNF grammar specification the response should conform to
338
+ """
339
+
340
+ type: Literal[ResponseFormatType.grammar] = ResponseFormatType.grammar
341
+ bnf: dict[str, Any]
342
+
343
+
344
+ ResponseFormat = Annotated[
345
+ JsonSchemaResponseFormat | GrammarResponseFormat,
346
+ Field(discriminator="type"),
347
+ ]
348
+ register_schema(ResponseFormat, name="ResponseFormat")
349
+
350
+
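A structured-decoding request attaches one of the formats above; the "type" discriminator selects the variant when the payload is deserialized. Illustrative sketch, assuming JsonSchemaResponseFormat from above is in scope:

    fmt = JsonSchemaResponseFormat(
        json_schema={
            "type": "object",
            "properties": {"answer": {"type": "string"}},
            "required": ["answer"],
        }
    )
    print(fmt.model_dump(mode="json"))  # {'type': 'json_schema', 'json_schema': {...}}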
351
+ # This is an internally used class
78
352
  class CompletionRequest(BaseModel):
79
353
  model: str
80
- content: InterleavedTextMedia
81
- sampling_params: Optional[SamplingParams] = SamplingParams()
82
-
83
- stream: Optional[bool] = False
84
- logprobs: Optional[LogProbConfig] = None
354
+ content: InterleavedContent
355
+ sampling_params: SamplingParams | None = Field(default_factory=SamplingParams)
356
+ response_format: ResponseFormat | None = None
357
+ stream: bool | None = False
358
+ logprobs: LogProbConfig | None = None
85
359
 
86
360
 
87
361
  @json_schema_type
88
- class CompletionResponse(BaseModel):
89
- """Completion response."""
362
+ class CompletionResponse(MetricResponseMixin):
363
+ """Response from a completion request.
90
364
 
91
- completion_message: CompletionMessage
92
- logprobs: Optional[List[TokenLogProbs]] = None
365
+ :param content: The generated completion text
366
+ :param stop_reason: Reason why generation stopped
367
+ :param logprobs: Optional log probabilities for generated tokens
368
+ """
369
+
370
+ content: str
371
+ stop_reason: StopReason
372
+ logprobs: list[TokenLogProbs] | None = None
93
373
 
94
374
 
95
375
  @json_schema_type
96
- class CompletionResponseStreamChunk(BaseModel):
97
- """streamed completion response."""
376
+ class CompletionResponseStreamChunk(MetricResponseMixin):
377
+ """A chunk of a streamed completion response.
378
+
379
+ :param delta: New content generated since last chunk. This can be one or more tokens.
380
+ :param stop_reason: Optional reason why generation stopped, if complete
381
+ :param logprobs: Optional log probabilities for generated tokens
382
+ """
98
383
 
99
384
  delta: str
100
- stop_reason: Optional[StopReason] = None
101
- logprobs: Optional[List[TokenLogProbs]] = None
385
+ stop_reason: StopReason | None = None
386
+ logprobs: list[TokenLogProbs] | None = None
102
387
 
103
388
 
104
- @json_schema_type
105
- class BatchCompletionRequest(BaseModel):
106
- model: str
107
- content_batch: List[InterleavedTextMedia]
108
- sampling_params: Optional[SamplingParams] = SamplingParams()
109
- logprobs: Optional[LogProbConfig] = None
389
+ class SystemMessageBehavior(Enum):
390
+ """Config for how to override the default system prompt.
391
+
392
+ :cvar append: Appends the provided system message to the default system prompt:
393
+ https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2/#-function-definitions-in-the-system-prompt-
394
+ :cvar replace: Replaces the default system prompt with the provided system message. The system message can include the string
395
+ '{{function_definitions}}' to indicate where the function definitions should be inserted.
396
+ """
397
+
398
+ append = "append"
399
+ replace = "replace"
110
400
 
111
401
 
112
402
  @json_schema_type
113
- class BatchCompletionResponse(BaseModel):
114
- """Batch completion response."""
403
+ class ToolConfig(BaseModel):
404
+ """Configuration for tool use.
405
+
406
+ :param tool_choice: (Optional) Whether tool use is automatic, required, or none. Can also specify a tool name to use a specific tool. Defaults to ToolChoice.auto.
407
+ :param tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model.
408
+ - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
409
+ - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag.
410
+ - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls.
411
+ :param system_message_behavior: (Optional) Config for how to override the default system prompt.
412
+ - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt.
413
+ - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string
414
+ '{{function_definitions}}' to indicate where the function definitions should be inserted.
415
+ """
416
+
417
+ tool_choice: ToolChoice | str | None = Field(default=ToolChoice.auto)
418
+ tool_prompt_format: ToolPromptFormat | None = Field(default=None)
419
+ system_message_behavior: SystemMessageBehavior | None = Field(default=SystemMessageBehavior.append)
115
420
 
116
- completion_message_batch: List[CompletionMessage]
421
+ def model_post_init(self, __context: Any) -> None:
422
+ if isinstance(self.tool_choice, str):
423
+ try:
424
+ self.tool_choice = ToolChoice[self.tool_choice]
425
+ except KeyError:
426
+ pass
117
427
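The model_post_init hook above lets tool_choice accept either a ToolChoice value or the name of a specific tool: known enum names are coerced, anything else is kept as a plain string. Illustrative sketch, assuming ToolConfig and ToolChoice are in scope ("get_weather" is a hypothetical tool name):

    cfg = ToolConfig(tool_choice="required")
    assert cfg.tool_choice is ToolChoice.required   # coerced to the enum

    named = ToolConfig(tool_choice="get_weather")
    assert named.tool_choice == "get_weather"       # left as a specific tool name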
 
118
428
 
429
+ # This is an internally used class
119
430
  @json_schema_type
120
431
  class ChatCompletionRequest(BaseModel):
121
432
  model: str
122
- messages: List[Message]
123
- sampling_params: Optional[SamplingParams] = SamplingParams()
124
-
125
- # zero-shot tool definitions as input to the model
126
- tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
127
- tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
128
- tool_prompt_format: Optional[ToolPromptFormat] = Field(
129
- default=ToolPromptFormat.json
130
- )
433
+ messages: list[Message]
434
+ sampling_params: SamplingParams | None = Field(default_factory=SamplingParams)
435
+
436
+ tools: list[ToolDefinition] | None = Field(default_factory=lambda: [])
437
+ tool_config: ToolConfig | None = Field(default_factory=ToolConfig)
131
438
 
132
- stream: Optional[bool] = False
133
- logprobs: Optional[LogProbConfig] = None
439
+ response_format: ResponseFormat | None = None
440
+ stream: bool | None = False
441
+ logprobs: LogProbConfig | None = None
134
442
 
135
443
 
136
444
  @json_schema_type
137
- class ChatCompletionResponseStreamChunk(BaseModel):
138
- """SSE-stream of these events."""
445
+ class ChatCompletionResponseStreamChunk(MetricResponseMixin):
446
+ """A chunk of a streamed chat completion response.
447
+
448
+ :param event: The event containing the new content
449
+ """
139
450
 
140
451
  event: ChatCompletionResponseEvent
141
452
 
142
453
 
143
454
  @json_schema_type
144
- class ChatCompletionResponse(BaseModel):
145
- """Chat completion response."""
455
+ class ChatCompletionResponse(MetricResponseMixin):
456
+ """Response from a chat completion request.
457
+
458
+ :param completion_message: The complete response message
459
+ :param logprobs: Optional log probabilities for generated tokens
460
+ """
146
461
 
147
462
  completion_message: CompletionMessage
148
- logprobs: Optional[List[TokenLogProbs]] = None
463
+ logprobs: list[TokenLogProbs] | None = None
464
+
465
+
466
+ @json_schema_type
467
+ class EmbeddingsResponse(BaseModel):
468
+ """Response containing generated embeddings.
469
+
470
+ :param embeddings: List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
471
+ """
472
+
473
+ embeddings: list[list[float]]
474
+
475
+
476
+ @json_schema_type
477
+ class RerankData(BaseModel):
478
+ """A single rerank result from a reranking response.
479
+
480
+ :param index: The original index of the document in the input list
481
+ :param relevance_score: The relevance score from the model output. Values are inverted when applicable so that higher scores indicate greater relevance.
482
+ """
483
+
484
+ index: int
485
+ relevance_score: float
486
+
487
+
488
+ @json_schema_type
489
+ class RerankResponse(BaseModel):
490
+ """Response from a reranking request.
491
+
492
+ :param data: List of rerank result objects, sorted by relevance score (descending)
493
+ """
494
+
495
+ data: list[RerankData]
496
+
497
+
498
+ @json_schema_type
499
+ class OpenAIChatCompletionContentPartTextParam(BaseModel):
500
+ """Text content part for OpenAI-compatible chat completion messages.
501
+
502
+ :param type: Must be "text" to identify this as text content
503
+ :param text: The text content of the message
504
+ """
505
+
506
+ type: Literal["text"] = "text"
507
+ text: str
508
+
509
+
510
+ @json_schema_type
511
+ class OpenAIImageURL(BaseModel):
512
+ """Image URL specification for OpenAI-compatible chat completion messages.
513
+
514
+ :param url: URL of the image to include in the message
515
+ :param detail: (Optional) Level of detail for image processing. Can be "low", "high", or "auto"
516
+ """
517
+
518
+ url: str
519
+ detail: str | None = None
520
+
521
+
522
+ @json_schema_type
523
+ class OpenAIChatCompletionContentPartImageParam(BaseModel):
524
+ """Image content part for OpenAI-compatible chat completion messages.
525
+
526
+ :param type: Must be "image_url" to identify this as image content
527
+ :param image_url: Image URL specification and processing details
528
+ """
529
+
530
+ type: Literal["image_url"] = "image_url"
531
+ image_url: OpenAIImageURL
532
+
533
+
534
+ @json_schema_type
535
+ class OpenAIFileFile(BaseModel):
536
+ file_data: str | None = None
537
+ file_id: str | None = None
538
+ filename: str | None = None
539
+
540
+
541
+ @json_schema_type
542
+ class OpenAIFile(BaseModel):
543
+ type: Literal["file"] = "file"
544
+ file: OpenAIFileFile
545
+
546
+
547
+ OpenAIChatCompletionContentPartParam = Annotated[
548
+ OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam | OpenAIFile,
549
+ Field(discriminator="type"),
550
+ ]
551
+ register_schema(OpenAIChatCompletionContentPartParam, name="OpenAIChatCompletionContentPartParam")
552
+
553
+
554
+ OpenAIChatCompletionMessageContent = str | list[OpenAIChatCompletionContentPartParam]
555
+
556
+ OpenAIChatCompletionTextOnlyMessageContent = str | list[OpenAIChatCompletionContentPartTextParam]
557
+
558
+
559
+ @json_schema_type
560
+ class OpenAIUserMessageParam(BaseModel):
561
+ """A message from the user in an OpenAI-compatible chat completion request.
562
+
563
+ :param role: Must be "user" to identify this as a user message
564
+ :param content: The content of the message, which can include text and other media
565
+ :param name: (Optional) The name of the user message participant.
566
+ """
567
+
568
+ role: Literal["user"] = "user"
569
+ content: OpenAIChatCompletionMessageContent
570
+ name: str | None = None
571
+
572
+
573
+ @json_schema_type
574
+ class OpenAISystemMessageParam(BaseModel):
575
+ """A system message providing instructions or context to the model.
576
+
577
+ :param role: Must be "system" to identify this as a system message
578
+ :param content: The content of the "system prompt". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions).
579
+ :param name: (Optional) The name of the system message participant.
580
+ """
581
+
582
+ role: Literal["system"] = "system"
583
+ content: OpenAIChatCompletionTextOnlyMessageContent
584
+ name: str | None = None
585
+
586
+
587
+ @json_schema_type
588
+ class OpenAIChatCompletionToolCallFunction(BaseModel):
589
+ """Function call details for OpenAI-compatible tool calls.
590
+
591
+ :param name: (Optional) Name of the function to call
592
+ :param arguments: (Optional) Arguments to pass to the function as a JSON string
593
+ """
594
+
595
+ name: str | None = None
596
+ arguments: str | None = None
149
597
 
150
598
 
151
599
  @json_schema_type
152
- class BatchChatCompletionRequest(BaseModel):
600
+ class OpenAIChatCompletionToolCall(BaseModel):
601
+ """Tool call specification for OpenAI-compatible chat completion responses.
602
+
603
+ :param index: (Optional) Index of the tool call in the list
604
+ :param id: (Optional) Unique identifier for the tool call
605
+ :param type: Must be "function" to identify this as a function call
606
+ :param function: (Optional) Function call details
607
+ """
608
+
609
+ index: int | None = None
610
+ id: str | None = None
611
+ type: Literal["function"] = "function"
612
+ function: OpenAIChatCompletionToolCallFunction | None = None
613
+
614
+
615
+ @json_schema_type
616
+ class OpenAIAssistantMessageParam(BaseModel):
617
+ """A message containing the model's (assistant) response in an OpenAI-compatible chat completion request.
618
+
619
+ :param role: Must be "assistant" to identify this as the model's response
620
+ :param content: The content of the model's response
621
+ :param name: (Optional) The name of the assistant message participant.
622
+ :param tool_calls: List of tool calls. Each tool call is an OpenAIChatCompletionToolCall object.
623
+ """
624
+
625
+ role: Literal["assistant"] = "assistant"
626
+ content: OpenAIChatCompletionTextOnlyMessageContent | None = None
627
+ name: str | None = None
628
+ tool_calls: list[OpenAIChatCompletionToolCall] | None = None
629
+
630
+
631
+ @json_schema_type
632
+ class OpenAIToolMessageParam(BaseModel):
633
+ """A message representing the result of a tool invocation in an OpenAI-compatible chat completion request.
634
+
635
+ :param role: Must be "tool" to identify this as a tool response
636
+ :param tool_call_id: Unique identifier for the tool call this response is for
637
+ :param content: The response content from the tool
638
+ """
639
+
640
+ role: Literal["tool"] = "tool"
641
+ tool_call_id: str
642
+ content: OpenAIChatCompletionTextOnlyMessageContent
643
+
644
+
645
+ @json_schema_type
646
+ class OpenAIDeveloperMessageParam(BaseModel):
647
+ """A message from the developer in an OpenAI-compatible chat completion request.
648
+
649
+ :param role: Must be "developer" to identify this as a developer message
650
+ :param content: The content of the developer message
651
+ :param name: (Optional) The name of the developer message participant.
652
+ """
653
+
654
+ role: Literal["developer"] = "developer"
655
+ content: OpenAIChatCompletionTextOnlyMessageContent
656
+ name: str | None = None
657
+
658
+
659
+ OpenAIMessageParam = Annotated[
660
+ OpenAIUserMessageParam
661
+ | OpenAISystemMessageParam
662
+ | OpenAIAssistantMessageParam
663
+ | OpenAIToolMessageParam
664
+ | OpenAIDeveloperMessageParam,
665
+ Field(discriminator="role"),
666
+ ]
667
+ register_schema(OpenAIMessageParam, name="OpenAIMessageParam")
668
+
669
+
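The role-discriminated OpenAIMessageParam union registered above covers the full OpenAI-style conversation shape, including multimodal user content and tool round trips. Illustrative sketch, with the classes above assumed in scope; the URL and tool name are hypothetical:

    messages: list[OpenAIMessageParam] = [
        OpenAISystemMessageParam(content="You are a helpful assistant."),
        OpenAIUserMessageParam(
            content=[
                OpenAIChatCompletionContentPartTextParam(text="What is in this image?"),
                OpenAIChatCompletionContentPartImageParam(
                    image_url=OpenAIImageURL(url="https://example.com/cat.png", detail="low")
                ),
            ]
        ),
        OpenAIAssistantMessageParam(
            tool_calls=[
                OpenAIChatCompletionToolCall(
                    id="call_1",
                    function=OpenAIChatCompletionToolCallFunction(name="describe_image", arguments="{}"),
                )
            ]
        ),
        OpenAIToolMessageParam(tool_call_id="call_1", content="A cat sitting on a sofa."),
    ]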
670
+ @json_schema_type
671
+ class OpenAIResponseFormatText(BaseModel):
672
+ """Text response format for OpenAI-compatible chat completion requests.
673
+
674
+ :param type: Must be "text" to indicate plain text response format
675
+ """
676
+
677
+ type: Literal["text"] = "text"
678
+
679
+
680
+ @json_schema_type
681
+ class OpenAIJSONSchema(TypedDict, total=False):
682
+ """JSON schema specification for OpenAI-compatible structured response format.
683
+
684
+ :param name: Name of the schema
685
+ :param description: (Optional) Description of the schema
686
+ :param strict: (Optional) Whether to enforce strict adherence to the schema
687
+ :param schema: (Optional) The JSON schema definition
688
+ """
689
+
690
+ name: str
691
+ description: str | None
692
+ strict: bool | None
693
+
694
+ # Pydantic BaseModel cannot be used with a schema param, since it already
695
+ # has one. And, we don't want to alias here because we would then have to handle
696
+ # that alias when converting to OpenAI params. So, to support schema,
697
+ # we use a TypedDict.
698
+ schema: dict[str, Any] | None
699
+
700
+
701
+ @json_schema_type
702
+ class OpenAIResponseFormatJSONSchema(BaseModel):
703
+ """JSON schema response format for OpenAI-compatible chat completion requests.
704
+
705
+ :param type: Must be "json_schema" to indicate structured JSON response format
706
+ :param json_schema: The JSON schema specification for the response
707
+ """
708
+
709
+ type: Literal["json_schema"] = "json_schema"
710
+ json_schema: OpenAIJSONSchema
711
+
712
+
713
+ @json_schema_type
714
+ class OpenAIResponseFormatJSONObject(BaseModel):
715
+ """JSON object response format for OpenAI-compatible chat completion requests.
716
+
717
+ :param type: Must be "json_object" to indicate generic JSON object response format
718
+ """
719
+
720
+ type: Literal["json_object"] = "json_object"
721
+
722
+
723
+ OpenAIResponseFormatParam = Annotated[
724
+ OpenAIResponseFormatText | OpenAIResponseFormatJSONSchema | OpenAIResponseFormatJSONObject,
725
+ Field(discriminator="type"),
726
+ ]
727
+ register_schema(OpenAIResponseFormatParam, name="OpenAIResponseFormatParam")
728
+
729
+
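Because OpenAIJSONSchema is a TypedDict rather than a BaseModel, the "schema" key can be passed through as-is. Illustrative sketch of a structured-output response format, with the classes above assumed in scope:

    response_format = OpenAIResponseFormatJSONSchema(
        json_schema=OpenAIJSONSchema(
            name="city_answer",
            strict=True,
            schema={
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        )
    )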
730
+ @json_schema_type
731
+ class OpenAITopLogProb(BaseModel):
732
+ """The top log probability for a token from an OpenAI-compatible chat completion response.
733
+
734
+ :param token: The token
+ :param bytes: (Optional) The bytes for the token
+ :param logprob: The log probability of the token
737
+ """
738
+
739
+ token: str
740
+ bytes: list[int] | None = None
741
+ logprob: float
742
+
743
+
744
+ @json_schema_type
745
+ class OpenAITokenLogProb(BaseModel):
746
+ """The log probability for a token from an OpenAI-compatible chat completion response.
747
+
748
+ :param token: The token
+ :param bytes: (Optional) The bytes for the token
+ :param logprob: The log probability of the token
+ :param top_logprobs: The top log probabilities for the token
752
+ """
753
+
754
+ token: str
755
+ bytes: list[int] | None = None
756
+ logprob: float
757
+ top_logprobs: list[OpenAITopLogProb]
758
+
759
+
760
+ @json_schema_type
761
+ class OpenAIChoiceLogprobs(BaseModel):
762
+ """The log probabilities for the tokens in the message from an OpenAI-compatible chat completion response.
763
+
764
+ :param content: (Optional) The log probabilities for the tokens in the message
765
+ :param refusal: (Optional) The log probabilities for the tokens in the refusal message
766
+ """
767
+
768
+ content: list[OpenAITokenLogProb] | None = None
769
+ refusal: list[OpenAITokenLogProb] | None = None
770
+
771
+
772
+ @json_schema_type
773
+ class OpenAIChoiceDelta(BaseModel):
774
+ """A delta from an OpenAI-compatible chat completion streaming response.
775
+
776
+ :param content: (Optional) The content of the delta
777
+ :param refusal: (Optional) The refusal of the delta
778
+ :param role: (Optional) The role of the delta
779
+ :param tool_calls: (Optional) The tool calls of the delta
780
+ :param reasoning_content: (Optional) The reasoning content from the model (non-standard, for o1/o3 models)
781
+ """
782
+
783
+ content: str | None = None
784
+ refusal: str | None = None
785
+ role: str | None = None
786
+ tool_calls: list[OpenAIChatCompletionToolCall] | None = None
787
+ reasoning_content: str | None = None
788
+
789
+
790
+ @json_schema_type
791
+ class OpenAIChunkChoice(BaseModel):
792
+ """A chunk choice from an OpenAI-compatible chat completion streaming response.
793
+
794
+ :param delta: The delta from the chunk
795
+ :param finish_reason: The reason the model stopped generating
796
+ :param index: The index of the choice
797
+ :param logprobs: (Optional) The log probabilities for the tokens in the message
798
+ """
799
+
800
+ delta: OpenAIChoiceDelta
801
+ finish_reason: str
802
+ index: int
803
+ logprobs: OpenAIChoiceLogprobs | None = None
804
+
805
+
806
+ @json_schema_type
807
+ class OpenAIChoice(BaseModel):
808
+ """A choice from an OpenAI-compatible chat completion response.
809
+
810
+ :param message: The message from the model
811
+ :param finish_reason: The reason the model stopped generating
812
+ :param index: The index of the choice
813
+ :param logprobs: (Optional) The log probabilities for the tokens in the message
814
+ """
815
+
816
+ message: OpenAIMessageParam
817
+ finish_reason: str
818
+ index: int
819
+ logprobs: OpenAIChoiceLogprobs | None = None
820
+
821
+
822
+ class OpenAIChatCompletionUsageCompletionTokensDetails(BaseModel):
823
+ """Token details for output tokens in OpenAI chat completion usage.
824
+
825
+ :param reasoning_tokens: Number of tokens used for reasoning (o1/o3 models)
826
+ """
827
+
828
+ reasoning_tokens: int | None = None
829
+
830
+
831
+ class OpenAIChatCompletionUsagePromptTokensDetails(BaseModel):
832
+ """Token details for prompt tokens in OpenAI chat completion usage.
833
+
834
+ :param cached_tokens: Number of tokens retrieved from cache
835
+ """
836
+
837
+ cached_tokens: int | None = None
838
+
839
+
840
+ @json_schema_type
841
+ class OpenAIChatCompletionUsage(BaseModel):
842
+ """Usage information for OpenAI chat completion.
843
+
844
+ :param prompt_tokens: Number of tokens in the prompt
845
+ :param completion_tokens: Number of tokens in the completion
846
+ :param total_tokens: Total tokens used (prompt + completion)
847
+ :param prompt_tokens_details: Detailed breakdown of prompt token usage
+ :param completion_tokens_details: Detailed breakdown of completion token usage
849
+ """
850
+
851
+ prompt_tokens: int
852
+ completion_tokens: int
853
+ total_tokens: int
854
+ prompt_tokens_details: OpenAIChatCompletionUsagePromptTokensDetails | None = None
855
+ completion_tokens_details: OpenAIChatCompletionUsageCompletionTokensDetails | None = None
856
+
857
+
858
+ @json_schema_type
859
+ class OpenAIChatCompletion(BaseModel):
860
+ """Response from an OpenAI-compatible chat completion request.
861
+
862
+ :param id: The ID of the chat completion
863
+ :param choices: List of choices
864
+ :param object: The object type, which will be "chat.completion"
865
+ :param created: The Unix timestamp in seconds when the chat completion was created
866
+ :param model: The model that was used to generate the chat completion
867
+ :param usage: Token usage information for the completion
868
+ """
869
+
870
+ id: str
871
+ choices: list[OpenAIChoice]
872
+ object: Literal["chat.completion"] = "chat.completion"
873
+ created: int
153
874
  model: str
154
- messages_batch: List[List[Message]]
155
- sampling_params: Optional[SamplingParams] = SamplingParams()
156
-
157
- # zero-shot tool definitions as input to the model
158
- tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
159
- tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
160
- tool_prompt_format: Optional[ToolPromptFormat] = Field(
161
- default=ToolPromptFormat.json
162
- )
163
- logprobs: Optional[LogProbConfig] = None
875
+ usage: OpenAIChatCompletionUsage | None = None
876
+
877
+
878
+ @json_schema_type
879
+ class OpenAIChatCompletionChunk(BaseModel):
880
+ """Chunk from a streaming response to an OpenAI-compatible chat completion request.
881
+
882
+ :param id: The ID of the chat completion
883
+ :param choices: List of choices
884
+ :param object: The object type, which will be "chat.completion.chunk"
885
+ :param created: The Unix timestamp in seconds when the chat completion was created
886
+ :param model: The model that was used to generate the chat completion
887
+ :param usage: Token usage information (typically included in final chunk with stream_options)
888
+ """
889
+
890
+ id: str
891
+ choices: list[OpenAIChunkChoice]
892
+ object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
893
+ created: int
894
+ model: str
895
+ usage: OpenAIChatCompletionUsage | None = None
164
896
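When streaming, each OpenAIChatCompletionChunk carries deltas rather than full messages, so callers typically fold delta.content back together. Illustrative sketch, assuming an async iterator of chunks such as the streaming return value of openai_chat_completion:

    from collections.abc import AsyncIterator

    async def collect_text(chunks: AsyncIterator[OpenAIChatCompletionChunk]) -> str:
        parts: list[str] = []
        async for chunk in chunks:
            for choice in chunk.choices:
                if choice.delta.content:
                    parts.append(choice.delta.content)
        return "".join(parts)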
 
165
897
 
166
898
  @json_schema_type
167
- class BatchChatCompletionResponse(BaseModel):
168
- completion_message_batch: List[CompletionMessage]
899
+ class OpenAICompletionLogprobs(BaseModel):
900
+ """The log probabilities for the tokens in the message from an OpenAI-compatible completion response.
901
+
902
+ :param text_offset: (Optional) The offset of the token in the text
+ :param token_logprobs: (Optional) The log probabilities for the tokens
+ :param tokens: (Optional) The tokens
+ :param top_logprobs: (Optional) The top log probabilities for the tokens
906
+ """
907
+
908
+ text_offset: list[int] | None = None
909
+ token_logprobs: list[float] | None = None
910
+ tokens: list[str] | None = None
911
+ top_logprobs: list[dict[str, float]] | None = None
169
912
 
170
913
 
171
914
  @json_schema_type
172
- class EmbeddingsResponse(BaseModel):
173
- embeddings: List[List[float]]
915
+ class OpenAICompletionChoice(BaseModel):
916
+ """A choice from an OpenAI-compatible completion response.
917
+
918
+ :param finish_reason: The reason the model stopped generating
+ :param text: The text of the choice
+ :param index: The index of the choice
+ :param logprobs: (Optional) The log probabilities for the tokens in the choice
922
+ """
923
+
924
+ finish_reason: str
925
+ text: str
926
+ index: int
927
+ logprobs: OpenAIChoiceLogprobs | None = None
928
+
929
+
930
+ @json_schema_type
931
+ class OpenAICompletion(BaseModel):
932
+ """Response from an OpenAI-compatible completion request.
933
+
934
+ :param id: The ID of the completion
+ :param choices: List of choices
+ :param created: The Unix timestamp in seconds when the completion was created
+ :param model: The model that was used to generate the completion
+ :param object: The object type, which will be "text_completion"
939
+ """
940
+
941
+ id: str
942
+ choices: list[OpenAICompletionChoice]
943
+ created: int
944
+ model: str
945
+ object: Literal["text_completion"] = "text_completion"
946
+
947
+
948
+ @json_schema_type
949
+ class OpenAIEmbeddingData(BaseModel):
950
+ """A single embedding data object from an OpenAI-compatible embeddings response.
951
+
952
+ :param object: The object type, which will be "embedding"
953
+ :param embedding: The embedding vector as a list of floats (when encoding_format="float") or as a base64-encoded string (when encoding_format="base64")
954
+ :param index: The index of the embedding in the input list
955
+ """
956
+
957
+ object: Literal["embedding"] = "embedding"
958
+ # TODO: consider dropping str and using openai.types.embeddings.Embedding instead of OpenAIEmbeddingData
959
+ embedding: list[float] | str
960
+ index: int
961
+
962
+
963
+ @json_schema_type
964
+ class OpenAIEmbeddingUsage(BaseModel):
965
+ """Usage information for an OpenAI-compatible embeddings response.
966
+
967
+ :param prompt_tokens: The number of tokens in the input
968
+ :param total_tokens: The total number of tokens used
969
+ """
970
+
971
+ prompt_tokens: int
972
+ total_tokens: int
973
+
974
+
975
+ @json_schema_type
976
+ class OpenAIEmbeddingsResponse(BaseModel):
977
+ """Response from an OpenAI-compatible embeddings request.
978
+
979
+ :param object: The object type, which will be "list"
980
+ :param data: List of embedding data objects
981
+ :param model: The model that was used to generate the embeddings
982
+ :param usage: Usage information
983
+ """
984
+
985
+ object: Literal["list"] = "list"
986
+ data: list[OpenAIEmbeddingData]
987
+ model: str
988
+ usage: OpenAIEmbeddingUsage
174
989
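When encoding_format="base64" is requested, each OpenAIEmbeddingData.embedding arrives as a base64 string rather than a list of floats. Illustrative sketch of decoding it, assuming the usual packing of 32-bit little-endian floats:

    import base64
    import struct

    def embedding_as_floats(item: OpenAIEmbeddingData) -> list[float]:
        if isinstance(item.embedding, str):
            raw = base64.b64decode(item.embedding)
            return list(struct.unpack(f"<{len(raw) // 4}f", raw))
        return item.embedding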
 
175
990
 
176
991
  class ModelStore(Protocol):
177
- def get_model(self, identifier: str) -> ModelDef: ...
992
+ async def get_model(self, identifier: str) -> Model: ...
993
+
994
+
995
+ class TextTruncation(Enum):
996
+ """Config for how to truncate text for embedding when text is longer than the model's max sequence length. Start and End semantics depend on whether the language is left-to-right or right-to-left.
997
+
998
+ :cvar none: No truncation (default). If the text is longer than the model's max sequence length, you will get an error.
999
+ :cvar start: Truncate from the start
1000
+ :cvar end: Truncate from the end
1001
+ """
1002
+
1003
+ none = "none"
1004
+ start = "start"
1005
+ end = "end"
1006
+
1007
+
1008
+ class EmbeddingTaskType(Enum):
1009
+ """How is the embedding being used? This is only supported by asymmetric embedding models.
1010
+
1011
+ :cvar query: Used for a query for semantic search.
1012
+ :cvar document: Used at indexing time when ingesting documents.
1013
+ """
1014
+
1015
+ query = "query"
1016
+ document = "document"
1017
+
1018
+
1019
+ class OpenAICompletionWithInputMessages(OpenAIChatCompletion):
1020
+ input_messages: list[OpenAIMessageParam]
1021
+
1022
+
1023
+ @json_schema_type
1024
+ class ListOpenAIChatCompletionResponse(BaseModel):
1025
+ """Response from listing OpenAI-compatible chat completions.
1026
+
1027
+ :param data: List of chat completion objects with their input messages
1028
+ :param has_more: Whether there are more completions available beyond this list
1029
+ :param first_id: ID of the first completion in this list
1030
+ :param last_id: ID of the last completion in this list
1031
+ :param object: Must be "list" to identify this as a list response
1032
+ """
1033
+
1034
+ data: list[OpenAICompletionWithInputMessages]
1035
+ has_more: bool
1036
+ first_id: str
1037
+ last_id: str
1038
+ object: Literal["list"] = "list"
1039
+
1040
+
1041
+ # extra_body can be accessed via .model_extra
1042
+ @json_schema_type
1043
+ class OpenAICompletionRequestWithExtraBody(BaseModel, extra="allow"):
1044
+ """Request parameters for OpenAI-compatible completion endpoint.
1045
+
1046
+ :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
1047
+ :param prompt: The prompt to generate a completion for.
1048
+ :param best_of: (Optional) Generates best_of completions server-side and returns the one with the highest log probability per token.
1049
+ :param echo: (Optional) Whether to echo the prompt.
1050
+ :param frequency_penalty: (Optional) Penalizes new tokens based on how often they have already appeared in the text so far.
1051
+ :param logit_bias: (Optional) The logit bias to use.
1052
+ :param logprobs: (Optional) Whether to return log probabilities of the output tokens.
1053
+ :param max_tokens: (Optional) The maximum number of tokens to generate.
1054
+ :param n: (Optional) The number of completions to generate.
1055
+ :param presence_penalty: (Optional) Penalizes new tokens based on whether they have already appeared in the text so far.
1056
+ :param seed: (Optional) The seed to use.
1057
+ :param stop: (Optional) The stop tokens to use.
1058
+ :param stream: (Optional) Whether to stream the response.
1059
+ :param stream_options: (Optional) The stream options to use.
1060
+ :param temperature: (Optional) The temperature to use.
1061
+ :param top_p: (Optional) The top p to use.
1062
+ :param user: (Optional) The user to use.
1063
+ :param suffix: (Optional) The suffix that should be appended to the completion.
1064
+ """
1065
+
1066
+ # Standard OpenAI completion parameters
1067
+ model: str
1068
+ prompt: str | list[str] | list[int] | list[list[int]]
1069
+ best_of: int | None = None
1070
+ echo: bool | None = None
1071
+ frequency_penalty: float | None = None
1072
+ logit_bias: dict[str, float] | None = None
1073
+ logprobs: bool | None = None
1074
+ max_tokens: int | None = None
1075
+ n: int | None = None
1076
+ presence_penalty: float | None = None
1077
+ seed: int | None = None
1078
+ stop: str | list[str] | None = None
1079
+ stream: bool | None = None
1080
+ stream_options: dict[str, Any] | None = None
1081
+ temperature: float | None = None
1082
+ top_p: float | None = None
1083
+ user: str | None = None
1084
+ suffix: str | None = None
1085
+
1086
+
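Because the request model is declared with extra="allow", provider-specific extra_body fields survive validation and can be read back via .model_extra, as noted in the comments in this file. Illustrative sketch; the model id and guided_choice field are hypothetical:

    params = OpenAICompletionRequestWithExtraBody(
        model="my-completion-model",
        prompt="Say hello",
        max_tokens=16,
        guided_choice=["hello", "hi"],  # not a declared field; kept as an extra
    )
    assert params.model_extra == {"guided_choice": ["hello", "hi"]}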
1087
+ # extra_body can be accessed via .model_extra
1088
+ @json_schema_type
1089
+ class OpenAIChatCompletionRequestWithExtraBody(BaseModel, extra="allow"):
1090
+ """Request parameters for OpenAI-compatible chat completion endpoint.
1091
+
1092
+ :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
1093
+ :param messages: List of messages in the conversation.
1094
+ :param frequency_penalty: (Optional) Penalizes new tokens based on how often they have already appeared in the text so far.
1095
+ :param function_call: (Optional) The function call to use.
1096
+ :param functions: (Optional) List of functions to use.
1097
+ :param logit_bias: (Optional) The logit bias to use.
1098
+ :param logprobs: (Optional) Whether to return log probabilities of the output tokens.
1099
+ :param max_completion_tokens: (Optional) The maximum number of tokens to generate.
1100
+ :param max_tokens: (Optional) The maximum number of tokens to generate.
1101
+ :param n: (Optional) The number of completions to generate.
1102
+ :param parallel_tool_calls: (Optional) Whether to parallelize tool calls.
1103
+ :param presence_penalty: (Optional) Penalizes new tokens based on whether they have already appeared in the text so far.
1104
+ :param response_format: (Optional) The response format to use.
1105
+ :param seed: (Optional) The seed to use.
1106
+ :param stop: (Optional) The stop tokens to use.
1107
+ :param stream: (Optional) Whether to stream the response.
1108
+ :param stream_options: (Optional) The stream options to use.
1109
+ :param temperature: (Optional) The temperature to use.
1110
+ :param tool_choice: (Optional) The tool choice to use.
1111
+ :param tools: (Optional) The tools to use.
1112
+ :param top_logprobs: (Optional) The top log probabilities to use.
1113
+ :param top_p: (Optional) The top p to use.
1114
+ :param user: (Optional) The user to use.
1115
+ """
1116
+
1117
+ # Standard OpenAI chat completion parameters
1118
+ model: str
1119
+ messages: Annotated[list[OpenAIMessageParam], Field(..., min_length=1)]
1120
+ frequency_penalty: float | None = None
1121
+ function_call: str | dict[str, Any] | None = None
1122
+ functions: list[dict[str, Any]] | None = None
1123
+ logit_bias: dict[str, float] | None = None
1124
+ logprobs: bool | None = None
1125
+ max_completion_tokens: int | None = None
1126
+ max_tokens: int | None = None
1127
+ n: int | None = None
1128
+ parallel_tool_calls: bool | None = None
1129
+ presence_penalty: float | None = None
1130
+ response_format: OpenAIResponseFormatParam | None = None
1131
+ seed: int | None = None
1132
+ stop: str | list[str] | None = None
1133
+ stream: bool | None = None
1134
+ stream_options: dict[str, Any] | None = None
1135
+ temperature: float | None = None
1136
+ tool_choice: str | dict[str, Any] | None = None
1137
+ tools: list[dict[str, Any]] | None = None
1138
+ top_logprobs: int | None = None
1139
+ top_p: float | None = None
1140
+ user: str | None = None
1141
+
1142
+
1143
+ # extra_body can be accessed via .model_extra
1144
+ @json_schema_type
1145
+ class OpenAIEmbeddingsRequestWithExtraBody(BaseModel, extra="allow"):
1146
+ """Request parameters for OpenAI-compatible embeddings endpoint.
1147
+
1148
+ :param model: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint.
1149
+ :param input: Input text to embed, encoded as a string or array of strings. To embed multiple inputs in a single request, pass an array of strings.
1150
+ :param encoding_format: (Optional) The format to return the embeddings in. Can be either "float" or "base64". Defaults to "float".
1151
+ :param dimensions: (Optional) The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models.
1152
+ :param user: (Optional) A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse.
1153
+ """
1154
+
1155
+ model: str
1156
+ input: str | list[str]
1157
+ encoding_format: str | None = "float"
1158
+ dimensions: int | None = None
1159
+ user: str | None = None
178
1160
 
179
1161
 
180
1162
  @runtime_checkable
181
- class Inference(Protocol):
182
- model_store: ModelStore
1163
+ @trace_protocol
1164
+ class InferenceProvider(Protocol):
1165
+ """
1166
+ This protocol defines the interface that should be implemented by all inference providers.
1167
+ """
1168
+
1169
+ API_NAMESPACE: str = "Inference"
183
1170
 
184
- # This method is not `async def` because it can result in either an
185
- # `AsyncGenerator` or a `CompletionResponse` depending on the value of `stream`.
186
- @webmethod(route="/inference/completion")
187
- def completion(
1171
+ model_store: ModelStore | None = None
1172
+
1173
+ @webmethod(route="/inference/rerank", method="POST", level=LLAMA_STACK_API_V1ALPHA)
1174
+ async def rerank(
188
1175
  self,
189
1176
  model: str,
190
- content: InterleavedTextMedia,
191
- sampling_params: Optional[SamplingParams] = SamplingParams(),
192
- stream: Optional[bool] = False,
193
- logprobs: Optional[LogProbConfig] = None,
194
- ) -> Union[CompletionResponse, CompletionResponseStreamChunk]: ...
195
-
196
- # This method is not `async def` because it can result in either an
197
- # `AsyncGenerator` or a `ChatCompletionResponse` depending on the value of `stream`.
198
- @webmethod(route="/inference/chat_completion")
199
- def chat_completion(
1177
+ query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
1178
+ items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
1179
+ max_num_results: int | None = None,
1180
+ ) -> RerankResponse:
1181
+ """Rerank a list of documents based on their relevance to a query.
1182
+
1183
+ :param model: The identifier of the reranking model to use.
1184
+ :param query: The search query to rank items against. Can be a string, text content part, or image content part. The input must not exceed the model's max input token length.
1185
+ :param items: List of items to rerank. Each item can be a string, text content part, or image content part. Each input must not exceed the model's max input token length.
1186
+ :param max_num_results: (Optional) Maximum number of results to return. Default: returns all.
1187
+ :returns: RerankResponse with indices sorted by relevance score (descending).
1188
+ """
1189
+ raise NotImplementedError("Reranking is not implemented")
1190
+ return # this is so mypy's safe-super rule will consider the method concrete
1191
+
1192
+ @webmethod(route="/openai/v1/completions", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
1193
+ @webmethod(route="/completions", method="POST", level=LLAMA_STACK_API_V1)
1194
+ async def openai_completion(
200
1195
  self,
201
- model: str,
202
- messages: List[Message],
203
- sampling_params: Optional[SamplingParams] = SamplingParams(),
204
- # zero-shot tool definitions as input to the model
205
- tools: Optional[List[ToolDefinition]] = None,
206
- tool_choice: Optional[ToolChoice] = ToolChoice.auto,
207
- tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
208
- stream: Optional[bool] = False,
209
- logprobs: Optional[LogProbConfig] = None,
210
- ) -> Union[ChatCompletionResponse, ChatCompletionResponseStreamChunk]: ...
211
-
212
- @webmethod(route="/inference/embeddings")
213
- async def embeddings(
1196
+ params: Annotated[OpenAICompletionRequestWithExtraBody, Body(...)],
1197
+ ) -> OpenAICompletion:
1198
+ """Create completion.
1199
+
1200
+ Generate an OpenAI-compatible completion for the given prompt using the specified model.
1201
+ :returns: An OpenAICompletion.
1202
+ """
1203
+ ...
1204
+
1205
+ @webmethod(route="/openai/v1/chat/completions", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
1206
+ @webmethod(route="/chat/completions", method="POST", level=LLAMA_STACK_API_V1)
1207
+ async def openai_chat_completion(
214
1208
  self,
215
- model: str,
216
- contents: List[InterleavedTextMedia],
217
- ) -> EmbeddingsResponse: ...
1209
+ params: Annotated[OpenAIChatCompletionRequestWithExtraBody, Body(...)],
1210
+ ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
1211
+ """Create chat completions.
1212
+
1213
+ Generate an OpenAI-compatible chat completion for the given messages using the specified model.
1214
+ :returns: An OpenAIChatCompletion.
1215
+ """
1216
+ ...
1217
+
1218
+ @webmethod(route="/openai/v1/embeddings", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
1219
+ @webmethod(route="/embeddings", method="POST", level=LLAMA_STACK_API_V1)
1220
+ async def openai_embeddings(
1221
+ self,
1222
+ params: Annotated[OpenAIEmbeddingsRequestWithExtraBody, Body(...)],
1223
+ ) -> OpenAIEmbeddingsResponse:
1224
+ """Create embeddings.
1225
+
1226
+ Generate OpenAI-compatible embeddings for the given input using the specified model.
1227
+ :returns: An OpenAIEmbeddingsResponse containing the embeddings.
1228
+ """
1229
+ ...
1230
+
1231
+
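A caller drives an InferenceProvider implementation through the OpenAI-compatible request models defined earlier. Illustrative sketch only; the provider object and model id are hypothetical and only the non-streaming path is shown:

    async def ask(provider: InferenceProvider) -> str:
        params = OpenAIChatCompletionRequestWithExtraBody(
            model="my-chat-model",
            messages=[OpenAIUserMessageParam(content="Say hello in one word.")],
            stream=False,
        )
        result = await provider.openai_chat_completion(params)
        assert isinstance(result, OpenAIChatCompletion)  # stream=False -> single response
        msg = result.choices[0].message
        return msg.content if isinstance(msg.content, str) else ""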
1232
+ class Inference(InferenceProvider):
1233
+ """Inference
1234
+
1235
+ Llama Stack Inference API for generating completions, chat completions, and embeddings.
1236
+
1237
+ This API provides the raw interface to the underlying models. Two kinds of models are supported:
1238
+ - LLM models: these models generate "raw" and "chat" (conversational) completions.
1239
+ - Embedding models: these models generate embeddings to be used for semantic search.
1240
+ """
1241
+
1242
+ @webmethod(route="/openai/v1/chat/completions", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
1243
+ @webmethod(route="/chat/completions", method="GET", level=LLAMA_STACK_API_V1)
1244
+ async def list_chat_completions(
1245
+ self,
1246
+ after: str | None = None,
1247
+ limit: int | None = 20,
1248
+ model: str | None = None,
1249
+ order: Order | None = Order.desc,
1250
+ ) -> ListOpenAIChatCompletionResponse:
1251
+ """List chat completions.
1252
+
1253
+ :param after: The ID of the last chat completion from the previous page (pagination cursor).
1254
+ :param limit: The maximum number of chat completions to return.
1255
+ :param model: The model to filter by.
1256
+ :param order: The order to sort the chat completions by: "asc" or "desc". Defaults to "desc".
1257
+ :returns: A ListOpenAIChatCompletionResponse.
1258
+ """
1259
+ raise NotImplementedError("List chat completions is not implemented")
1260
+
1261
+ @webmethod(
1262
+ route="/openai/v1/chat/completions/{completion_id}", method="GET", level=LLAMA_STACK_API_V1, deprecated=True
1263
+ )
1264
+ @webmethod(route="/chat/completions/{completion_id}", method="GET", level=LLAMA_STACK_API_V1)
1265
+ async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages:
1266
+ """Get chat completion.
1267
+
1268
+ Describe a chat completion by its ID.
1269
+
1270
+ :param completion_id: ID of the chat completion.
1271
+ :returns: A OpenAICompletionWithInputMessages.
1272
+ """
1273
+ raise NotImplementedError("Get chat completion is not implemented")