llama-stack 0.3.5__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (460)
  1. llama_stack/__init__.py +0 -5
  2. llama_stack/cli/llama.py +3 -3
  3. llama_stack/cli/stack/_list_deps.py +12 -23
  4. llama_stack/cli/stack/list_stacks.py +37 -18
  5. llama_stack/cli/stack/run.py +121 -11
  6. llama_stack/cli/stack/utils.py +0 -127
  7. llama_stack/core/access_control/access_control.py +69 -28
  8. llama_stack/core/access_control/conditions.py +15 -5
  9. llama_stack/core/admin.py +267 -0
  10. llama_stack/core/build.py +6 -74
  11. llama_stack/core/client.py +1 -1
  12. llama_stack/core/configure.py +6 -6
  13. llama_stack/core/conversations/conversations.py +28 -25
  14. llama_stack/core/datatypes.py +271 -79
  15. llama_stack/core/distribution.py +15 -16
  16. llama_stack/core/external.py +3 -3
  17. llama_stack/core/inspect.py +98 -15
  18. llama_stack/core/library_client.py +73 -61
  19. llama_stack/core/prompts/prompts.py +12 -11
  20. llama_stack/core/providers.py +17 -11
  21. llama_stack/core/resolver.py +65 -56
  22. llama_stack/core/routers/__init__.py +8 -12
  23. llama_stack/core/routers/datasets.py +1 -4
  24. llama_stack/core/routers/eval_scoring.py +7 -4
  25. llama_stack/core/routers/inference.py +55 -271
  26. llama_stack/core/routers/safety.py +52 -24
  27. llama_stack/core/routers/tool_runtime.py +6 -48
  28. llama_stack/core/routers/vector_io.py +130 -51
  29. llama_stack/core/routing_tables/benchmarks.py +24 -20
  30. llama_stack/core/routing_tables/common.py +1 -4
  31. llama_stack/core/routing_tables/datasets.py +22 -22
  32. llama_stack/core/routing_tables/models.py +119 -6
  33. llama_stack/core/routing_tables/scoring_functions.py +7 -7
  34. llama_stack/core/routing_tables/shields.py +1 -2
  35. llama_stack/core/routing_tables/toolgroups.py +17 -7
  36. llama_stack/core/routing_tables/vector_stores.py +51 -16
  37. llama_stack/core/server/auth.py +5 -3
  38. llama_stack/core/server/auth_providers.py +36 -20
  39. llama_stack/core/server/fastapi_router_registry.py +84 -0
  40. llama_stack/core/server/quota.py +2 -2
  41. llama_stack/core/server/routes.py +79 -27
  42. llama_stack/core/server/server.py +102 -87
  43. llama_stack/core/stack.py +235 -62
  44. llama_stack/core/storage/datatypes.py +26 -3
  45. llama_stack/{providers/utils → core/storage}/kvstore/__init__.py +2 -0
  46. llama_stack/{providers/utils → core/storage}/kvstore/kvstore.py +55 -24
  47. llama_stack/{providers/utils → core/storage}/kvstore/mongodb/mongodb.py +13 -10
  48. llama_stack/{providers/utils → core/storage}/kvstore/postgres/postgres.py +28 -17
  49. llama_stack/{providers/utils → core/storage}/kvstore/redis/redis.py +41 -16
  50. llama_stack/{providers/utils → core/storage}/kvstore/sqlite/sqlite.py +1 -1
  51. llama_stack/core/storage/sqlstore/__init__.py +17 -0
  52. llama_stack/{providers/utils → core/storage}/sqlstore/authorized_sqlstore.py +69 -49
  53. llama_stack/{providers/utils → core/storage}/sqlstore/sqlalchemy_sqlstore.py +47 -17
  54. llama_stack/{providers/utils → core/storage}/sqlstore/sqlstore.py +25 -8
  55. llama_stack/core/store/registry.py +1 -1
  56. llama_stack/core/utils/config.py +8 -2
  57. llama_stack/core/utils/config_resolution.py +32 -29
  58. llama_stack/core/utils/context.py +4 -10
  59. llama_stack/core/utils/exec.py +9 -0
  60. llama_stack/core/utils/type_inspection.py +45 -0
  61. llama_stack/distributions/dell/{run.yaml → config.yaml} +3 -2
  62. llama_stack/distributions/dell/dell.py +2 -2
  63. llama_stack/distributions/dell/run-with-safety.yaml +3 -2
  64. llama_stack/distributions/meta-reference-gpu/{run.yaml → config.yaml} +3 -2
  65. llama_stack/distributions/meta-reference-gpu/meta_reference.py +2 -2
  66. llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +3 -2
  67. llama_stack/distributions/nvidia/{run.yaml → config.yaml} +4 -4
  68. llama_stack/distributions/nvidia/nvidia.py +1 -1
  69. llama_stack/distributions/nvidia/run-with-safety.yaml +4 -4
  70. llama_stack/{apis/datasetio → distributions/oci}/__init__.py +1 -1
  71. llama_stack/distributions/oci/config.yaml +134 -0
  72. llama_stack/distributions/oci/oci.py +108 -0
  73. llama_stack/distributions/open-benchmark/{run.yaml → config.yaml} +5 -4
  74. llama_stack/distributions/open-benchmark/open_benchmark.py +2 -3
  75. llama_stack/distributions/postgres-demo/{run.yaml → config.yaml} +4 -3
  76. llama_stack/distributions/starter/{run.yaml → config.yaml} +64 -13
  77. llama_stack/distributions/starter/run-with-postgres-store.yaml +64 -13
  78. llama_stack/distributions/starter/starter.py +8 -5
  79. llama_stack/distributions/starter-gpu/{run.yaml → config.yaml} +64 -13
  80. llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +64 -13
  81. llama_stack/distributions/template.py +13 -69
  82. llama_stack/distributions/watsonx/{run.yaml → config.yaml} +4 -3
  83. llama_stack/distributions/watsonx/watsonx.py +1 -1
  84. llama_stack/log.py +28 -11
  85. llama_stack/models/llama/checkpoint.py +6 -6
  86. llama_stack/models/llama/hadamard_utils.py +2 -0
  87. llama_stack/models/llama/llama3/generation.py +3 -1
  88. llama_stack/models/llama/llama3/interface.py +2 -5
  89. llama_stack/models/llama/llama3/multimodal/encoder_utils.py +3 -3
  90. llama_stack/models/llama/llama3/multimodal/image_transform.py +6 -6
  91. llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +1 -1
  92. llama_stack/models/llama/llama3/tool_utils.py +2 -1
  93. llama_stack/models/llama/llama4/prompt_templates/system_prompts.py +1 -1
  94. llama_stack/providers/inline/agents/meta_reference/__init__.py +3 -3
  95. llama_stack/providers/inline/agents/meta_reference/agents.py +44 -261
  96. llama_stack/providers/inline/agents/meta_reference/config.py +6 -1
  97. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +207 -57
  98. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +308 -47
  99. llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +162 -96
  100. llama_stack/providers/inline/agents/meta_reference/responses/types.py +23 -8
  101. llama_stack/providers/inline/agents/meta_reference/responses/utils.py +201 -33
  102. llama_stack/providers/inline/agents/meta_reference/safety.py +8 -13
  103. llama_stack/providers/inline/batches/reference/__init__.py +2 -4
  104. llama_stack/providers/inline/batches/reference/batches.py +78 -60
  105. llama_stack/providers/inline/datasetio/localfs/datasetio.py +2 -5
  106. llama_stack/providers/inline/eval/meta_reference/eval.py +16 -61
  107. llama_stack/providers/inline/files/localfs/files.py +37 -28
  108. llama_stack/providers/inline/inference/meta_reference/config.py +2 -2
  109. llama_stack/providers/inline/inference/meta_reference/generators.py +50 -60
  110. llama_stack/providers/inline/inference/meta_reference/inference.py +403 -19
  111. llama_stack/providers/inline/inference/meta_reference/model_parallel.py +7 -26
  112. llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +2 -12
  113. llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +10 -15
  114. llama_stack/providers/inline/post_training/common/validator.py +1 -5
  115. llama_stack/providers/inline/post_training/huggingface/post_training.py +8 -8
  116. llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py +18 -10
  117. llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device_dpo.py +12 -9
  118. llama_stack/providers/inline/post_training/huggingface/utils.py +27 -6
  119. llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py +1 -1
  120. llama_stack/providers/inline/post_training/torchtune/common/utils.py +1 -1
  121. llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py +1 -1
  122. llama_stack/providers/inline/post_training/torchtune/post_training.py +8 -8
  123. llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +16 -16
  124. llama_stack/providers/inline/safety/code_scanner/code_scanner.py +13 -9
  125. llama_stack/providers/inline/safety/llama_guard/llama_guard.py +18 -15
  126. llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +9 -9
  127. llama_stack/providers/inline/scoring/basic/scoring.py +6 -13
  128. llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py +1 -2
  129. llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py +1 -2
  130. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py +2 -2
  131. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py +2 -2
  132. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py +2 -2
  133. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py +2 -2
  134. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py +2 -2
  135. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py +2 -2
  136. llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py +1 -2
  137. llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py +1 -2
  138. llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py +1 -2
  139. llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py +1 -2
  140. llama_stack/providers/inline/scoring/braintrust/braintrust.py +12 -15
  141. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py +2 -2
  142. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py +2 -2
  143. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py +2 -2
  144. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py +2 -2
  145. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py +2 -2
  146. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py +2 -2
  147. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py +2 -2
  148. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py +2 -2
  149. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py +2 -2
  150. llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +7 -14
  151. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py +2 -2
  152. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py +1 -2
  153. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py +1 -3
  154. llama_stack/providers/inline/tool_runtime/rag/__init__.py +1 -1
  155. llama_stack/providers/inline/tool_runtime/rag/config.py +8 -1
  156. llama_stack/providers/inline/tool_runtime/rag/context_retriever.py +7 -6
  157. llama_stack/providers/inline/tool_runtime/rag/memory.py +64 -48
  158. llama_stack/providers/inline/vector_io/chroma/__init__.py +1 -1
  159. llama_stack/providers/inline/vector_io/chroma/config.py +1 -1
  160. llama_stack/providers/inline/vector_io/faiss/__init__.py +1 -1
  161. llama_stack/providers/inline/vector_io/faiss/config.py +1 -1
  162. llama_stack/providers/inline/vector_io/faiss/faiss.py +46 -28
  163. llama_stack/providers/inline/vector_io/milvus/__init__.py +1 -1
  164. llama_stack/providers/inline/vector_io/milvus/config.py +1 -1
  165. llama_stack/providers/inline/vector_io/qdrant/__init__.py +1 -1
  166. llama_stack/providers/inline/vector_io/qdrant/config.py +1 -1
  167. llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py +1 -1
  168. llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +44 -33
  169. llama_stack/providers/registry/agents.py +8 -3
  170. llama_stack/providers/registry/batches.py +1 -1
  171. llama_stack/providers/registry/datasetio.py +1 -1
  172. llama_stack/providers/registry/eval.py +1 -1
  173. llama_stack/{apis/datasets/__init__.py → providers/registry/file_processors.py} +5 -1
  174. llama_stack/providers/registry/files.py +11 -2
  175. llama_stack/providers/registry/inference.py +22 -3
  176. llama_stack/providers/registry/post_training.py +1 -1
  177. llama_stack/providers/registry/safety.py +1 -1
  178. llama_stack/providers/registry/scoring.py +1 -1
  179. llama_stack/providers/registry/tool_runtime.py +2 -2
  180. llama_stack/providers/registry/vector_io.py +7 -7
  181. llama_stack/providers/remote/datasetio/huggingface/huggingface.py +2 -5
  182. llama_stack/providers/remote/datasetio/nvidia/datasetio.py +1 -4
  183. llama_stack/providers/remote/eval/nvidia/eval.py +15 -9
  184. llama_stack/providers/remote/files/openai/__init__.py +19 -0
  185. llama_stack/providers/remote/files/openai/config.py +28 -0
  186. llama_stack/providers/remote/files/openai/files.py +253 -0
  187. llama_stack/providers/remote/files/s3/files.py +52 -30
  188. llama_stack/providers/remote/inference/anthropic/anthropic.py +2 -1
  189. llama_stack/providers/remote/inference/anthropic/config.py +1 -1
  190. llama_stack/providers/remote/inference/azure/azure.py +1 -3
  191. llama_stack/providers/remote/inference/azure/config.py +8 -7
  192. llama_stack/providers/remote/inference/bedrock/__init__.py +1 -1
  193. llama_stack/providers/remote/inference/bedrock/bedrock.py +82 -105
  194. llama_stack/providers/remote/inference/bedrock/config.py +24 -3
  195. llama_stack/providers/remote/inference/cerebras/cerebras.py +5 -5
  196. llama_stack/providers/remote/inference/cerebras/config.py +12 -5
  197. llama_stack/providers/remote/inference/databricks/config.py +13 -6
  198. llama_stack/providers/remote/inference/databricks/databricks.py +16 -6
  199. llama_stack/providers/remote/inference/fireworks/config.py +5 -5
  200. llama_stack/providers/remote/inference/fireworks/fireworks.py +1 -1
  201. llama_stack/providers/remote/inference/gemini/config.py +1 -1
  202. llama_stack/providers/remote/inference/gemini/gemini.py +13 -14
  203. llama_stack/providers/remote/inference/groq/config.py +5 -5
  204. llama_stack/providers/remote/inference/groq/groq.py +1 -1
  205. llama_stack/providers/remote/inference/llama_openai_compat/config.py +5 -5
  206. llama_stack/providers/remote/inference/llama_openai_compat/llama.py +8 -6
  207. llama_stack/providers/remote/inference/nvidia/__init__.py +1 -1
  208. llama_stack/providers/remote/inference/nvidia/config.py +21 -11
  209. llama_stack/providers/remote/inference/nvidia/nvidia.py +115 -3
  210. llama_stack/providers/remote/inference/nvidia/utils.py +1 -1
  211. llama_stack/providers/remote/inference/oci/__init__.py +17 -0
  212. llama_stack/providers/remote/inference/oci/auth.py +79 -0
  213. llama_stack/providers/remote/inference/oci/config.py +75 -0
  214. llama_stack/providers/remote/inference/oci/oci.py +162 -0
  215. llama_stack/providers/remote/inference/ollama/config.py +7 -5
  216. llama_stack/providers/remote/inference/ollama/ollama.py +17 -8
  217. llama_stack/providers/remote/inference/openai/config.py +4 -4
  218. llama_stack/providers/remote/inference/openai/openai.py +1 -1
  219. llama_stack/providers/remote/inference/passthrough/__init__.py +2 -2
  220. llama_stack/providers/remote/inference/passthrough/config.py +5 -10
  221. llama_stack/providers/remote/inference/passthrough/passthrough.py +97 -75
  222. llama_stack/providers/remote/inference/runpod/config.py +12 -5
  223. llama_stack/providers/remote/inference/runpod/runpod.py +2 -20
  224. llama_stack/providers/remote/inference/sambanova/config.py +5 -5
  225. llama_stack/providers/remote/inference/sambanova/sambanova.py +1 -1
  226. llama_stack/providers/remote/inference/tgi/config.py +7 -6
  227. llama_stack/providers/remote/inference/tgi/tgi.py +19 -11
  228. llama_stack/providers/remote/inference/together/config.py +5 -5
  229. llama_stack/providers/remote/inference/together/together.py +15 -12
  230. llama_stack/providers/remote/inference/vertexai/config.py +1 -1
  231. llama_stack/providers/remote/inference/vllm/config.py +5 -5
  232. llama_stack/providers/remote/inference/vllm/vllm.py +13 -14
  233. llama_stack/providers/remote/inference/watsonx/config.py +4 -4
  234. llama_stack/providers/remote/inference/watsonx/watsonx.py +21 -94
  235. llama_stack/providers/remote/post_training/nvidia/post_training.py +4 -4
  236. llama_stack/providers/remote/post_training/nvidia/utils.py +1 -1
  237. llama_stack/providers/remote/safety/bedrock/bedrock.py +6 -6
  238. llama_stack/providers/remote/safety/bedrock/config.py +1 -1
  239. llama_stack/providers/remote/safety/nvidia/config.py +1 -1
  240. llama_stack/providers/remote/safety/nvidia/nvidia.py +11 -5
  241. llama_stack/providers/remote/safety/sambanova/config.py +1 -1
  242. llama_stack/providers/remote/safety/sambanova/sambanova.py +6 -6
  243. llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py +11 -6
  244. llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py +12 -7
  245. llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py +8 -2
  246. llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py +57 -15
  247. llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py +11 -6
  248. llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py +11 -6
  249. llama_stack/providers/remote/vector_io/chroma/__init__.py +1 -1
  250. llama_stack/providers/remote/vector_io/chroma/chroma.py +131 -23
  251. llama_stack/providers/remote/vector_io/chroma/config.py +1 -1
  252. llama_stack/providers/remote/vector_io/milvus/__init__.py +1 -1
  253. llama_stack/providers/remote/vector_io/milvus/config.py +1 -1
  254. llama_stack/providers/remote/vector_io/milvus/milvus.py +37 -28
  255. llama_stack/providers/remote/vector_io/pgvector/__init__.py +1 -1
  256. llama_stack/providers/remote/vector_io/pgvector/config.py +1 -1
  257. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +37 -25
  258. llama_stack/providers/remote/vector_io/qdrant/__init__.py +1 -1
  259. llama_stack/providers/remote/vector_io/qdrant/config.py +1 -1
  260. llama_stack/providers/remote/vector_io/qdrant/qdrant.py +147 -30
  261. llama_stack/providers/remote/vector_io/weaviate/__init__.py +1 -1
  262. llama_stack/providers/remote/vector_io/weaviate/config.py +1 -1
  263. llama_stack/providers/remote/vector_io/weaviate/weaviate.py +31 -26
  264. llama_stack/providers/utils/common/data_schema_validator.py +1 -5
  265. llama_stack/providers/utils/files/form_data.py +1 -1
  266. llama_stack/providers/utils/inference/embedding_mixin.py +1 -1
  267. llama_stack/providers/utils/inference/inference_store.py +7 -8
  268. llama_stack/providers/utils/inference/litellm_openai_mixin.py +79 -79
  269. llama_stack/providers/utils/inference/model_registry.py +1 -3
  270. llama_stack/providers/utils/inference/openai_compat.py +44 -1171
  271. llama_stack/providers/utils/inference/openai_mixin.py +68 -42
  272. llama_stack/providers/utils/inference/prompt_adapter.py +50 -265
  273. llama_stack/providers/utils/inference/stream_utils.py +23 -0
  274. llama_stack/providers/utils/memory/__init__.py +2 -0
  275. llama_stack/providers/utils/memory/file_utils.py +1 -1
  276. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +181 -84
  277. llama_stack/providers/utils/memory/vector_store.py +39 -38
  278. llama_stack/providers/utils/pagination.py +1 -1
  279. llama_stack/providers/utils/responses/responses_store.py +15 -25
  280. llama_stack/providers/utils/scoring/aggregation_utils.py +1 -2
  281. llama_stack/providers/utils/scoring/base_scoring_fn.py +1 -2
  282. llama_stack/providers/utils/tools/mcp.py +93 -11
  283. llama_stack/providers/utils/vector_io/__init__.py +16 -0
  284. llama_stack/providers/utils/vector_io/vector_utils.py +36 -0
  285. llama_stack/telemetry/constants.py +27 -0
  286. llama_stack/telemetry/helpers.py +43 -0
  287. llama_stack/testing/api_recorder.py +25 -16
  288. {llama_stack-0.3.5.dist-info → llama_stack-0.4.1.dist-info}/METADATA +57 -55
  289. llama_stack-0.4.1.dist-info/RECORD +588 -0
  290. llama_stack-0.4.1.dist-info/top_level.txt +2 -0
  291. llama_stack_api/__init__.py +945 -0
  292. llama_stack_api/admin/__init__.py +45 -0
  293. llama_stack_api/admin/api.py +72 -0
  294. llama_stack_api/admin/fastapi_routes.py +117 -0
  295. llama_stack_api/admin/models.py +113 -0
  296. llama_stack_api/agents.py +173 -0
  297. llama_stack_api/batches/__init__.py +40 -0
  298. llama_stack_api/batches/api.py +53 -0
  299. llama_stack_api/batches/fastapi_routes.py +113 -0
  300. llama_stack_api/batches/models.py +78 -0
  301. llama_stack_api/benchmarks/__init__.py +43 -0
  302. llama_stack_api/benchmarks/api.py +39 -0
  303. llama_stack_api/benchmarks/fastapi_routes.py +109 -0
  304. llama_stack_api/benchmarks/models.py +109 -0
  305. {llama_stack/apis → llama_stack_api}/common/content_types.py +1 -43
  306. {llama_stack/apis → llama_stack_api}/common/errors.py +0 -8
  307. {llama_stack/apis → llama_stack_api}/common/job_types.py +1 -1
  308. llama_stack_api/common/responses.py +77 -0
  309. {llama_stack/apis → llama_stack_api}/common/training_types.py +1 -1
  310. {llama_stack/apis → llama_stack_api}/common/type_system.py +2 -14
  311. llama_stack_api/connectors.py +146 -0
  312. {llama_stack/apis/conversations → llama_stack_api}/conversations.py +23 -39
  313. {llama_stack/apis/datasetio → llama_stack_api}/datasetio.py +4 -8
  314. llama_stack_api/datasets/__init__.py +61 -0
  315. llama_stack_api/datasets/api.py +35 -0
  316. llama_stack_api/datasets/fastapi_routes.py +104 -0
  317. llama_stack_api/datasets/models.py +152 -0
  318. {llama_stack/providers → llama_stack_api}/datatypes.py +166 -10
  319. {llama_stack/apis/eval → llama_stack_api}/eval.py +8 -40
  320. llama_stack_api/file_processors/__init__.py +27 -0
  321. llama_stack_api/file_processors/api.py +64 -0
  322. llama_stack_api/file_processors/fastapi_routes.py +78 -0
  323. llama_stack_api/file_processors/models.py +42 -0
  324. llama_stack_api/files/__init__.py +35 -0
  325. llama_stack_api/files/api.py +51 -0
  326. llama_stack_api/files/fastapi_routes.py +124 -0
  327. llama_stack_api/files/models.py +107 -0
  328. {llama_stack/apis/inference → llama_stack_api}/inference.py +90 -194
  329. llama_stack_api/inspect_api/__init__.py +37 -0
  330. llama_stack_api/inspect_api/api.py +25 -0
  331. llama_stack_api/inspect_api/fastapi_routes.py +76 -0
  332. llama_stack_api/inspect_api/models.py +28 -0
  333. {llama_stack/apis/agents → llama_stack_api/internal}/__init__.py +3 -1
  334. llama_stack/providers/utils/kvstore/api.py → llama_stack_api/internal/kvstore.py +5 -0
  335. llama_stack_api/internal/sqlstore.py +79 -0
  336. {llama_stack/apis/models → llama_stack_api}/models.py +11 -9
  337. {llama_stack/apis/agents → llama_stack_api}/openai_responses.py +184 -27
  338. {llama_stack/apis/post_training → llama_stack_api}/post_training.py +7 -11
  339. {llama_stack/apis/prompts → llama_stack_api}/prompts.py +3 -4
  340. llama_stack_api/providers/__init__.py +33 -0
  341. llama_stack_api/providers/api.py +16 -0
  342. llama_stack_api/providers/fastapi_routes.py +57 -0
  343. llama_stack_api/providers/models.py +24 -0
  344. {llama_stack/apis/tools → llama_stack_api}/rag_tool.py +2 -52
  345. {llama_stack/apis → llama_stack_api}/resource.py +1 -1
  346. llama_stack_api/router_utils.py +160 -0
  347. {llama_stack/apis/safety → llama_stack_api}/safety.py +6 -9
  348. {llama_stack → llama_stack_api}/schema_utils.py +94 -4
  349. {llama_stack/apis/scoring → llama_stack_api}/scoring.py +3 -3
  350. {llama_stack/apis/scoring_functions → llama_stack_api}/scoring_functions.py +9 -6
  351. {llama_stack/apis/shields → llama_stack_api}/shields.py +6 -7
  352. {llama_stack/apis/tools → llama_stack_api}/tools.py +26 -21
  353. {llama_stack/apis/vector_io → llama_stack_api}/vector_io.py +133 -152
  354. {llama_stack/apis/vector_stores → llama_stack_api}/vector_stores.py +1 -1
  355. llama_stack/apis/agents/agents.py +0 -894
  356. llama_stack/apis/batches/__init__.py +0 -9
  357. llama_stack/apis/batches/batches.py +0 -100
  358. llama_stack/apis/benchmarks/__init__.py +0 -7
  359. llama_stack/apis/benchmarks/benchmarks.py +0 -108
  360. llama_stack/apis/common/responses.py +0 -36
  361. llama_stack/apis/conversations/__init__.py +0 -31
  362. llama_stack/apis/datasets/datasets.py +0 -251
  363. llama_stack/apis/datatypes.py +0 -160
  364. llama_stack/apis/eval/__init__.py +0 -7
  365. llama_stack/apis/files/__init__.py +0 -7
  366. llama_stack/apis/files/files.py +0 -199
  367. llama_stack/apis/inference/__init__.py +0 -7
  368. llama_stack/apis/inference/event_logger.py +0 -43
  369. llama_stack/apis/inspect/__init__.py +0 -7
  370. llama_stack/apis/inspect/inspect.py +0 -94
  371. llama_stack/apis/models/__init__.py +0 -7
  372. llama_stack/apis/post_training/__init__.py +0 -7
  373. llama_stack/apis/prompts/__init__.py +0 -9
  374. llama_stack/apis/providers/__init__.py +0 -7
  375. llama_stack/apis/providers/providers.py +0 -69
  376. llama_stack/apis/safety/__init__.py +0 -7
  377. llama_stack/apis/scoring/__init__.py +0 -7
  378. llama_stack/apis/scoring_functions/__init__.py +0 -7
  379. llama_stack/apis/shields/__init__.py +0 -7
  380. llama_stack/apis/synthetic_data_generation/__init__.py +0 -7
  381. llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py +0 -77
  382. llama_stack/apis/telemetry/__init__.py +0 -7
  383. llama_stack/apis/telemetry/telemetry.py +0 -423
  384. llama_stack/apis/tools/__init__.py +0 -8
  385. llama_stack/apis/vector_io/__init__.py +0 -7
  386. llama_stack/apis/vector_stores/__init__.py +0 -7
  387. llama_stack/core/server/tracing.py +0 -80
  388. llama_stack/core/ui/app.py +0 -55
  389. llama_stack/core/ui/modules/__init__.py +0 -5
  390. llama_stack/core/ui/modules/api.py +0 -32
  391. llama_stack/core/ui/modules/utils.py +0 -42
  392. llama_stack/core/ui/page/__init__.py +0 -5
  393. llama_stack/core/ui/page/distribution/__init__.py +0 -5
  394. llama_stack/core/ui/page/distribution/datasets.py +0 -18
  395. llama_stack/core/ui/page/distribution/eval_tasks.py +0 -20
  396. llama_stack/core/ui/page/distribution/models.py +0 -18
  397. llama_stack/core/ui/page/distribution/providers.py +0 -27
  398. llama_stack/core/ui/page/distribution/resources.py +0 -48
  399. llama_stack/core/ui/page/distribution/scoring_functions.py +0 -18
  400. llama_stack/core/ui/page/distribution/shields.py +0 -19
  401. llama_stack/core/ui/page/evaluations/__init__.py +0 -5
  402. llama_stack/core/ui/page/evaluations/app_eval.py +0 -143
  403. llama_stack/core/ui/page/evaluations/native_eval.py +0 -253
  404. llama_stack/core/ui/page/playground/__init__.py +0 -5
  405. llama_stack/core/ui/page/playground/chat.py +0 -130
  406. llama_stack/core/ui/page/playground/tools.py +0 -352
  407. llama_stack/distributions/dell/build.yaml +0 -33
  408. llama_stack/distributions/meta-reference-gpu/build.yaml +0 -32
  409. llama_stack/distributions/nvidia/build.yaml +0 -29
  410. llama_stack/distributions/open-benchmark/build.yaml +0 -36
  411. llama_stack/distributions/postgres-demo/__init__.py +0 -7
  412. llama_stack/distributions/postgres-demo/build.yaml +0 -23
  413. llama_stack/distributions/postgres-demo/postgres_demo.py +0 -125
  414. llama_stack/distributions/starter/build.yaml +0 -61
  415. llama_stack/distributions/starter-gpu/build.yaml +0 -61
  416. llama_stack/distributions/watsonx/build.yaml +0 -33
  417. llama_stack/providers/inline/agents/meta_reference/agent_instance.py +0 -1024
  418. llama_stack/providers/inline/agents/meta_reference/persistence.py +0 -228
  419. llama_stack/providers/inline/telemetry/__init__.py +0 -5
  420. llama_stack/providers/inline/telemetry/meta_reference/__init__.py +0 -21
  421. llama_stack/providers/inline/telemetry/meta_reference/config.py +0 -47
  422. llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +0 -252
  423. llama_stack/providers/remote/inference/bedrock/models.py +0 -29
  424. llama_stack/providers/utils/kvstore/sqlite/config.py +0 -20
  425. llama_stack/providers/utils/sqlstore/__init__.py +0 -5
  426. llama_stack/providers/utils/sqlstore/api.py +0 -128
  427. llama_stack/providers/utils/telemetry/__init__.py +0 -5
  428. llama_stack/providers/utils/telemetry/trace_protocol.py +0 -142
  429. llama_stack/providers/utils/telemetry/tracing.py +0 -384
  430. llama_stack/strong_typing/__init__.py +0 -19
  431. llama_stack/strong_typing/auxiliary.py +0 -228
  432. llama_stack/strong_typing/classdef.py +0 -440
  433. llama_stack/strong_typing/core.py +0 -46
  434. llama_stack/strong_typing/deserializer.py +0 -877
  435. llama_stack/strong_typing/docstring.py +0 -409
  436. llama_stack/strong_typing/exception.py +0 -23
  437. llama_stack/strong_typing/inspection.py +0 -1085
  438. llama_stack/strong_typing/mapping.py +0 -40
  439. llama_stack/strong_typing/name.py +0 -182
  440. llama_stack/strong_typing/schema.py +0 -792
  441. llama_stack/strong_typing/serialization.py +0 -97
  442. llama_stack/strong_typing/serializer.py +0 -500
  443. llama_stack/strong_typing/slots.py +0 -27
  444. llama_stack/strong_typing/topological.py +0 -89
  445. llama_stack/ui/node_modules/flatted/python/flatted.py +0 -149
  446. llama_stack-0.3.5.dist-info/RECORD +0 -625
  447. llama_stack-0.3.5.dist-info/top_level.txt +0 -1
  448. /llama_stack/{providers/utils → core/storage}/kvstore/config.py +0 -0
  449. /llama_stack/{providers/utils → core/storage}/kvstore/mongodb/__init__.py +0 -0
  450. /llama_stack/{providers/utils → core/storage}/kvstore/postgres/__init__.py +0 -0
  451. /llama_stack/{providers/utils → core/storage}/kvstore/redis/__init__.py +0 -0
  452. /llama_stack/{providers/utils → core/storage}/kvstore/sqlite/__init__.py +0 -0
  453. /llama_stack/{apis → providers/inline/file_processor}/__init__.py +0 -0
  454. /llama_stack/{apis/common → telemetry}/__init__.py +0 -0
  455. {llama_stack-0.3.5.dist-info → llama_stack-0.4.1.dist-info}/WHEEL +0 -0
  456. {llama_stack-0.3.5.dist-info → llama_stack-0.4.1.dist-info}/entry_points.txt +0 -0
  457. {llama_stack-0.3.5.dist-info → llama_stack-0.4.1.dist-info}/licenses/LICENSE +0 -0
  458. {llama_stack/core/ui → llama_stack_api/common}/__init__.py +0 -0
  459. {llama_stack/strong_typing → llama_stack_api}/py.typed +0 -0
  460. {llama_stack/apis → llama_stack_api}/version.py +0 -0
llama_stack/providers/utils/memory/openai_vector_store_mixin.py

@@ -15,21 +15,37 @@ from typing import Annotated, Any
 from fastapi import Body
 from pydantic import TypeAdapter
 
-from llama_stack.apis.common.errors import VectorStoreNotFoundError
-from llama_stack.apis.files import Files, OpenAIFileObject
-from llama_stack.apis.vector_io import (
+from llama_stack.core.datatypes import VectorStoresConfig
+from llama_stack.core.id_generation import generate_object_id
+from llama_stack.log import get_logger
+from llama_stack.providers.utils.inference.prompt_adapter import (
+    interleaved_content_as_str,
+)
+from llama_stack.providers.utils.memory.vector_store import (
+    ChunkForDeletion,
+    content_from_data_and_mime_type,
+    make_overlapped_chunks,
+)
+from llama_stack_api import (
     Chunk,
+    EmbeddedChunk,
+    Files,
+    Inference,
     OpenAICreateVectorStoreFileBatchRequestWithExtraBody,
     OpenAICreateVectorStoreRequestWithExtraBody,
+    OpenAIEmbeddingsRequestWithExtraBody,
+    OpenAIFileObject,
     QueryChunksResponse,
     SearchRankingOptions,
+    VectorStore,
     VectorStoreChunkingStrategy,
     VectorStoreChunkingStrategyAuto,
     VectorStoreChunkingStrategyStatic,
+    VectorStoreChunkingStrategyStaticConfig,
     VectorStoreContent,
     VectorStoreDeleteResponse,
     VectorStoreFileBatchObject,
-    VectorStoreFileContentsResponse,
+    VectorStoreFileContentResponse,
     VectorStoreFileCounts,
     VectorStoreFileDeleteResponse,
     VectorStoreFileLastError,
@@ -38,29 +54,22 @@ from llama_stack.apis.vector_io import (
     VectorStoreFileStatus,
     VectorStoreListFilesResponse,
     VectorStoreListResponse,
+    VectorStoreNotFoundError,
     VectorStoreObject,
     VectorStoreSearchResponse,
     VectorStoreSearchResponsePage,
 )
-from llama_stack.apis.vector_stores import VectorStore
-from llama_stack.core.id_generation import generate_object_id
-from llama_stack.log import get_logger
-from llama_stack.providers.utils.kvstore.api import KVStore
-from llama_stack.providers.utils.memory.vector_store import (
-    ChunkForDeletion,
-    content_from_data_and_mime_type,
-    make_overlapped_chunks,
+from llama_stack_api.files.models import (
+    RetrieveFileContentRequest,
+    RetrieveFileRequest,
 )
+from llama_stack_api.internal.kvstore import KVStore
 
 EMBEDDING_DIMENSION = 768
 
 logger = get_logger(name=__name__, category="providers::utils")
 
 # Constants for OpenAI vector stores
-CHUNK_MULTIPLIER = 5
-FILE_BATCH_CLEANUP_INTERVAL_SECONDS = 24 * 60 * 60  # 1 day in seconds
-MAX_CONCURRENT_FILES_PER_BATCH = 3  # Maximum concurrent file processing within a batch
-FILE_BATCH_CHUNK_SIZE = 10  # Process files in chunks of this size
 
 VERSION = "v3"
 VECTOR_DBS_PREFIX = f"vector_stores:{VERSION}::"
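Both import hunks reflect the same 0.4.x reorganization: the llama_stack.apis.* modules are removed and their public types are re-exported from the new top-level llama_stack_api package, while the kvstore plumbing moves to llama_stack_api.internal. A minimal migration sketch for downstream code, using only symbols that appear in this diff:

    # 0.3.x imports (these modules are removed in 0.4.x):
    # from llama_stack.apis.common.errors import VectorStoreNotFoundError
    # from llama_stack.apis.files import Files, OpenAIFileObject
    # from llama_stack.apis.vector_io import Chunk

    # 0.4.x: one flat package re-exports the same names
    from llama_stack_api import (
        Chunk,
        EmbeddedChunk,  # new type: a Chunk that carries its embedding
        Files,
        OpenAIFileObject,
        VectorStoreNotFoundError,
    )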
@@ -81,15 +90,29 @@ class OpenAIVectorStoreMixin(ABC):
     # to properly initialize the mixin attributes.
     def __init__(
         self,
+        inference_api: Inference,
         files_api: Files | None = None,
         kvstore: KVStore | None = None,
+        vector_stores_config: VectorStoresConfig | None = None,
     ):
+        if not inference_api:
+            raise RuntimeError("Inference API is required for vector store operations")
+
+        self.inference_api = inference_api
         self.openai_vector_stores: dict[str, dict[str, Any]] = {}
         self.openai_file_batches: dict[str, dict[str, Any]] = {}
         self.files_api = files_api
         self.kvstore = kvstore
+        self.vector_stores_config = vector_stores_config or VectorStoresConfig()
         self._last_file_batch_cleanup_time = 0
         self._file_batch_tasks: dict[str, asyncio.Task[None]] = {}
+        self._vector_store_locks: dict[str, asyncio.Lock] = {}
+
+    def _get_vector_store_lock(self, vector_store_id: str) -> asyncio.Lock:
+        """Get or create a lock for a specific vector store."""
+        if vector_store_id not in self._vector_store_locks:
+            self._vector_store_locks[vector_store_id] = asyncio.Lock()
+        return self._vector_store_locks[vector_store_id]
 
     async def _save_openai_vector_store(self, store_id: str, store_info: dict[str, Any]) -> None:
         """Save vector store metadata to persistent storage."""
@@ -333,8 +356,8 @@ class OpenAIVectorStoreMixin(ABC):
     @abstractmethod
     async def insert_chunks(
         self,
-        vector_db_id: str,
-        chunks: list[Chunk],
+        vector_store_id: str,
+        chunks: list[EmbeddedChunk],
         ttl_seconds: int | None = None,
     ) -> None:
         """Insert chunks into a vector database (provider-specific implementation)."""
@@ -342,7 +365,7 @@ class OpenAIVectorStoreMixin(ABC):
 
     @abstractmethod
     async def query_chunks(
-        self, vector_db_id: str, query: Any, params: dict[str, Any] | None = None
+        self, vector_store_id: str, query: Any, params: dict[str, Any] | None = None
     ) -> QueryChunksResponse:
         """Query chunks from a vector database (provider-specific implementation)."""
         pass
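These two hunks change the provider contract: the keyword is now vector_store_id (was vector_db_id) and insert_chunks receives chunks that already carry embeddings. A hypothetical subclass stub under the new signatures (MyVectorIOAdapter is illustrative, backend calls are elided, and the mixin's other abstract persistence hooks are omitted here):

    from typing import Any

    from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
    from llama_stack_api import EmbeddedChunk, QueryChunksResponse


    class MyVectorIOAdapter(OpenAIVectorStoreMixin):
        async def insert_chunks(
            self,
            vector_store_id: str,
            chunks: list[EmbeddedChunk],
            ttl_seconds: int | None = None,
        ) -> None:
            # Embeddings arrive precomputed; write them straight to the backend.
            for chunk in chunks:
                ...  # backend-specific upsert of (chunk.chunk_id, chunk.embedding, chunk.content)

        async def query_chunks(
            self, vector_store_id: str, query: Any, params: dict[str, Any] | None = None
        ) -> QueryChunksResponse:
            ...  # backend-specific similarity search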
@@ -414,6 +437,10 @@ class OpenAIVectorStoreMixin(ABC):
             in_progress=0,
             total=0,
         )
+        if not params.chunking_strategy or params.chunking_strategy.type == "auto":
+            chunking_strategy = VectorStoreChunkingStrategyStatic(static=VectorStoreChunkingStrategyStaticConfig())
+        else:
+            chunking_strategy = params.chunking_strategy
         store_info: dict[str, Any] = {
             "id": vector_store_id,
             "object": "vector_store",
@@ -426,7 +453,7 @@ class OpenAIVectorStoreMixin(ABC):
             "expires_at": None,
             "last_active_at": created_at,
             "file_ids": [],
-            "chunking_strategy": params.chunking_strategy,
+            "chunking_strategy": chunking_strategy.model_dump(),
         }
 
         # Add provider information to metadata if provided
@@ -434,6 +461,11 @@ class OpenAIVectorStoreMixin(ABC):
             metadata["provider_id"] = provider_id
         if provider_vector_store_id:
             metadata["provider_vector_store_id"] = provider_vector_store_id
+
+        # Add embedding configuration to metadata for file processing
+        metadata["embedding_model"] = embedding_model
+        metadata["embedding_dimension"] = str(embedding_dimension)
+
         store_info["metadata"] = metadata
 
         # Save to persistent storage (provider-specific)
@@ -445,7 +477,13 @@ class OpenAIVectorStoreMixin(ABC):
         # Now that our vector store is created, attach any files that were provided
         file_ids = params.file_ids or []
         tasks = [self.openai_attach_file_to_vector_store(vector_store_id, file_id) for file_id in file_ids]
-        await asyncio.gather(*tasks)
+        # Use return_exceptions=True to handle individual file attachment failures gracefully
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        # Log any exceptions but don't fail the vector store creation
+        for i, result in enumerate(results):
+            if isinstance(result, Exception):
+                logger.warning(f"Failed to attach file {file_ids[i]} to vector store {vector_store_id}: {result}")
 
         # Get the updated store info and return it
         store_info = self.openai_vector_stores[vector_store_id]
@@ -579,7 +617,11 @@ class OpenAIVectorStoreMixin(ABC):
             str | None
         ) = "vector",  # Using str instead of Literal due to OpenAPI schema generator limitations
     ) -> VectorStoreSearchResponsePage:
-        """Search for chunks in a vector store."""
+        """Search for chunks in a vector store.
+
+        Note: Query rewriting is handled at the router level, not here.
+        The rewrite_query parameter is kept for API compatibility but is ignored.
+        """
         max_num_results = max_num_results or 10
 
         # Validate search_mode
@@ -602,21 +644,24 @@ class OpenAIVectorStoreMixin(ABC):
                 else 0.0
             )
             params = {
-                "max_chunks": max_num_results * CHUNK_MULTIPLIER,
+                "max_chunks": max_num_results * self.vector_stores_config.chunk_retrieval_params.chunk_multiplier,
                 "score_threshold": score_threshold,
                 "mode": search_mode,
             }
             # TODO: Add support for ranking_options.ranker
 
             response = await self.query_chunks(
-                vector_db_id=vector_store_id,
+                vector_store_id=vector_store_id,
                 query=search_query,
                 params=params,
             )
 
             # Convert response to OpenAI format
             data = []
-            for chunk, score in zip(response.chunks, response.scores, strict=False):
+            for embedded_chunk, score in zip(response.chunks, response.scores, strict=False):
+                # EmbeddedChunk inherits from Chunk, so use it directly
+                chunk = embedded_chunk
+
                 # Apply filters if provided
                 if filters:
                     # Simple metadata filtering
@@ -637,7 +682,7 @@ class OpenAIVectorStoreMixin(ABC):
                     break
 
             return VectorStoreSearchResponsePage(
-                search_query=search_query,
+                search_query=query if isinstance(query, list) else [query],
                 data=data,
                 has_more=False,  # For simplicity, we don't implement pagination here
                 next_page=None,
@@ -647,7 +692,7 @@ class OpenAIVectorStoreMixin(ABC):
             logger.error(f"Error searching vector store {vector_store_id}: {e}")
             # Return empty results on error
             return VectorStoreSearchResponsePage(
-                search_query=search_query,
+                search_query=query if isinstance(query, list) else [query],
                 data=[],
                 has_more=False,
                 next_page=None,
@@ -699,34 +744,35 @@ class OpenAIVectorStoreMixin(ABC):
             # Unknown filter type, default to no match
             raise ValueError(f"Unsupported filter type: {filter_type}")
 
-    def _chunk_to_vector_store_content(self, chunk: Chunk) -> list[VectorStoreContent]:
-        # content is InterleavedContent
+    def _chunk_to_vector_store_content(
+        self, chunk: EmbeddedChunk, include_embeddings: bool = False, include_metadata: bool = False
+    ) -> list[VectorStoreContent]:
+        def extract_fields() -> dict:
+            """Extract metadata fields from chunk based on include flags."""
+            return {
+                "chunk_metadata": chunk.chunk_metadata if include_metadata else None,
+                "metadata": chunk.metadata if include_metadata else None,
+                "embedding": chunk.embedding if include_embeddings else None,
+            }
+
+        fields = extract_fields()
+
         if isinstance(chunk.content, str):
-            content = [
-                VectorStoreContent(
-                    type="text",
-                    text=chunk.content,
-                )
-            ]
+            content_item = VectorStoreContent(type="text", text=chunk.content, **fields)
+            content = [content_item]
         elif isinstance(chunk.content, list):
             # TODO: Add support for other types of content
-            content = [
-                VectorStoreContent(
-                    type="text",
-                    text=item.text,
-                )
-                for item in chunk.content
-                if item.type == "text"
-            ]
+            content = []
+            for item in chunk.content:
+                if item.type == "text":
+                    content_item = VectorStoreContent(type="text", text=item.text, **fields)
+                    content.append(content_item)
         else:
             if chunk.content.type != "text":
                 raise ValueError(f"Unsupported content type: {chunk.content.type}")
-            content = [
-                VectorStoreContent(
-                    type="text",
-                    text=chunk.content.text,
-                )
-            ]
+
+            content_item = VectorStoreContent(type="text", text=chunk.content.text, **fields)
+            content = [content_item]
         return content
 
     async def openai_attach_file_to_vector_store(
@@ -751,6 +797,7 @@ class OpenAIVectorStoreMixin(ABC):
         chunking_strategy = chunking_strategy or VectorStoreChunkingStrategyAuto()
         created_at = int(time.time())
         chunks: list[Chunk] = []
+        embedded_chunks: list[EmbeddedChunk] = []
         file_response: OpenAIFileObject | None = None
 
         vector_store_file_object = VectorStoreFileObject(
@@ -779,15 +826,22 @@ class OpenAIVectorStoreMixin(ABC):
             chunk_overlap_tokens = 400
 
         try:
-            file_response = await self.files_api.openai_retrieve_file(file_id)
+            file_response = await self.files_api.openai_retrieve_file(RetrieveFileRequest(file_id=file_id))
             mime_type, _ = mimetypes.guess_type(file_response.filename)
-            content_response = await self.files_api.openai_retrieve_file_content(file_id)
+            content_response = await self.files_api.openai_retrieve_file_content(
+                RetrieveFileContentRequest(file_id=file_id)
+            )
 
             content = content_from_data_and_mime_type(content_response.body, mime_type)
 
             chunk_attributes = attributes.copy()
             chunk_attributes["filename"] = file_response.filename
 
+            # Get embedding model info from vector store metadata
+            store_info = self.openai_vector_stores[vector_store_id]
+            embedding_model = store_info["metadata"].get("embedding_model")
+            embedding_dimension = store_info["metadata"].get("embedding_dimension")
+
             chunks = make_overlapped_chunks(
                 file_id,
                 content,
@@ -802,9 +856,42 @@ class OpenAIVectorStoreMixin(ABC):
                     message="No chunks were generated from the file",
                 )
             else:
+                # Validate embedding model and dimension are available
+                if not embedding_model:
+                    raise RuntimeError(f"Vector store {vector_store_id} is not properly configured for file processing")
+                if not embedding_dimension:
+                    raise RuntimeError(f"Vector store {vector_store_id} is not properly configured for file processing")
+
+                # Generate embeddings for all chunks before insertion
+
+                # Prepare embedding request for all chunks
+                params = OpenAIEmbeddingsRequestWithExtraBody(
+                    model=embedding_model,
+                    input=[interleaved_content_as_str(c.content) for c in chunks],
+                )
+                resp = await self.inference_api.openai_embeddings(params)
+
+                # Create EmbeddedChunk instances from chunks and their embeddings
+                for chunk, data in zip(chunks, resp.data, strict=False):
+                    # Ensure embedding is a list of floats
+                    embedding = data.embedding
+                    if isinstance(embedding, str):
+                        # Handle case where embedding might be returned as a string (shouldn't normally happen)
+                        raise ValueError(f"Received string embedding instead of list: {embedding}")
+                    embedded_chunk = EmbeddedChunk(
+                        content=chunk.content,
+                        chunk_id=chunk.chunk_id,
+                        metadata=chunk.metadata,
+                        chunk_metadata=chunk.chunk_metadata,
+                        embedding=embedding,
+                        embedding_model=embedding_model,
+                        embedding_dimension=len(embedding),
+                    )
+                    embedded_chunks.append(embedded_chunk)
+
                 await self.insert_chunks(
-                    vector_db_id=vector_store_id,
-                    chunks=chunks,
+                    vector_store_id=vector_store_id,
+                    chunks=embedded_chunks,
                 )
             vector_store_file_object.status = "completed"
         except Exception as e:
@@ -815,26 +902,27 @@ class OpenAIVectorStoreMixin(ABC):
                 message=str(e),
             )
 
-        # Create OpenAI vector store file metadata
+        # Save vector store file to persistent storage AFTER insert_chunks
+        # so that chunks include the embeddings that were generated
         file_info = vector_store_file_object.model_dump(exclude={"last_error"})
         file_info["filename"] = file_response.filename if file_response else ""
 
-        # Save vector store file to persistent storage (provider-specific)
-        dict_chunks = [c.model_dump() for c in chunks]
-        # This should be updated to include chunk_id
+        dict_chunks = [c.model_dump() for c in embedded_chunks]
         await self._save_openai_vector_store_file(vector_store_id, file_id, file_info, dict_chunks)
 
         # Update file_ids and file_counts in vector store metadata
-        store_info = self.openai_vector_stores[vector_store_id].copy()
-        store_info["file_ids"].append(file_id)
-        store_info["file_counts"]["total"] += 1
-        store_info["file_counts"][vector_store_file_object.status] += 1
-
-        # Save updated vector store to persistent storage
-        await self._save_openai_vector_store(vector_store_id, store_info)
-
-        # Update vector store in-memory cache
-        self.openai_vector_stores[vector_store_id] = store_info
+        # Use lock to prevent race condition when multiple files are attached concurrently
+        async with self._get_vector_store_lock(vector_store_id):
+            store_info = self.openai_vector_stores[vector_store_id].copy()
+            # Deep copy file_counts to avoid mutating shared dict
+            store_info["file_counts"] = store_info["file_counts"].copy()
+            store_info["file_ids"] = store_info["file_ids"].copy()
+            store_info["file_ids"].append(file_id)
+            store_info["file_counts"]["total"] += 1
+            store_info["file_counts"][vector_store_file_object.status] += 1
+
+            # Save updated vector store to persistent storage
+            await self._save_openai_vector_store(vector_store_id, store_info)
 
         return vector_store_file_object
 
@@ -886,8 +974,8 @@ class OpenAIVectorStoreMixin(ABC):
 
         # Determine pagination info
         has_more = len(file_objects) > limit
-        first_id = file_objects[0].id if file_objects else None
-        last_id = file_objects[-1].id if file_objects else None
+        first_id = limited_files[0].id if file_objects else None
+        last_id = limited_files[-1].id if file_objects else None
 
         return VectorStoreListFilesResponse(
             data=limited_files,
@@ -916,22 +1004,27 @@ class OpenAIVectorStoreMixin(ABC):
         self,
         vector_store_id: str,
         file_id: str,
-    ) -> VectorStoreFileContentsResponse:
+        include_embeddings: bool | None = False,
+        include_metadata: bool | None = False,
+    ) -> VectorStoreFileContentResponse:
         """Retrieves the contents of a vector store file."""
         if vector_store_id not in self.openai_vector_stores:
             raise VectorStoreNotFoundError(vector_store_id)
 
-        file_info = await self._load_openai_vector_store_file(vector_store_id, file_id)
+        # Parameters are already provided directly
+        # include_embeddings and include_metadata are now function parameters
+
         dict_chunks = await self._load_openai_vector_store_file_contents(vector_store_id, file_id)
-        chunks = [Chunk.model_validate(c) for c in dict_chunks]
+        chunks = [EmbeddedChunk.model_validate(c) for c in dict_chunks]
         content = []
         for chunk in chunks:
-            content.extend(self._chunk_to_vector_store_content(chunk))
-        return VectorStoreFileContentsResponse(
-            file_id=file_id,
-            filename=file_info.get("filename", ""),
-            attributes=file_info.get("attributes", {}),
-            content=content,
+            content.extend(
+                self._chunk_to_vector_store_content(
+                    chunk, include_embeddings=include_embeddings or False, include_metadata=include_metadata or False
+                )
+            )
+        return VectorStoreFileContentResponse(
+            data=content,
         )
 
     async def openai_update_vector_store_file(
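For callers, the file-contents endpoint changes shape: the response type is renamed to VectorStoreFileContentResponse, its payload moves to a data field, and two new flags opt into embeddings and metadata per content item. A hypothetical call against a provider built on this mixin (provider is illustrative):

    async def show_contents(provider) -> None:
        response = await provider.openai_retrieve_vector_store_file_contents(
            vector_store_id="vs_123",
            file_id="file_abc",
            include_embeddings=True,  # populate item.embedding on each content item
            include_metadata=True,    # populate item.metadata / item.chunk_metadata
        )
        for item in response.data:
            print(item.text, item.embedding is not None)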
@@ -1048,7 +1141,10 @@ class OpenAIVectorStoreMixin(ABC):
 
         # Run cleanup if needed (throttled to once every 1 day)
         current_time = int(time.time())
-        if current_time - self._last_file_batch_cleanup_time >= FILE_BATCH_CLEANUP_INTERVAL_SECONDS:
+        if (
+            current_time - self._last_file_batch_cleanup_time
+            >= self.vector_stores_config.file_batch_params.cleanup_interval_seconds
+        ):
             logger.info("Running throttled cleanup of expired file batches")
             asyncio.create_task(self._cleanup_expired_file_batches())
             self._last_file_batch_cleanup_time = current_time
@@ -1065,7 +1161,7 @@ class OpenAIVectorStoreMixin(ABC):
         batch_info: dict[str, Any],
     ) -> None:
         """Process files with controlled concurrency and chunking."""
-        semaphore = asyncio.Semaphore(MAX_CONCURRENT_FILES_PER_BATCH)
+        semaphore = asyncio.Semaphore(self.vector_stores_config.file_batch_params.max_concurrent_files_per_batch)
 
         async def process_single_file(file_id: str) -> tuple[str, bool]:
             """Process a single file with concurrency control."""
@@ -1084,12 +1180,13 @@ class OpenAIVectorStoreMixin(ABC):
 
         # Process files in chunks to avoid creating too many tasks at once
         total_files = len(file_ids)
-        for chunk_start in range(0, total_files, FILE_BATCH_CHUNK_SIZE):
-            chunk_end = min(chunk_start + FILE_BATCH_CHUNK_SIZE, total_files)
+        chunk_size = self.vector_stores_config.file_batch_params.file_batch_chunk_size
+        for chunk_start in range(0, total_files, chunk_size):
+            chunk_end = min(chunk_start + chunk_size, total_files)
             chunk = file_ids[chunk_start:chunk_end]
 
-            chunk_num = chunk_start // FILE_BATCH_CHUNK_SIZE + 1
-            total_chunks = (total_files + FILE_BATCH_CHUNK_SIZE - 1) // FILE_BATCH_CHUNK_SIZE
+            chunk_num = chunk_start // chunk_size + 1
+            total_chunks = (total_files + chunk_size - 1) // chunk_size
             logger.info(
                 f"Processing chunk {chunk_num} of {total_chunks} ({len(chunk)} files, {chunk_start + 1}-{chunk_end} of {total_files} total files)"
             )
llama_stack/providers/utils/memory/vector_store.py

@@ -17,21 +17,25 @@ import numpy as np
 from numpy.typing import NDArray
 from pydantic import BaseModel
 
-from llama_stack.apis.common.content_types import (
-    URL,
-    InterleavedContent,
-)
-from llama_stack.apis.inference import OpenAIEmbeddingsRequestWithExtraBody
-from llama_stack.apis.tools import RAGDocument
-from llama_stack.apis.vector_io import Chunk, ChunkMetadata, QueryChunksResponse
-from llama_stack.apis.vector_stores import VectorStore
+from llama_stack.core.datatypes import VectorStoresConfig
 from llama_stack.log import get_logger
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer
-from llama_stack.providers.datatypes import Api
 from llama_stack.providers.utils.inference.prompt_adapter import (
     interleaved_content_as_str,
 )
 from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
+from llama_stack_api import (
+    URL,
+    Api,
+    Chunk,
+    ChunkMetadata,
+    EmbeddedChunk,
+    InterleavedContent,
+    OpenAIEmbeddingsRequestWithExtraBody,
+    QueryChunksResponse,
+    RAGDocument,
+    VectorStore,
+)
 
 log = get_logger(name=__name__, category="providers::utils")
 
@@ -155,7 +159,11 @@ async def content_from_doc(doc: RAGDocument) -> str:
 
 
 def make_overlapped_chunks(
-    document_id: str, text: str, window_len: int, overlap_len: int, metadata: dict[str, Any]
+    document_id: str,
+    text: str,
+    window_len: int,
+    overlap_len: int,
+    metadata: dict[str, Any],
 ) -> list[Chunk]:
     default_tokenizer = "DEFAULT_TIKTOKEN_TOKENIZER"
     tokenizer = Tokenizer.get_instance()
@@ -187,7 +195,6 @@ def make_overlapped_chunks(
             updated_timestamp=int(time.time()),
             chunk_window=chunk_window,
             chunk_tokenizer=default_tokenizer,
-            chunk_embedding_model=None,  # This will be set in `VectorStoreWithIndex.insert_chunks`
             content_token_count=len(toks),
             metadata_token_count=len(metadata_tokens),
         )
@@ -196,6 +203,7 @@ def make_overlapped_chunks(
         chunks.append(
             Chunk(
                 content=chunk,
+                chunk_id=chunk_id,
                 metadata=chunk_metadata,
                 chunk_metadata=backend_chunk_metadata,
             )
@@ -222,7 +230,7 @@ def _validate_embedding(embedding: NDArray, index: int, expected_dimension: int)
 
 class EmbeddingIndex(ABC):
     @abstractmethod
-    async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
+    async def add_chunks(self, embedded_chunks: list[EmbeddedChunk]):
         raise NotImplementedError()
 
     @abstractmethod
@@ -259,38 +267,25 @@ class VectorStoreWithIndex:
     vector_store: VectorStore
     index: EmbeddingIndex
     inference_api: Api.inference
+    vector_stores_config: VectorStoresConfig | None = None
 
     async def insert_chunks(
         self,
-        chunks: list[Chunk],
+        chunks: list[EmbeddedChunk],
     ) -> None:
-        chunks_to_embed = []
-        for i, c in enumerate(chunks):
-            if c.embedding is None:
-                chunks_to_embed.append(c)
-                if c.chunk_metadata:
-                    c.chunk_metadata.chunk_embedding_model = self.vector_store.embedding_model
-                    c.chunk_metadata.chunk_embedding_dimension = self.vector_store.embedding_dimension
-            else:
-                _validate_embedding(c.embedding, i, self.vector_store.embedding_dimension)
-
-        if chunks_to_embed:
-            params = OpenAIEmbeddingsRequestWithExtraBody(
-                model=self.vector_store.embedding_model,
-                input=[c.content for c in chunks_to_embed],
-            )
-            resp = await self.inference_api.openai_embeddings(params)
-            for c, data in zip(chunks_to_embed, resp.data, strict=False):
-                c.embedding = data.embedding
+        # Validate embedding dimensions match the vector store
+        for i, embedded_chunk in enumerate(chunks):
+            _validate_embedding(embedded_chunk.embedding, i, self.vector_store.embedding_dimension)
 
-        embeddings = np.array([c.embedding for c in chunks], dtype=np.float32)
-        await self.index.add_chunks(chunks, embeddings)
+        await self.index.add_chunks(chunks)
 
     async def query_chunks(
         self,
         query: InterleavedContent,
         params: dict[str, Any] | None = None,
     ) -> QueryChunksResponse:
+        config = self.vector_stores_config or VectorStoresConfig()
+
         if params is None:
             params = {}
         k = params.get("max_chunks", 3)
@@ -299,19 +294,25 @@ class VectorStoreWithIndex:
 
         ranker = params.get("ranker")
         if ranker is None:
-            reranker_type = RERANKER_TYPE_RRF
-            reranker_params = {"impact_factor": 60.0}
+            reranker_type = (
+                RERANKER_TYPE_RRF
+                if config.chunk_retrieval_params.default_reranker_strategy == "rrf"
+                else config.chunk_retrieval_params.default_reranker_strategy
+            )
+            reranker_params = {"impact_factor": config.chunk_retrieval_params.rrf_impact_factor}
         else:
-            strategy = ranker.get("strategy", "rrf")
+            strategy = ranker.get("strategy", config.chunk_retrieval_params.default_reranker_strategy)
             if strategy == "weighted":
                 weights = ranker.get("params", {}).get("weights", [0.5, 0.5])
                 reranker_type = RERANKER_TYPE_WEIGHTED
-                reranker_params = {"alpha": weights[0] if len(weights) > 0 else 0.5}
+                reranker_params = {
+                    "alpha": weights[0] if len(weights) > 0 else config.chunk_retrieval_params.weighted_search_alpha
+                }
             elif strategy == "normalized":
                 reranker_type = RERANKER_TYPE_NORMALIZED
             else:
                 reranker_type = RERANKER_TYPE_RRF
-                k_value = ranker.get("params", {}).get("k", 60.0)
+                k_value = ranker.get("params", {}).get("k", config.chunk_retrieval_params.rrf_impact_factor)
                 reranker_params = {"impact_factor": k_value}
 
         query_string = interleaved_content_as_str(query)
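With the hard-coded reranker defaults replaced by VectorStoresConfig.chunk_retrieval_params, per-query overrides still flow through params["ranker"]. A sketch of the two override shapes this branch accepts (values are illustrative, and store_with_index stands for any VectorStoreWithIndex instance):

    # Weighted interpolation between the two retrieval scores:
    weighted = {
        "max_chunks": 10,
        "mode": "hybrid",
        "ranker": {"strategy": "weighted", "params": {"weights": [0.7, 0.3]}},
    }

    # Any strategy other than "weighted" or "normalized" falls through to
    # reciprocal-rank fusion; "k" sets the impact factor:
    rrf = {
        "max_chunks": 10,
        "mode": "hybrid",
        "ranker": {"strategy": "rrf", "params": {"k": 60.0}},
    }


    async def demo(store_with_index) -> None:
        response = await store_with_index.query_chunks(query="how are chunks reranked?", params=weighted)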
llama_stack/providers/utils/pagination.py

@@ -6,7 +6,7 @@
 
 from typing import Any
 
-from llama_stack.apis.common.responses import PaginatedResponse
+from llama_stack_api import PaginatedResponse
 
 
 def paginate_records(