llama-stack 0.4.3__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (311)
  1. llama_stack/cli/stack/_list_deps.py +11 -7
  2. llama_stack/cli/stack/run.py +3 -25
  3. llama_stack/core/access_control/datatypes.py +78 -0
  4. llama_stack/core/configure.py +2 -2
  5. {llama_stack_api/internal → llama_stack/core/connectors}/__init__.py +2 -2
  6. llama_stack/core/connectors/connectors.py +162 -0
  7. llama_stack/core/conversations/conversations.py +61 -58
  8. llama_stack/core/datatypes.py +54 -8
  9. llama_stack/core/library_client.py +60 -13
  10. llama_stack/core/prompts/prompts.py +43 -42
  11. llama_stack/core/routers/datasets.py +20 -17
  12. llama_stack/core/routers/eval_scoring.py +143 -53
  13. llama_stack/core/routers/inference.py +20 -9
  14. llama_stack/core/routers/safety.py +30 -42
  15. llama_stack/core/routers/vector_io.py +15 -7
  16. llama_stack/core/routing_tables/models.py +42 -3
  17. llama_stack/core/routing_tables/scoring_functions.py +19 -19
  18. llama_stack/core/routing_tables/shields.py +20 -17
  19. llama_stack/core/routing_tables/vector_stores.py +8 -5
  20. llama_stack/core/server/auth.py +192 -17
  21. llama_stack/core/server/fastapi_router_registry.py +40 -5
  22. llama_stack/core/server/server.py +24 -5
  23. llama_stack/core/stack.py +54 -10
  24. llama_stack/core/storage/datatypes.py +9 -0
  25. llama_stack/core/store/registry.py +1 -1
  26. llama_stack/core/utils/exec.py +2 -2
  27. llama_stack/core/utils/type_inspection.py +16 -2
  28. llama_stack/distributions/dell/config.yaml +4 -1
  29. llama_stack/distributions/dell/doc_template.md +209 -0
  30. llama_stack/distributions/dell/run-with-safety.yaml +4 -1
  31. llama_stack/distributions/nvidia/config.yaml +4 -1
  32. llama_stack/distributions/nvidia/doc_template.md +170 -0
  33. llama_stack/distributions/nvidia/run-with-safety.yaml +4 -1
  34. llama_stack/distributions/oci/config.yaml +4 -1
  35. llama_stack/distributions/oci/doc_template.md +140 -0
  36. llama_stack/distributions/open-benchmark/config.yaml +9 -1
  37. llama_stack/distributions/postgres-demo/config.yaml +1 -1
  38. llama_stack/distributions/starter/build.yaml +62 -0
  39. llama_stack/distributions/starter/config.yaml +22 -3
  40. llama_stack/distributions/starter/run-with-postgres-store.yaml +22 -3
  41. llama_stack/distributions/starter/starter.py +13 -1
  42. llama_stack/distributions/starter-gpu/build.yaml +62 -0
  43. llama_stack/distributions/starter-gpu/config.yaml +22 -3
  44. llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +22 -3
  45. llama_stack/distributions/template.py +10 -2
  46. llama_stack/distributions/watsonx/config.yaml +4 -1
  47. llama_stack/log.py +1 -0
  48. llama_stack/models/llama/resources/dog.jpg +0 -0
  49. llama_stack/models/llama/resources/pasta.jpeg +0 -0
  50. llama_stack/models/llama/resources/small_dog.jpg +0 -0
  51. llama_stack/providers/inline/agents/meta_reference/__init__.py +1 -0
  52. llama_stack/providers/inline/agents/meta_reference/agents.py +58 -61
  53. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +187 -60
  54. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +99 -22
  55. llama_stack/providers/inline/agents/meta_reference/responses/types.py +2 -1
  56. llama_stack/providers/inline/agents/meta_reference/responses/utils.py +4 -1
  57. llama_stack/providers/inline/agents/meta_reference/safety.py +2 -2
  58. llama_stack/providers/inline/batches/reference/batches.py +2 -1
  59. llama_stack/providers/inline/eval/meta_reference/eval.py +40 -32
  60. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.h +9 -0
  61. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.swift +189 -0
  62. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/Parsing.swift +238 -0
  63. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/PromptTemplate.swift +12 -0
  64. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/SystemPrompts.swift +89 -0
  65. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.pbxproj +550 -0
  66. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
  67. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
  68. llama_stack/providers/inline/post_training/huggingface/post_training.py +33 -38
  69. llama_stack/providers/inline/post_training/huggingface/utils.py +2 -5
  70. llama_stack/providers/inline/post_training/torchtune/common/utils.py +5 -9
  71. llama_stack/providers/inline/post_training/torchtune/post_training.py +28 -33
  72. llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +2 -4
  73. llama_stack/providers/inline/safety/code_scanner/code_scanner.py +12 -15
  74. llama_stack/providers/inline/safety/llama_guard/llama_guard.py +20 -24
  75. llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +11 -17
  76. llama_stack/providers/inline/scoring/basic/scoring.py +13 -17
  77. llama_stack/providers/inline/scoring/braintrust/braintrust.py +15 -15
  78. llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +13 -17
  79. llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +1 -1
  80. llama_stack/providers/registry/agents.py +1 -0
  81. llama_stack/providers/registry/inference.py +1 -9
  82. llama_stack/providers/registry/vector_io.py +136 -16
  83. llama_stack/providers/remote/datasetio/nvidia/README.md +74 -0
  84. llama_stack/providers/remote/eval/nvidia/README.md +134 -0
  85. llama_stack/providers/remote/eval/nvidia/eval.py +22 -21
  86. llama_stack/providers/remote/files/s3/README.md +266 -0
  87. llama_stack/providers/remote/files/s3/config.py +5 -3
  88. llama_stack/providers/remote/files/s3/files.py +2 -2
  89. llama_stack/providers/remote/inference/gemini/gemini.py +4 -0
  90. llama_stack/providers/remote/inference/nvidia/NVIDIA.md +203 -0
  91. llama_stack/providers/remote/inference/openai/openai.py +2 -0
  92. llama_stack/providers/remote/inference/together/together.py +4 -0
  93. llama_stack/providers/remote/inference/vertexai/config.py +3 -3
  94. llama_stack/providers/remote/inference/vertexai/vertexai.py +5 -2
  95. llama_stack/providers/remote/inference/vllm/config.py +37 -18
  96. llama_stack/providers/remote/inference/vllm/vllm.py +0 -3
  97. llama_stack/providers/remote/inference/watsonx/watsonx.py +4 -0
  98. llama_stack/providers/remote/post_training/nvidia/README.md +151 -0
  99. llama_stack/providers/remote/post_training/nvidia/models.py +3 -11
  100. llama_stack/providers/remote/post_training/nvidia/post_training.py +31 -33
  101. llama_stack/providers/remote/safety/bedrock/bedrock.py +10 -27
  102. llama_stack/providers/remote/safety/nvidia/README.md +78 -0
  103. llama_stack/providers/remote/safety/nvidia/nvidia.py +9 -25
  104. llama_stack/providers/remote/safety/sambanova/sambanova.py +13 -11
  105. llama_stack/providers/remote/vector_io/elasticsearch/__init__.py +17 -0
  106. llama_stack/providers/remote/vector_io/elasticsearch/config.py +32 -0
  107. llama_stack/providers/remote/vector_io/elasticsearch/elasticsearch.py +463 -0
  108. llama_stack/providers/remote/vector_io/oci/__init__.py +22 -0
  109. llama_stack/providers/remote/vector_io/oci/config.py +41 -0
  110. llama_stack/providers/remote/vector_io/oci/oci26ai.py +595 -0
  111. llama_stack/providers/remote/vector_io/pgvector/config.py +69 -2
  112. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +255 -6
  113. llama_stack/providers/remote/vector_io/qdrant/qdrant.py +62 -38
  114. llama_stack/providers/utils/bedrock/client.py +3 -3
  115. llama_stack/providers/utils/bedrock/config.py +7 -7
  116. llama_stack/providers/utils/inference/__init__.py +0 -25
  117. llama_stack/providers/utils/inference/embedding_mixin.py +4 -0
  118. llama_stack/providers/utils/inference/http_client.py +239 -0
  119. llama_stack/providers/utils/inference/litellm_openai_mixin.py +6 -0
  120. llama_stack/providers/utils/inference/model_registry.py +148 -2
  121. llama_stack/providers/utils/inference/openai_compat.py +1 -158
  122. llama_stack/providers/utils/inference/openai_mixin.py +42 -2
  123. llama_stack/providers/utils/inference/prompt_adapter.py +0 -209
  124. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +92 -5
  125. llama_stack/providers/utils/memory/vector_store.py +46 -19
  126. llama_stack/providers/utils/responses/responses_store.py +40 -6
  127. llama_stack/providers/utils/safety.py +114 -0
  128. llama_stack/providers/utils/tools/mcp.py +44 -3
  129. llama_stack/testing/api_recorder.py +9 -3
  130. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0.dist-info}/METADATA +14 -2
  131. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0.dist-info}/RECORD +135 -279
  132. llama_stack-0.5.0.dist-info/top_level.txt +1 -0
  133. llama_stack/distributions/meta-reference-gpu/__init__.py +0 -7
  134. llama_stack/distributions/meta-reference-gpu/config.yaml +0 -140
  135. llama_stack/distributions/meta-reference-gpu/meta_reference.py +0 -163
  136. llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +0 -155
  137. llama_stack/models/llama/hadamard_utils.py +0 -88
  138. llama_stack/models/llama/llama3/args.py +0 -74
  139. llama_stack/models/llama/llama3/generation.py +0 -378
  140. llama_stack/models/llama/llama3/model.py +0 -304
  141. llama_stack/models/llama/llama3/multimodal/__init__.py +0 -12
  142. llama_stack/models/llama/llama3/multimodal/encoder_utils.py +0 -180
  143. llama_stack/models/llama/llama3/multimodal/image_transform.py +0 -409
  144. llama_stack/models/llama/llama3/multimodal/model.py +0 -1430
  145. llama_stack/models/llama/llama3/multimodal/utils.py +0 -26
  146. llama_stack/models/llama/llama3/quantization/__init__.py +0 -5
  147. llama_stack/models/llama/llama3/quantization/loader.py +0 -316
  148. llama_stack/models/llama/llama3_1/__init__.py +0 -12
  149. llama_stack/models/llama/llama3_1/prompt_format.md +0 -358
  150. llama_stack/models/llama/llama3_1/prompts.py +0 -258
  151. llama_stack/models/llama/llama3_2/__init__.py +0 -5
  152. llama_stack/models/llama/llama3_2/prompts_text.py +0 -229
  153. llama_stack/models/llama/llama3_2/prompts_vision.py +0 -126
  154. llama_stack/models/llama/llama3_2/text_prompt_format.md +0 -286
  155. llama_stack/models/llama/llama3_2/vision_prompt_format.md +0 -141
  156. llama_stack/models/llama/llama3_3/__init__.py +0 -5
  157. llama_stack/models/llama/llama3_3/prompts.py +0 -259
  158. llama_stack/models/llama/llama4/args.py +0 -107
  159. llama_stack/models/llama/llama4/ffn.py +0 -58
  160. llama_stack/models/llama/llama4/moe.py +0 -214
  161. llama_stack/models/llama/llama4/preprocess.py +0 -435
  162. llama_stack/models/llama/llama4/quantization/__init__.py +0 -5
  163. llama_stack/models/llama/llama4/quantization/loader.py +0 -226
  164. llama_stack/models/llama/llama4/vision/__init__.py +0 -5
  165. llama_stack/models/llama/llama4/vision/embedding.py +0 -210
  166. llama_stack/models/llama/llama4/vision/encoder.py +0 -412
  167. llama_stack/models/llama/quantize_impls.py +0 -316
  168. llama_stack/providers/inline/inference/meta_reference/__init__.py +0 -20
  169. llama_stack/providers/inline/inference/meta_reference/common.py +0 -24
  170. llama_stack/providers/inline/inference/meta_reference/config.py +0 -68
  171. llama_stack/providers/inline/inference/meta_reference/generators.py +0 -201
  172. llama_stack/providers/inline/inference/meta_reference/inference.py +0 -542
  173. llama_stack/providers/inline/inference/meta_reference/model_parallel.py +0 -77
  174. llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +0 -353
  175. llama_stack-0.4.3.dist-info/top_level.txt +0 -2
  176. llama_stack_api/__init__.py +0 -945
  177. llama_stack_api/admin/__init__.py +0 -45
  178. llama_stack_api/admin/api.py +0 -72
  179. llama_stack_api/admin/fastapi_routes.py +0 -117
  180. llama_stack_api/admin/models.py +0 -113
  181. llama_stack_api/agents.py +0 -173
  182. llama_stack_api/batches/__init__.py +0 -40
  183. llama_stack_api/batches/api.py +0 -53
  184. llama_stack_api/batches/fastapi_routes.py +0 -113
  185. llama_stack_api/batches/models.py +0 -78
  186. llama_stack_api/benchmarks/__init__.py +0 -43
  187. llama_stack_api/benchmarks/api.py +0 -39
  188. llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  189. llama_stack_api/benchmarks/models.py +0 -109
  190. llama_stack_api/common/__init__.py +0 -5
  191. llama_stack_api/common/content_types.py +0 -101
  192. llama_stack_api/common/errors.py +0 -95
  193. llama_stack_api/common/job_types.py +0 -38
  194. llama_stack_api/common/responses.py +0 -77
  195. llama_stack_api/common/training_types.py +0 -47
  196. llama_stack_api/common/type_system.py +0 -146
  197. llama_stack_api/connectors.py +0 -146
  198. llama_stack_api/conversations.py +0 -270
  199. llama_stack_api/datasetio.py +0 -55
  200. llama_stack_api/datasets/__init__.py +0 -61
  201. llama_stack_api/datasets/api.py +0 -35
  202. llama_stack_api/datasets/fastapi_routes.py +0 -104
  203. llama_stack_api/datasets/models.py +0 -152
  204. llama_stack_api/datatypes.py +0 -373
  205. llama_stack_api/eval.py +0 -137
  206. llama_stack_api/file_processors/__init__.py +0 -27
  207. llama_stack_api/file_processors/api.py +0 -64
  208. llama_stack_api/file_processors/fastapi_routes.py +0 -78
  209. llama_stack_api/file_processors/models.py +0 -42
  210. llama_stack_api/files/__init__.py +0 -35
  211. llama_stack_api/files/api.py +0 -51
  212. llama_stack_api/files/fastapi_routes.py +0 -124
  213. llama_stack_api/files/models.py +0 -107
  214. llama_stack_api/inference.py +0 -1169
  215. llama_stack_api/inspect_api/__init__.py +0 -37
  216. llama_stack_api/inspect_api/api.py +0 -25
  217. llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  218. llama_stack_api/inspect_api/models.py +0 -28
  219. llama_stack_api/internal/kvstore.py +0 -28
  220. llama_stack_api/internal/sqlstore.py +0 -81
  221. llama_stack_api/llama_stack_api/__init__.py +0 -945
  222. llama_stack_api/llama_stack_api/admin/__init__.py +0 -45
  223. llama_stack_api/llama_stack_api/admin/api.py +0 -72
  224. llama_stack_api/llama_stack_api/admin/fastapi_routes.py +0 -117
  225. llama_stack_api/llama_stack_api/admin/models.py +0 -113
  226. llama_stack_api/llama_stack_api/agents.py +0 -173
  227. llama_stack_api/llama_stack_api/batches/__init__.py +0 -40
  228. llama_stack_api/llama_stack_api/batches/api.py +0 -53
  229. llama_stack_api/llama_stack_api/batches/fastapi_routes.py +0 -113
  230. llama_stack_api/llama_stack_api/batches/models.py +0 -78
  231. llama_stack_api/llama_stack_api/benchmarks/__init__.py +0 -43
  232. llama_stack_api/llama_stack_api/benchmarks/api.py +0 -39
  233. llama_stack_api/llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  234. llama_stack_api/llama_stack_api/benchmarks/models.py +0 -109
  235. llama_stack_api/llama_stack_api/common/__init__.py +0 -5
  236. llama_stack_api/llama_stack_api/common/content_types.py +0 -101
  237. llama_stack_api/llama_stack_api/common/errors.py +0 -95
  238. llama_stack_api/llama_stack_api/common/job_types.py +0 -38
  239. llama_stack_api/llama_stack_api/common/responses.py +0 -77
  240. llama_stack_api/llama_stack_api/common/training_types.py +0 -47
  241. llama_stack_api/llama_stack_api/common/type_system.py +0 -146
  242. llama_stack_api/llama_stack_api/connectors.py +0 -146
  243. llama_stack_api/llama_stack_api/conversations.py +0 -270
  244. llama_stack_api/llama_stack_api/datasetio.py +0 -55
  245. llama_stack_api/llama_stack_api/datasets/__init__.py +0 -61
  246. llama_stack_api/llama_stack_api/datasets/api.py +0 -35
  247. llama_stack_api/llama_stack_api/datasets/fastapi_routes.py +0 -104
  248. llama_stack_api/llama_stack_api/datasets/models.py +0 -152
  249. llama_stack_api/llama_stack_api/datatypes.py +0 -373
  250. llama_stack_api/llama_stack_api/eval.py +0 -137
  251. llama_stack_api/llama_stack_api/file_processors/__init__.py +0 -27
  252. llama_stack_api/llama_stack_api/file_processors/api.py +0 -64
  253. llama_stack_api/llama_stack_api/file_processors/fastapi_routes.py +0 -78
  254. llama_stack_api/llama_stack_api/file_processors/models.py +0 -42
  255. llama_stack_api/llama_stack_api/files/__init__.py +0 -35
  256. llama_stack_api/llama_stack_api/files/api.py +0 -51
  257. llama_stack_api/llama_stack_api/files/fastapi_routes.py +0 -124
  258. llama_stack_api/llama_stack_api/files/models.py +0 -107
  259. llama_stack_api/llama_stack_api/inference.py +0 -1169
  260. llama_stack_api/llama_stack_api/inspect_api/__init__.py +0 -37
  261. llama_stack_api/llama_stack_api/inspect_api/api.py +0 -25
  262. llama_stack_api/llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  263. llama_stack_api/llama_stack_api/inspect_api/models.py +0 -28
  264. llama_stack_api/llama_stack_api/internal/__init__.py +0 -9
  265. llama_stack_api/llama_stack_api/internal/kvstore.py +0 -28
  266. llama_stack_api/llama_stack_api/internal/sqlstore.py +0 -81
  267. llama_stack_api/llama_stack_api/models.py +0 -171
  268. llama_stack_api/llama_stack_api/openai_responses.py +0 -1468
  269. llama_stack_api/llama_stack_api/post_training.py +0 -370
  270. llama_stack_api/llama_stack_api/prompts.py +0 -203
  271. llama_stack_api/llama_stack_api/providers/__init__.py +0 -33
  272. llama_stack_api/llama_stack_api/providers/api.py +0 -16
  273. llama_stack_api/llama_stack_api/providers/fastapi_routes.py +0 -57
  274. llama_stack_api/llama_stack_api/providers/models.py +0 -24
  275. llama_stack_api/llama_stack_api/py.typed +0 -0
  276. llama_stack_api/llama_stack_api/rag_tool.py +0 -168
  277. llama_stack_api/llama_stack_api/resource.py +0 -37
  278. llama_stack_api/llama_stack_api/router_utils.py +0 -160
  279. llama_stack_api/llama_stack_api/safety.py +0 -132
  280. llama_stack_api/llama_stack_api/schema_utils.py +0 -208
  281. llama_stack_api/llama_stack_api/scoring.py +0 -93
  282. llama_stack_api/llama_stack_api/scoring_functions.py +0 -211
  283. llama_stack_api/llama_stack_api/shields.py +0 -93
  284. llama_stack_api/llama_stack_api/tools.py +0 -226
  285. llama_stack_api/llama_stack_api/vector_io.py +0 -941
  286. llama_stack_api/llama_stack_api/vector_stores.py +0 -53
  287. llama_stack_api/llama_stack_api/version.py +0 -9
  288. llama_stack_api/models.py +0 -171
  289. llama_stack_api/openai_responses.py +0 -1468
  290. llama_stack_api/post_training.py +0 -370
  291. llama_stack_api/prompts.py +0 -203
  292. llama_stack_api/providers/__init__.py +0 -33
  293. llama_stack_api/providers/api.py +0 -16
  294. llama_stack_api/providers/fastapi_routes.py +0 -57
  295. llama_stack_api/providers/models.py +0 -24
  296. llama_stack_api/py.typed +0 -0
  297. llama_stack_api/rag_tool.py +0 -168
  298. llama_stack_api/resource.py +0 -37
  299. llama_stack_api/router_utils.py +0 -160
  300. llama_stack_api/safety.py +0 -132
  301. llama_stack_api/schema_utils.py +0 -208
  302. llama_stack_api/scoring.py +0 -93
  303. llama_stack_api/scoring_functions.py +0 -211
  304. llama_stack_api/shields.py +0 -93
  305. llama_stack_api/tools.py +0 -226
  306. llama_stack_api/vector_io.py +0 -941
  307. llama_stack_api/vector_stores.py +0 -53
  308. llama_stack_api/version.py +0 -9
  309. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0.dist-info}/WHEEL +0 -0
  310. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0.dist-info}/entry_points.txt +0 -0
  311. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -190,7 +190,7 @@ class CachedDiskDistributionRegistry(DiskDistributionRegistry):
 
 
 async def create_dist_registry(
-    metadata_store: KVStoreReference, image_name: str
+    metadata_store: KVStoreReference, distro_name: str
 ) -> tuple[CachedDiskDistributionRegistry, KVStore]:
     # instantiate kvstore for storing and retrieving distribution metadata
     dist_kvstore = await kvstore_impl(metadata_store)
@@ -17,10 +17,10 @@ from llama_stack.log import get_logger
 log = get_logger(name=__name__, category="core")
 
 
-def formulate_run_args(image_type: str, image_name: str) -> list:
+def formulate_run_args(image_type: str, distro_name: str) -> list:
     # Only venv is supported now
     current_venv = os.environ.get("VIRTUAL_ENV")
-    env_name = image_name or current_venv
+    env_name = distro_name or current_venv
     if not env_name:
         cprint(
             "No current virtual environment detected, please specify a virtual environment name with --image-name",
@@ -36,10 +36,24 @@ def is_unwrapped_body_param(param_type: Any) -> bool:
     base_type = args[0]
     metadata = args[1:]
 
-    # Look for Body annotation with embed=False
+    # Look for Body annotation; treat embed=None (default) as unwrapped
     # Body() returns a FieldInfo object, so we check for that type and the embed attribute
     for item in metadata:
-        if isinstance(item, FieldInfo) and hasattr(item, "embed") and not item.embed:
+        if isinstance(item, FieldInfo) and hasattr(item, "embed") and item.embed is not True:
             return inspect.isclass(base_type) and issubclass(base_type, BaseModel)
 
     return False
+
+
+def is_body_param(param_type: Any) -> bool:
+    """
+    Check if a parameter type represents a body parameter (Annotated with Body()).
+    """
+    if get_origin(param_type) is typing.Annotated:
+        args = get_args(param_type)
+        base_type = args[0]
+        metadata = args[1:]
+        for item in metadata:
+            if isinstance(item, FieldInfo):
+                return inspect.isclass(base_type) and issubclass(base_type, BaseModel)
+    return False
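As context for the type_inspection change above, here is a minimal sketch (not part of the package) of how the new `is_body_param` helper behaves. It assumes `FieldInfo` comes from Pydantic and that FastAPI's `Body()` produces a `FieldInfo` subclass, as the in-code comments suggest; `CreateItemRequest` is a hypothetical model used only for illustration.

```python
# Hedged sketch: exercising the helper added in the diff above.
# Assumes pydantic's FieldInfo and FastAPI's Body() (a FieldInfo subclass);
# CreateItemRequest is a hypothetical request model.
import inspect
import typing
from typing import Annotated, Any, get_args, get_origin

from fastapi import Body
from pydantic import BaseModel
from pydantic.fields import FieldInfo


class CreateItemRequest(BaseModel):
    name: str


def is_body_param(param_type: Any) -> bool:
    # Same logic as the new helper: any Annotated[<BaseModel subclass>, Body(...)]
    if get_origin(param_type) is typing.Annotated:
        args = get_args(param_type)
        base_type, metadata = args[0], args[1:]
        for item in metadata:
            if isinstance(item, FieldInfo):
                return inspect.isclass(base_type) and issubclass(base_type, BaseModel)
    return False


print(is_body_param(Annotated[CreateItemRequest, Body()]))            # True
print(is_body_param(Annotated[CreateItemRequest, Body(embed=True)]))  # True
print(is_body_param(str))                                             # False: not Annotated
```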
@@ -1,5 +1,5 @@
 version: 2
-image_name: dell
+distro_name: dell
 apis:
 - agents
 - datasetio
@@ -108,6 +108,9 @@ storage:
     prompts:
       namespace: prompts
       backend: kv_default
+    connectors:
+      namespace: connectors
+      backend: kv_default
 registered_resources:
   models:
   - metadata: {}
@@ -0,0 +1,209 @@
+---
+orphan: true
+---
+
+# Dell Distribution of Llama Stack
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+
+self
+```
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+{{ providers_table }}
+
+You can use this distribution if you have GPUs and want to run an independent TGI or Dell Enterprise Hub container for running inference.
+
+{% if run_config_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in run_config_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+
+## Setting up the inference server using Dell Enterprise Hub's custom TGI container
+
+NOTE: This is a placeholder to run inference with TGI. This will be updated to use [Dell Enterprise Hub's containers](https://dell.huggingface.co/authenticated/models) once verified.
+
+```bash
+export INFERENCE_PORT=8181
+export DEH_URL=http://0.0.0.0:$INFERENCE_PORT
+export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
+export CHROMADB_HOST=localhost
+export CHROMADB_PORT=6601
+export CHROMA_URL=http://$CHROMADB_HOST:$CHROMADB_PORT
+export CUDA_VISIBLE_DEVICES=0
+export LLAMA_STACK_PORT=8321
+
+docker run --rm -it \
+  --pull always \
+  --network host \
+  -v $HOME/.cache/huggingface:/data \
+  -e HF_TOKEN=$HF_TOKEN \
+  -p $INFERENCE_PORT:$INFERENCE_PORT \
+  --gpus $CUDA_VISIBLE_DEVICES \
+  ghcr.io/huggingface/text-generation-inference \
+  --dtype bfloat16 \
+  --usage-stats off \
+  --sharded false \
+  --cuda-memory-fraction 0.7 \
+  --model-id $INFERENCE_MODEL \
+  --port $INFERENCE_PORT --hostname 0.0.0.0
+```
+
+If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
+
+```bash
+export SAFETY_INFERENCE_PORT=8282
+export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+export CUDA_VISIBLE_DEVICES=1
+
+docker run --rm -it \
+  --pull always \
+  --network host \
+  -v $HOME/.cache/huggingface:/data \
+  -e HF_TOKEN=$HF_TOKEN \
+  -p $SAFETY_INFERENCE_PORT:$SAFETY_INFERENCE_PORT \
+  --gpus $CUDA_VISIBLE_DEVICES \
+  ghcr.io/huggingface/text-generation-inference \
+  --dtype bfloat16 \
+  --usage-stats off \
+  --sharded false \
+  --cuda-memory-fraction 0.7 \
+  --model-id $SAFETY_MODEL \
+  --hostname 0.0.0.0 \
+  --port $SAFETY_INFERENCE_PORT
+```
+
+## Dell distribution relies on ChromaDB for vector database usage
+
+You can start a chroma-db easily using docker.
+```bash
+# This is where the indices are persisted
+mkdir -p $HOME/chromadb
+
+podman run --rm -it \
+  --network host \
+  --name chromadb \
+  -v $HOME/chromadb:/chroma/chroma \
+  -e IS_PERSISTENT=TRUE \
+  chromadb/chroma:latest \
+  --port $CHROMADB_PORT \
+  --host $CHROMADB_HOST
+```
+
+## Running Llama Stack
+
+Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+docker run -it \
+  --pull always \
+  --network host \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v $HOME/.llama:/root/.llama \
+  # NOTE: mount the llama-stack directory if testing local changes else not needed
+  -v $HOME/git/llama-stack:/app/llama-stack-source \
+  # localhost/distribution-dell:dev if building / testing locally
+  -e INFERENCE_MODEL=$INFERENCE_MODEL \
+  -e DEH_URL=$DEH_URL \
+  -e CHROMA_URL=$CHROMA_URL \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT
+
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+# You need a local checkout of llama-stack to run this, get it using
+# git clone https://github.com/meta-llama/llama-stack.git
+cd /path/to/llama-stack
+
+export SAFETY_INFERENCE_PORT=8282
+export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+
+docker run \
+  -it \
+  --pull always \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v $HOME/.llama:/root/.llama \
+  -v ./llama_stack/distributions/tgi/run-with-safety.yaml:/root/my-config.yaml \
+  -e INFERENCE_MODEL=$INFERENCE_MODEL \
+  -e DEH_URL=$DEH_URL \
+  -e SAFETY_MODEL=$SAFETY_MODEL \
+  -e DEH_SAFETY_URL=$DEH_SAFETY_URL \
+  -e CHROMA_URL=$CHROMA_URL \
+  llamastack/distribution-{{ name }} \
+  --config /root/my-config.yaml \
+  --port $LLAMA_STACK_PORT
+```
+
+### Via Docker with Custom Run Configuration
+
+You can also run the Docker container with a custom run configuration file by mounting it into the container:
+
+```bash
+# Set the path to your custom config.yaml file
+CUSTOM_RUN_CONFIG=/path/to/your/custom-config.yaml
+
+docker run -it \
+  --pull always \
+  --network host \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v $HOME/.llama:/root/.llama \
+  -v $CUSTOM_RUN_CONFIG:/app/custom-config.yaml \
+  -e RUN_CONFIG_PATH=/app/custom-config.yaml \
+  -e INFERENCE_MODEL=$INFERENCE_MODEL \
+  -e DEH_URL=$DEH_URL \
+  -e CHROMA_URL=$CHROMA_URL \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+**Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
+
+{% if run_configs %}
+Available run configurations for this distribution:
+{% for config in run_configs %}
+- `{{ config }}`
+{% endfor %}
+{% endif %}
+
+### Via Conda
+
+Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
+
+```bash
+llama stack list-deps {{ name }} | xargs -L1 pip install
+INFERENCE_MODEL=$INFERENCE_MODEL \
+DEH_URL=$DEH_URL \
+CHROMA_URL=$CHROMA_URL \
+llama stack run {{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+INFERENCE_MODEL=$INFERENCE_MODEL \
+DEH_URL=$DEH_URL \
+SAFETY_MODEL=$SAFETY_MODEL \
+DEH_SAFETY_URL=$DEH_SAFETY_URL \
+CHROMA_URL=$CHROMA_URL \
+llama stack run ./run-with-safety.yaml \
+  --port $LLAMA_STACK_PORT
+```
@@ -1,5 +1,5 @@
 version: 2
-image_name: dell
+distro_name: dell
 apis:
 - agents
 - datasetio
@@ -112,6 +112,9 @@ storage:
     prompts:
       namespace: prompts
       backend: kv_default
+    connectors:
+      namespace: connectors
+      backend: kv_default
 registered_resources:
   models:
   - metadata: {}
@@ -1,5 +1,5 @@
 version: 2
-image_name: nvidia
+distro_name: nvidia
 apis:
 - agents
 - datasetio
@@ -102,6 +102,9 @@ storage:
     prompts:
       namespace: prompts
       backend: kv_default
+    connectors:
+      namespace: connectors
+      backend: kv_default
 registered_resources:
   models: []
   shields: []
@@ -0,0 +1,170 @@
+---
+orphan: true
+---
+# NVIDIA Distribution
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+{{ providers_table }}
+
+{% if run_config_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in run_config_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+{% if default_models %}
+### Models
+
+The following models are available by default:
+
+{% for model in default_models %}
+- `{{ model.model_id }} {{ model.doc_string }}`
+{% endfor %}
+{% endif %}
+
+
+## Prerequisites
+### NVIDIA API Keys
+
+Make sure you have access to an NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.
+
+### Deploy NeMo Microservices Platform
+The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/latest/about/index.html) for platform prerequisites and instructions to install and deploy the platform.
+
+## Supported Services
+Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
+
+### Inference: NVIDIA NIM
+NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
+1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (Requires an API key)
+2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.
+
+The deployed platform includes the NIM Proxy microservice, which is the service that provides access to your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.
+
+### Datasetio API: NeMo Data Store
+The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposes APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use the client to interact with Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
+
+See the [NVIDIA Datasetio docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/datasetio/nvidia/README.md) for supported features and example usage.
+
+### Eval API: NeMo Evaluator
+The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.
+
+See the [NVIDIA Eval docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/eval/nvidia/README.md) for supported features and example usage.
+
+### Post-Training API: NeMo Customizer
+The NeMo Customizer microservice supports fine-tuning models. You can reference [this list of supported models](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/post_training/nvidia/models.py) that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
+
+See the [NVIDIA Post-Training docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/post_training/nvidia/README.md) for supported features and example usage.
+
+### Safety API: NeMo Guardrails
+The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.
+
+See the [NVIDIA Safety docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/safety/nvidia/README.md) for supported features and example usage.
+
+## Deploying models
+In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.
+
+Note: For improved inference speeds, we need to use NIM with the `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
+```sh
+# URL to NeMo NIM Proxy service
+export NEMO_URL="http://nemo.test"
+
+curl --location "$NEMO_URL/v1/deployment/model-deployments" \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "name": "llama-3.2-1b-instruct",
+    "namespace": "meta",
+    "config": {
+      "model": "meta/llama-3.2-1b-instruct",
+      "nim_deployment": {
+        "image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
+        "image_tag": "1.8.3",
+        "pvc_size": "25Gi",
+        "gpu": 1,
+        "additional_envs": {
+          "NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
+        }
+      }
+    }
+  }'
+```
+This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/latest/get-started/tutorials/deploy-nims.html) for more information on how to deploy a NIM and verify it's available for inference.
+
+You can also remove a deployed NIM to free up GPU resources, if needed.
+```sh
+export NEMO_URL="http://nemo.test"
+
+curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
+```
+
+## Running Llama Stack with NVIDIA
+
+You can do this via venv (build code), or Docker which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=8321
+docker run \
+  -it \
+  --pull always \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  -e NVIDIA_API_KEY=$NVIDIA_API_KEY \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+### Via Docker with Custom Run Configuration
+
+You can also run the Docker container with a custom run configuration file by mounting it into the container:
+
+```bash
+# Set the path to your custom config.yaml file
+CUSTOM_RUN_CONFIG=/path/to/your/custom-config.yaml
+LLAMA_STACK_PORT=8321
+
+docker run \
+  -it \
+  --pull always \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  -v $CUSTOM_RUN_CONFIG:/app/custom-config.yaml \
+  -e RUN_CONFIG_PATH=/app/custom-config.yaml \
+  -e NVIDIA_API_KEY=$NVIDIA_API_KEY \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+**Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
+
+{% if run_configs %}
+Available run configurations for this distribution:
+{% for config in run_configs %}
+- `{{ config }}`
+{% endfor %}
+{% endif %}
+
+### Via venv
+
+If you've set up your local development environment, you can also install the distribution dependencies using your local virtual environment.
+
+```bash
+INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
+llama stack list-deps nvidia | xargs -L1 uv pip install
+NVIDIA_API_KEY=$NVIDIA_API_KEY \
+INFERENCE_MODEL=$INFERENCE_MODEL \
+llama stack run ./config.yaml \
+  --port 8321
+```
+
+## Example Notebooks
+For examples of how to use the NVIDIA Distribution to run inference, fine-tune, evaluate, and run safety checks on your LLMs, you can reference the example notebooks in [docs/notebooks/nvidia](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks/nvidia).
@@ -1,5 +1,5 @@
 version: 2
-image_name: nvidia
+distro_name: nvidia
 apis:
 - agents
 - datasetio
@@ -113,6 +113,9 @@ storage:
     prompts:
       namespace: prompts
       backend: kv_default
+    connectors:
+      namespace: connectors
+      backend: kv_default
 registered_resources:
   models:
   - metadata: {}
@@ -1,5 +1,5 @@
 version: 2
-image_name: oci
+distro_name: oci
 apis:
 - agents
 - datasetio
@@ -120,6 +120,9 @@ storage:
     prompts:
       namespace: prompts
       backend: kv_default
+    connectors:
+      namespace: connectors
+      backend: kv_default
 registered_resources:
   models: []
   shields: []
@@ -0,0 +1,140 @@
+---
+orphan: true
+---
+# OCI Distribution
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+{{ providers_table }}
+
+{% if run_config_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in run_config_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+{% if default_models %}
+### Models
+
+The following models are available by default:
+
+{% for model in default_models %}
+- `{{ model.model_id }} {{ model.doc_string }}`
+{% endfor %}
+{% endif %}
+
+## Prerequisites
+### Oracle Cloud Infrastructure Setup
+
+Before using the OCI Generative AI distribution, ensure you have:
+
+1. **Oracle Cloud Infrastructure Account**: Sign up at [Oracle Cloud Infrastructure](https://cloud.oracle.com/)
+2. **Generative AI Service Access**: Enable the Generative AI service in your OCI tenancy
+3. **Compartment**: Create or identify a compartment where you'll deploy Generative AI models
+4. **Authentication**: Configure authentication using either:
+   - **Instance Principal** (recommended for cloud-hosted deployments)
+   - **API Key** (for on-premises or development environments)
+
+### Authentication Methods
+
+#### Instance Principal Authentication (Recommended)
+Instance Principal authentication allows OCI resources to authenticate using the identity of the compute instance they're running on. This is the most secure method for production deployments.
+
+Requirements:
+- Instance must be running in an Oracle Cloud Infrastructure compartment
+- Instance must have appropriate IAM policies to access Generative AI services
+
+#### API Key Authentication
+For development or on-premises deployments, follow [this doc](https://docs.oracle.com/en-us/iaas/Content/API/Concepts/apisigningkey.htm) to learn how to create your API signing key for your config file.
+
+### Required IAM Policies
+
+Ensure your OCI user or instance has the following policy statements:
+
+```
+Allow group <group_name> to use generative-ai-inference-endpoints in compartment <compartment_name>
+Allow group <group_name> to manage generative-ai-inference-endpoints in compartment <compartment_name>
+```
+
+## Supported Services
+
+### Inference: OCI Generative AI
+Oracle Cloud Infrastructure Generative AI provides access to high-performance AI models through OCI's Platform-as-a-Service offering. The service supports:
+
+- **Chat Completions**: Conversational AI with context awareness
+- **Text Generation**: Complete prompts and generate text content
+
+#### Available Models
+Common OCI Generative AI offerings include models from Meta, Cohere, OpenAI, Grok, and more.
+
+### Safety: Llama Guard
+For content safety and moderation, this distribution uses Meta's LlamaGuard model through the OCI Generative AI service to provide:
+- Content filtering and moderation
+- Policy compliance checking
+- Harmful content detection
+
+### Vector Storage: Multiple Options
+The distribution supports several vector storage providers:
+- **FAISS**: Local in-memory vector search
+- **ChromaDB**: Distributed vector database
+- **PGVector**: PostgreSQL with vector extensions
+
+### Additional Services
+- **Dataset I/O**: Local filesystem and Hugging Face integration
+- **Tool Runtime**: Web search (Brave, Tavily) and RAG capabilities
+- **Evaluation**: Meta reference evaluation framework
+
+## Running Llama Stack with OCI
+
+You can run the OCI distribution via Docker or a local virtual environment.
+
+### Via venv
+
+If you've set up your local development environment, you can also run the distribution using your local virtual environment.
+
+```bash
+OCI_AUTH=$OCI_AUTH_TYPE OCI_REGION=$OCI_REGION OCI_COMPARTMENT_OCID=$OCI_COMPARTMENT_OCID llama stack run --port 8321 oci
+```
+
+### Configuration Examples
+
+#### Using Instance Principal (Recommended for Production)
+```bash
+export OCI_AUTH_TYPE=instance_principal
+export OCI_REGION=us-chicago-1
+export OCI_COMPARTMENT_OCID=ocid1.compartment.oc1..<your-compartment-id>
+```
+
+#### Using API Key Authentication (Development)
+```bash
+export OCI_AUTH_TYPE=config_file
+export OCI_CONFIG_FILE_PATH=~/.oci/config
+export OCI_CLI_PROFILE=DEFAULT
+export OCI_REGION=us-chicago-1
+export OCI_COMPARTMENT_OCID=ocid1.compartment.oc1..your-compartment-id
+```
+
+## Regional Endpoints
+
+OCI Generative AI is available in multiple regions. The service automatically routes to the appropriate regional endpoint based on your configuration. For a full list of regional model availability, visit:
+
+https://docs.oracle.com/en-us/iaas/Content/generative-ai/overview.htm#regions
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Authentication Errors**: Verify your OCI credentials and IAM policies
+2. **Model Not Found**: Ensure the model OCID is correct and the model is available in your region
+3. **Permission Denied**: Check compartment permissions and Generative AI service access
+4. **Region Unavailable**: Verify the specified region supports Generative AI services
+
+### Getting Help
+
+For additional support:
+- [OCI Generative AI Documentation](https://docs.oracle.com/en-us/iaas/Content/generative-ai/home.htm)
+- [Llama Stack Issues](https://github.com/meta-llama/llama-stack/issues)
@@ -1,5 +1,5 @@
 version: 2
-image_name: open-benchmark
+distro_name: open-benchmark
 apis:
 - agents
 - datasetio
@@ -57,6 +57,11 @@ providers:
       db: ${env.PGVECTOR_DB:=}
       user: ${env.PGVECTOR_USER:=}
      password: ${env.PGVECTOR_PASSWORD:=}
+      distance_metric: COSINE
+      vector_index:
+        type: HNSW
+        m: 16
+        ef_construction: 64
       persistence:
         namespace: vector_io::pgvector
         backend: kv_default
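As context for the pgvector settings added above, the following is a hedged illustration, not the provider's actual code, of the SQL that a `COSINE` distance metric with an HNSW index (`m=16`, `ef_construction=64`) corresponds to in pgvector; the table and column names are hypothetical.

```python
# Hedged illustration: the pgvector index DDL implied by an HNSW/COSINE
# configuration like the one above. Table/column names are hypothetical.
config = {
    "distance_metric": "COSINE",
    "vector_index": {"type": "HNSW", "m": 16, "ef_construction": 64},
}

# pgvector operator classes per distance metric (documented pgvector names)
OPERATOR_CLASS = {
    "COSINE": "vector_cosine_ops",
    "L2": "vector_l2_ops",
    "INNER_PRODUCT": "vector_ip_ops",
}

index = config["vector_index"]
sql = (
    "CREATE INDEX ON my_chunks USING hnsw "
    f"(embedding {OPERATOR_CLASS[config['distance_metric']]}) "
    f"WITH (m = {index['m']}, ef_construction = {index['ef_construction']})"
)
print(sql)
# CREATE INDEX ON my_chunks USING hnsw (embedding vector_cosine_ops) WITH (m = 16, ef_construction = 64)
```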
@@ -145,6 +150,9 @@ storage:
     prompts:
       namespace: prompts
       backend: kv_default
+    connectors:
+      namespace: connectors
+      backend: kv_default
 registered_resources:
   models:
   - metadata: {}