llama-stack 0.4.3__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (307)
  1. llama_stack/cli/stack/_list_deps.py +11 -7
  2. llama_stack/cli/stack/run.py +3 -25
  3. llama_stack/core/access_control/datatypes.py +78 -0
  4. llama_stack/core/configure.py +2 -2
  5. {llama_stack_api/internal → llama_stack/core/connectors}/__init__.py +2 -2
  6. llama_stack/core/connectors/connectors.py +162 -0
  7. llama_stack/core/conversations/conversations.py +61 -58
  8. llama_stack/core/datatypes.py +54 -8
  9. llama_stack/core/library_client.py +60 -13
  10. llama_stack/core/prompts/prompts.py +43 -42
  11. llama_stack/core/routers/datasets.py +20 -17
  12. llama_stack/core/routers/eval_scoring.py +143 -53
  13. llama_stack/core/routers/inference.py +20 -9
  14. llama_stack/core/routers/safety.py +30 -42
  15. llama_stack/core/routers/vector_io.py +15 -7
  16. llama_stack/core/routing_tables/models.py +42 -3
  17. llama_stack/core/routing_tables/scoring_functions.py +19 -19
  18. llama_stack/core/routing_tables/shields.py +20 -17
  19. llama_stack/core/routing_tables/vector_stores.py +8 -5
  20. llama_stack/core/server/auth.py +192 -17
  21. llama_stack/core/server/fastapi_router_registry.py +40 -5
  22. llama_stack/core/server/server.py +24 -5
  23. llama_stack/core/stack.py +54 -10
  24. llama_stack/core/storage/datatypes.py +9 -0
  25. llama_stack/core/store/registry.py +1 -1
  26. llama_stack/core/utils/exec.py +2 -2
  27. llama_stack/core/utils/type_inspection.py +16 -2
  28. llama_stack/distributions/dell/config.yaml +4 -1
  29. llama_stack/distributions/dell/doc_template.md +209 -0
  30. llama_stack/distributions/dell/run-with-safety.yaml +4 -1
  31. llama_stack/distributions/nvidia/config.yaml +4 -1
  32. llama_stack/distributions/nvidia/doc_template.md +170 -0
  33. llama_stack/distributions/nvidia/run-with-safety.yaml +4 -1
  34. llama_stack/distributions/oci/config.yaml +4 -1
  35. llama_stack/distributions/oci/doc_template.md +140 -0
  36. llama_stack/distributions/open-benchmark/config.yaml +9 -1
  37. llama_stack/distributions/postgres-demo/config.yaml +1 -1
  38. llama_stack/distributions/starter/build.yaml +62 -0
  39. llama_stack/distributions/starter/config.yaml +22 -3
  40. llama_stack/distributions/starter/run-with-postgres-store.yaml +22 -3
  41. llama_stack/distributions/starter/starter.py +13 -1
  42. llama_stack/distributions/starter-gpu/build.yaml +62 -0
  43. llama_stack/distributions/starter-gpu/config.yaml +22 -3
  44. llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +22 -3
  45. llama_stack/distributions/template.py +10 -2
  46. llama_stack/distributions/watsonx/config.yaml +4 -1
  47. llama_stack/log.py +1 -0
  48. llama_stack/models/llama/resources/dog.jpg +0 -0
  49. llama_stack/models/llama/resources/pasta.jpeg +0 -0
  50. llama_stack/models/llama/resources/small_dog.jpg +0 -0
  51. llama_stack/providers/inline/agents/meta_reference/__init__.py +1 -0
  52. llama_stack/providers/inline/agents/meta_reference/agents.py +57 -61
  53. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +183 -60
  54. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +94 -22
  55. llama_stack/providers/inline/agents/meta_reference/responses/types.py +2 -1
  56. llama_stack/providers/inline/agents/meta_reference/responses/utils.py +4 -1
  57. llama_stack/providers/inline/agents/meta_reference/safety.py +2 -2
  58. llama_stack/providers/inline/batches/reference/batches.py +2 -1
  59. llama_stack/providers/inline/eval/meta_reference/eval.py +40 -32
  60. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.h +9 -0
  61. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.swift +189 -0
  62. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/Parsing.swift +238 -0
  63. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/PromptTemplate.swift +12 -0
  64. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/SystemPrompts.swift +89 -0
  65. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.pbxproj +550 -0
  66. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
  67. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
  68. llama_stack/providers/inline/post_training/huggingface/post_training.py +33 -38
  69. llama_stack/providers/inline/post_training/huggingface/utils.py +2 -5
  70. llama_stack/providers/inline/post_training/torchtune/post_training.py +28 -33
  71. llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +2 -4
  72. llama_stack/providers/inline/safety/code_scanner/code_scanner.py +12 -15
  73. llama_stack/providers/inline/safety/llama_guard/llama_guard.py +15 -18
  74. llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +11 -17
  75. llama_stack/providers/inline/scoring/basic/scoring.py +13 -17
  76. llama_stack/providers/inline/scoring/braintrust/braintrust.py +15 -15
  77. llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +13 -17
  78. llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +1 -1
  79. llama_stack/providers/registry/agents.py +1 -0
  80. llama_stack/providers/registry/inference.py +1 -9
  81. llama_stack/providers/registry/vector_io.py +136 -16
  82. llama_stack/providers/remote/datasetio/nvidia/README.md +74 -0
  83. llama_stack/providers/remote/eval/nvidia/README.md +134 -0
  84. llama_stack/providers/remote/eval/nvidia/eval.py +22 -21
  85. llama_stack/providers/remote/files/s3/README.md +266 -0
  86. llama_stack/providers/remote/files/s3/config.py +5 -3
  87. llama_stack/providers/remote/files/s3/files.py +2 -2
  88. llama_stack/providers/remote/inference/gemini/gemini.py +4 -0
  89. llama_stack/providers/remote/inference/nvidia/NVIDIA.md +203 -0
  90. llama_stack/providers/remote/inference/openai/openai.py +2 -0
  91. llama_stack/providers/remote/inference/together/together.py +4 -0
  92. llama_stack/providers/remote/inference/vertexai/config.py +3 -3
  93. llama_stack/providers/remote/inference/vertexai/vertexai.py +5 -2
  94. llama_stack/providers/remote/inference/vllm/config.py +37 -18
  95. llama_stack/providers/remote/inference/vllm/vllm.py +0 -3
  96. llama_stack/providers/remote/inference/watsonx/watsonx.py +4 -0
  97. llama_stack/providers/remote/post_training/nvidia/README.md +151 -0
  98. llama_stack/providers/remote/post_training/nvidia/post_training.py +31 -33
  99. llama_stack/providers/remote/safety/bedrock/bedrock.py +10 -27
  100. llama_stack/providers/remote/safety/nvidia/README.md +78 -0
  101. llama_stack/providers/remote/safety/nvidia/nvidia.py +9 -25
  102. llama_stack/providers/remote/safety/sambanova/sambanova.py +13 -11
  103. llama_stack/providers/remote/vector_io/elasticsearch/__init__.py +17 -0
  104. llama_stack/providers/remote/vector_io/elasticsearch/config.py +32 -0
  105. llama_stack/providers/remote/vector_io/elasticsearch/elasticsearch.py +463 -0
  106. llama_stack/providers/remote/vector_io/oci/__init__.py +22 -0
  107. llama_stack/providers/remote/vector_io/oci/config.py +41 -0
  108. llama_stack/providers/remote/vector_io/oci/oci26ai.py +595 -0
  109. llama_stack/providers/remote/vector_io/pgvector/config.py +69 -2
  110. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +255 -6
  111. llama_stack/providers/remote/vector_io/qdrant/qdrant.py +62 -38
  112. llama_stack/providers/utils/bedrock/client.py +3 -3
  113. llama_stack/providers/utils/bedrock/config.py +7 -7
  114. llama_stack/providers/utils/inference/embedding_mixin.py +4 -0
  115. llama_stack/providers/utils/inference/http_client.py +239 -0
  116. llama_stack/providers/utils/inference/litellm_openai_mixin.py +5 -0
  117. llama_stack/providers/utils/inference/model_registry.py +148 -2
  118. llama_stack/providers/utils/inference/openai_compat.py +2 -1
  119. llama_stack/providers/utils/inference/openai_mixin.py +41 -2
  120. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +92 -5
  121. llama_stack/providers/utils/memory/vector_store.py +46 -19
  122. llama_stack/providers/utils/responses/responses_store.py +40 -6
  123. llama_stack/providers/utils/safety.py +114 -0
  124. llama_stack/providers/utils/tools/mcp.py +44 -3
  125. llama_stack/testing/api_recorder.py +9 -3
  126. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/METADATA +14 -2
  127. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/RECORD +131 -275
  128. llama_stack-0.5.0rc1.dist-info/top_level.txt +1 -0
  129. llama_stack/distributions/meta-reference-gpu/__init__.py +0 -7
  130. llama_stack/distributions/meta-reference-gpu/config.yaml +0 -140
  131. llama_stack/distributions/meta-reference-gpu/meta_reference.py +0 -163
  132. llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +0 -155
  133. llama_stack/models/llama/hadamard_utils.py +0 -88
  134. llama_stack/models/llama/llama3/args.py +0 -74
  135. llama_stack/models/llama/llama3/generation.py +0 -378
  136. llama_stack/models/llama/llama3/model.py +0 -304
  137. llama_stack/models/llama/llama3/multimodal/__init__.py +0 -12
  138. llama_stack/models/llama/llama3/multimodal/encoder_utils.py +0 -180
  139. llama_stack/models/llama/llama3/multimodal/image_transform.py +0 -409
  140. llama_stack/models/llama/llama3/multimodal/model.py +0 -1430
  141. llama_stack/models/llama/llama3/multimodal/utils.py +0 -26
  142. llama_stack/models/llama/llama3/quantization/__init__.py +0 -5
  143. llama_stack/models/llama/llama3/quantization/loader.py +0 -316
  144. llama_stack/models/llama/llama3_1/__init__.py +0 -12
  145. llama_stack/models/llama/llama3_1/prompt_format.md +0 -358
  146. llama_stack/models/llama/llama3_1/prompts.py +0 -258
  147. llama_stack/models/llama/llama3_2/__init__.py +0 -5
  148. llama_stack/models/llama/llama3_2/prompts_text.py +0 -229
  149. llama_stack/models/llama/llama3_2/prompts_vision.py +0 -126
  150. llama_stack/models/llama/llama3_2/text_prompt_format.md +0 -286
  151. llama_stack/models/llama/llama3_2/vision_prompt_format.md +0 -141
  152. llama_stack/models/llama/llama3_3/__init__.py +0 -5
  153. llama_stack/models/llama/llama3_3/prompts.py +0 -259
  154. llama_stack/models/llama/llama4/args.py +0 -107
  155. llama_stack/models/llama/llama4/ffn.py +0 -58
  156. llama_stack/models/llama/llama4/moe.py +0 -214
  157. llama_stack/models/llama/llama4/preprocess.py +0 -435
  158. llama_stack/models/llama/llama4/quantization/__init__.py +0 -5
  159. llama_stack/models/llama/llama4/quantization/loader.py +0 -226
  160. llama_stack/models/llama/llama4/vision/__init__.py +0 -5
  161. llama_stack/models/llama/llama4/vision/embedding.py +0 -210
  162. llama_stack/models/llama/llama4/vision/encoder.py +0 -412
  163. llama_stack/models/llama/quantize_impls.py +0 -316
  164. llama_stack/providers/inline/inference/meta_reference/__init__.py +0 -20
  165. llama_stack/providers/inline/inference/meta_reference/common.py +0 -24
  166. llama_stack/providers/inline/inference/meta_reference/config.py +0 -68
  167. llama_stack/providers/inline/inference/meta_reference/generators.py +0 -201
  168. llama_stack/providers/inline/inference/meta_reference/inference.py +0 -542
  169. llama_stack/providers/inline/inference/meta_reference/model_parallel.py +0 -77
  170. llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +0 -353
  171. llama_stack-0.4.3.dist-info/top_level.txt +0 -2
  172. llama_stack_api/__init__.py +0 -945
  173. llama_stack_api/admin/__init__.py +0 -45
  174. llama_stack_api/admin/api.py +0 -72
  175. llama_stack_api/admin/fastapi_routes.py +0 -117
  176. llama_stack_api/admin/models.py +0 -113
  177. llama_stack_api/agents.py +0 -173
  178. llama_stack_api/batches/__init__.py +0 -40
  179. llama_stack_api/batches/api.py +0 -53
  180. llama_stack_api/batches/fastapi_routes.py +0 -113
  181. llama_stack_api/batches/models.py +0 -78
  182. llama_stack_api/benchmarks/__init__.py +0 -43
  183. llama_stack_api/benchmarks/api.py +0 -39
  184. llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  185. llama_stack_api/benchmarks/models.py +0 -109
  186. llama_stack_api/common/__init__.py +0 -5
  187. llama_stack_api/common/content_types.py +0 -101
  188. llama_stack_api/common/errors.py +0 -95
  189. llama_stack_api/common/job_types.py +0 -38
  190. llama_stack_api/common/responses.py +0 -77
  191. llama_stack_api/common/training_types.py +0 -47
  192. llama_stack_api/common/type_system.py +0 -146
  193. llama_stack_api/connectors.py +0 -146
  194. llama_stack_api/conversations.py +0 -270
  195. llama_stack_api/datasetio.py +0 -55
  196. llama_stack_api/datasets/__init__.py +0 -61
  197. llama_stack_api/datasets/api.py +0 -35
  198. llama_stack_api/datasets/fastapi_routes.py +0 -104
  199. llama_stack_api/datasets/models.py +0 -152
  200. llama_stack_api/datatypes.py +0 -373
  201. llama_stack_api/eval.py +0 -137
  202. llama_stack_api/file_processors/__init__.py +0 -27
  203. llama_stack_api/file_processors/api.py +0 -64
  204. llama_stack_api/file_processors/fastapi_routes.py +0 -78
  205. llama_stack_api/file_processors/models.py +0 -42
  206. llama_stack_api/files/__init__.py +0 -35
  207. llama_stack_api/files/api.py +0 -51
  208. llama_stack_api/files/fastapi_routes.py +0 -124
  209. llama_stack_api/files/models.py +0 -107
  210. llama_stack_api/inference.py +0 -1169
  211. llama_stack_api/inspect_api/__init__.py +0 -37
  212. llama_stack_api/inspect_api/api.py +0 -25
  213. llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  214. llama_stack_api/inspect_api/models.py +0 -28
  215. llama_stack_api/internal/kvstore.py +0 -28
  216. llama_stack_api/internal/sqlstore.py +0 -81
  217. llama_stack_api/llama_stack_api/__init__.py +0 -945
  218. llama_stack_api/llama_stack_api/admin/__init__.py +0 -45
  219. llama_stack_api/llama_stack_api/admin/api.py +0 -72
  220. llama_stack_api/llama_stack_api/admin/fastapi_routes.py +0 -117
  221. llama_stack_api/llama_stack_api/admin/models.py +0 -113
  222. llama_stack_api/llama_stack_api/agents.py +0 -173
  223. llama_stack_api/llama_stack_api/batches/__init__.py +0 -40
  224. llama_stack_api/llama_stack_api/batches/api.py +0 -53
  225. llama_stack_api/llama_stack_api/batches/fastapi_routes.py +0 -113
  226. llama_stack_api/llama_stack_api/batches/models.py +0 -78
  227. llama_stack_api/llama_stack_api/benchmarks/__init__.py +0 -43
  228. llama_stack_api/llama_stack_api/benchmarks/api.py +0 -39
  229. llama_stack_api/llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  230. llama_stack_api/llama_stack_api/benchmarks/models.py +0 -109
  231. llama_stack_api/llama_stack_api/common/__init__.py +0 -5
  232. llama_stack_api/llama_stack_api/common/content_types.py +0 -101
  233. llama_stack_api/llama_stack_api/common/errors.py +0 -95
  234. llama_stack_api/llama_stack_api/common/job_types.py +0 -38
  235. llama_stack_api/llama_stack_api/common/responses.py +0 -77
  236. llama_stack_api/llama_stack_api/common/training_types.py +0 -47
  237. llama_stack_api/llama_stack_api/common/type_system.py +0 -146
  238. llama_stack_api/llama_stack_api/connectors.py +0 -146
  239. llama_stack_api/llama_stack_api/conversations.py +0 -270
  240. llama_stack_api/llama_stack_api/datasetio.py +0 -55
  241. llama_stack_api/llama_stack_api/datasets/__init__.py +0 -61
  242. llama_stack_api/llama_stack_api/datasets/api.py +0 -35
  243. llama_stack_api/llama_stack_api/datasets/fastapi_routes.py +0 -104
  244. llama_stack_api/llama_stack_api/datasets/models.py +0 -152
  245. llama_stack_api/llama_stack_api/datatypes.py +0 -373
  246. llama_stack_api/llama_stack_api/eval.py +0 -137
  247. llama_stack_api/llama_stack_api/file_processors/__init__.py +0 -27
  248. llama_stack_api/llama_stack_api/file_processors/api.py +0 -64
  249. llama_stack_api/llama_stack_api/file_processors/fastapi_routes.py +0 -78
  250. llama_stack_api/llama_stack_api/file_processors/models.py +0 -42
  251. llama_stack_api/llama_stack_api/files/__init__.py +0 -35
  252. llama_stack_api/llama_stack_api/files/api.py +0 -51
  253. llama_stack_api/llama_stack_api/files/fastapi_routes.py +0 -124
  254. llama_stack_api/llama_stack_api/files/models.py +0 -107
  255. llama_stack_api/llama_stack_api/inference.py +0 -1169
  256. llama_stack_api/llama_stack_api/inspect_api/__init__.py +0 -37
  257. llama_stack_api/llama_stack_api/inspect_api/api.py +0 -25
  258. llama_stack_api/llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  259. llama_stack_api/llama_stack_api/inspect_api/models.py +0 -28
  260. llama_stack_api/llama_stack_api/internal/__init__.py +0 -9
  261. llama_stack_api/llama_stack_api/internal/kvstore.py +0 -28
  262. llama_stack_api/llama_stack_api/internal/sqlstore.py +0 -81
  263. llama_stack_api/llama_stack_api/models.py +0 -171
  264. llama_stack_api/llama_stack_api/openai_responses.py +0 -1468
  265. llama_stack_api/llama_stack_api/post_training.py +0 -370
  266. llama_stack_api/llama_stack_api/prompts.py +0 -203
  267. llama_stack_api/llama_stack_api/providers/__init__.py +0 -33
  268. llama_stack_api/llama_stack_api/providers/api.py +0 -16
  269. llama_stack_api/llama_stack_api/providers/fastapi_routes.py +0 -57
  270. llama_stack_api/llama_stack_api/providers/models.py +0 -24
  271. llama_stack_api/llama_stack_api/py.typed +0 -0
  272. llama_stack_api/llama_stack_api/rag_tool.py +0 -168
  273. llama_stack_api/llama_stack_api/resource.py +0 -37
  274. llama_stack_api/llama_stack_api/router_utils.py +0 -160
  275. llama_stack_api/llama_stack_api/safety.py +0 -132
  276. llama_stack_api/llama_stack_api/schema_utils.py +0 -208
  277. llama_stack_api/llama_stack_api/scoring.py +0 -93
  278. llama_stack_api/llama_stack_api/scoring_functions.py +0 -211
  279. llama_stack_api/llama_stack_api/shields.py +0 -93
  280. llama_stack_api/llama_stack_api/tools.py +0 -226
  281. llama_stack_api/llama_stack_api/vector_io.py +0 -941
  282. llama_stack_api/llama_stack_api/vector_stores.py +0 -53
  283. llama_stack_api/llama_stack_api/version.py +0 -9
  284. llama_stack_api/models.py +0 -171
  285. llama_stack_api/openai_responses.py +0 -1468
  286. llama_stack_api/post_training.py +0 -370
  287. llama_stack_api/prompts.py +0 -203
  288. llama_stack_api/providers/__init__.py +0 -33
  289. llama_stack_api/providers/api.py +0 -16
  290. llama_stack_api/providers/fastapi_routes.py +0 -57
  291. llama_stack_api/providers/models.py +0 -24
  292. llama_stack_api/py.typed +0 -0
  293. llama_stack_api/rag_tool.py +0 -168
  294. llama_stack_api/resource.py +0 -37
  295. llama_stack_api/router_utils.py +0 -160
  296. llama_stack_api/safety.py +0 -132
  297. llama_stack_api/schema_utils.py +0 -208
  298. llama_stack_api/scoring.py +0 -93
  299. llama_stack_api/scoring_functions.py +0 -211
  300. llama_stack_api/shields.py +0 -93
  301. llama_stack_api/tools.py +0 -226
  302. llama_stack_api/vector_io.py +0 -941
  303. llama_stack_api/vector_stores.py +0 -53
  304. llama_stack_api/version.py +0 -9
  305. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/WHEEL +0 -0
  306. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/entry_points.txt +0 -0
  307. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
llama_stack-0.5.0rc1.dist-info/top_level.txt
@@ -0,0 +1 @@
+ llama_stack
llama_stack/distributions/meta-reference-gpu/__init__.py
@@ -1,7 +0,0 @@
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the terms described in the LICENSE file in
- # the root directory of this source tree.
-
- from .meta_reference import get_distribution_template  # noqa: F401
llama_stack/distributions/meta-reference-gpu/config.yaml
@@ -1,140 +0,0 @@
- version: 2
- image_name: meta-reference-gpu
- apis:
- - agents
- - datasetio
- - eval
- - inference
- - safety
- - scoring
- - tool_runtime
- - vector_io
- providers:
-   inference:
-   - provider_id: meta-reference-inference
-     provider_type: inline::meta-reference
-     config:
-       model: ${env.INFERENCE_MODEL}
-       checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:=null}
-       quantization:
-         type: ${env.QUANTIZATION_TYPE:=bf16}
-       model_parallel_size: ${env.MODEL_PARALLEL_SIZE:=0}
-       max_batch_size: ${env.MAX_BATCH_SIZE:=1}
-       max_seq_len: ${env.MAX_SEQ_LEN:=4096}
-   - provider_id: sentence-transformers
-     provider_type: inline::sentence-transformers
-   vector_io:
-   - provider_id: faiss
-     provider_type: inline::faiss
-     config:
-       persistence:
-         namespace: vector_io::faiss
-         backend: kv_default
-   safety:
-   - provider_id: llama-guard
-     provider_type: inline::llama-guard
-     config:
-       excluded_categories: []
-   agents:
-   - provider_id: meta-reference
-     provider_type: inline::meta-reference
-     config:
-       persistence:
-         agent_state:
-           namespace: agents
-           backend: kv_default
-         responses:
-           table_name: responses
-           backend: sql_default
-           max_write_queue_size: 10000
-           num_writers: 4
-   eval:
-   - provider_id: meta-reference
-     provider_type: inline::meta-reference
-     config:
-       kvstore:
-         namespace: eval
-         backend: kv_default
-   datasetio:
-   - provider_id: huggingface
-     provider_type: remote::huggingface
-     config:
-       kvstore:
-         namespace: datasetio::huggingface
-         backend: kv_default
-   - provider_id: localfs
-     provider_type: inline::localfs
-     config:
-       kvstore:
-         namespace: datasetio::localfs
-         backend: kv_default
-   scoring:
-   - provider_id: basic
-     provider_type: inline::basic
-   - provider_id: llm-as-judge
-     provider_type: inline::llm-as-judge
-   - provider_id: braintrust
-     provider_type: inline::braintrust
-     config:
-       openai_api_key: ${env.OPENAI_API_KEY:=}
-   tool_runtime:
-   - provider_id: brave-search
-     provider_type: remote::brave-search
-     config:
-       api_key: ${env.BRAVE_SEARCH_API_KEY:=}
-       max_results: 3
-   - provider_id: tavily-search
-     provider_type: remote::tavily-search
-     config:
-       api_key: ${env.TAVILY_SEARCH_API_KEY:=}
-       max_results: 3
-   - provider_id: rag-runtime
-     provider_type: inline::rag-runtime
-   - provider_id: model-context-protocol
-     provider_type: remote::model-context-protocol
- storage:
-   backends:
-     kv_default:
-       type: kv_sqlite
-       db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/kvstore.db
-     sql_default:
-       type: sql_sqlite
-       db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/sql_store.db
-   stores:
-     metadata:
-       namespace: registry
-       backend: kv_default
-     inference:
-       table_name: inference_store
-       backend: sql_default
-       max_write_queue_size: 10000
-       num_writers: 4
-     conversations:
-       table_name: openai_conversations
-       backend: sql_default
-     prompts:
-       namespace: prompts
-       backend: kv_default
- registered_resources:
-   models:
-   - metadata: {}
-     model_id: ${env.INFERENCE_MODEL}
-     provider_id: meta-reference-inference
-     model_type: llm
-   - metadata:
-       embedding_dimension: 768
-     model_id: nomic-embed-text-v1.5
-     provider_id: sentence-transformers
-     model_type: embedding
-   shields: []
-   vector_dbs: []
-   datasets: []
-   scoring_fns: []
-   benchmarks: []
-   tool_groups:
-   - toolgroup_id: builtin::websearch
-     provider_id: tavily-search
-   - toolgroup_id: builtin::rag
-     provider_id: rag-runtime
- server:
-   port: 8321
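
The removed run config above leans on llama-stack's `${env.VAR}` / `${env.VAR:=default}` substitution syntax throughout. A rough sketch of those semantics in Python (an illustrative resolver only, not llama-stack's actual implementation, which also coerces typed defaults such as null and numbers):

import os
import re

# Illustrative only: substitute ${env.VAR} and ${env.VAR:=default} references.
_ENV_REF = re.compile(r"\$\{env\.([A-Za-z_][A-Za-z0-9_]*)(?::=([^}]*))?\}")

def resolve_env_refs(value: str) -> str:
    def _sub(match: re.Match) -> str:
        name, default = match.group(1), match.group(2)
        if name in os.environ:
            return os.environ[name]
        if default is not None:
            return default  # assumption: the real resolver treats "null" specially
        raise KeyError(f"required environment variable {name} is not set")

    return _ENV_REF.sub(_sub, value)

# With SQLITE_STORE_DIR unset, the kv_default db_path above falls back to:
print(resolve_env_refs("${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/kvstore.db"))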
llama_stack/distributions/meta-reference-gpu/meta_reference.py
@@ -1,163 +0,0 @@
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the terms described in the LICENSE file in
- # the root directory of this source tree.
-
- from pathlib import Path
-
- from llama_stack.core.datatypes import (
-     BuildProvider,
-     ModelInput,
-     Provider,
-     ShieldInput,
-     ToolGroupInput,
- )
- from llama_stack.distributions.template import DistributionTemplate, RunConfigSettings
- from llama_stack.providers.inline.inference.meta_reference import (
-     MetaReferenceInferenceConfig,
- )
- from llama_stack.providers.inline.inference.sentence_transformers import (
-     SentenceTransformersInferenceConfig,
- )
- from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
- from llama_stack_api import ModelType
-
-
- def get_distribution_template() -> DistributionTemplate:
-     providers = {
-         "inference": [BuildProvider(provider_type="inline::meta-reference")],
-         "vector_io": [
-             BuildProvider(provider_type="inline::faiss"),
-             BuildProvider(provider_type="remote::chromadb"),
-             BuildProvider(provider_type="remote::pgvector"),
-         ],
-         "safety": [BuildProvider(provider_type="inline::llama-guard")],
-         "agents": [BuildProvider(provider_type="inline::meta-reference")],
-         "eval": [BuildProvider(provider_type="inline::meta-reference")],
-         "datasetio": [
-             BuildProvider(provider_type="remote::huggingface"),
-             BuildProvider(provider_type="inline::localfs"),
-         ],
-         "scoring": [
-             BuildProvider(provider_type="inline::basic"),
-             BuildProvider(provider_type="inline::llm-as-judge"),
-             BuildProvider(provider_type="inline::braintrust"),
-         ],
-         "tool_runtime": [
-             BuildProvider(provider_type="remote::brave-search"),
-             BuildProvider(provider_type="remote::tavily-search"),
-             BuildProvider(provider_type="inline::rag-runtime"),
-             BuildProvider(provider_type="remote::model-context-protocol"),
-         ],
-     }
-     name = "meta-reference-gpu"
-     inference_provider = Provider(
-         provider_id="meta-reference-inference",
-         provider_type="inline::meta-reference",
-         config=MetaReferenceInferenceConfig.sample_run_config(
-             model="${env.INFERENCE_MODEL}",
-             checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:=null}",
-         ),
-     )
-     embedding_provider = Provider(
-         provider_id="sentence-transformers",
-         provider_type="inline::sentence-transformers",
-         config=SentenceTransformersInferenceConfig.sample_run_config(),
-     )
-     vector_io_provider = Provider(
-         provider_id="faiss",
-         provider_type="inline::faiss",
-         config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-     )
-
-     inference_model = ModelInput(
-         model_id="${env.INFERENCE_MODEL}",
-         provider_id="meta-reference-inference",
-     )
-     embedding_model = ModelInput(
-         model_id="nomic-embed-text-v1.5",
-         provider_id="sentence-transformers",
-         model_type=ModelType.embedding,
-         metadata={
-             "embedding_dimension": 768,
-         },
-     )
-     safety_model = ModelInput(
-         model_id="${env.SAFETY_MODEL}",
-         provider_id="meta-reference-safety",
-     )
-     default_tool_groups = [
-         ToolGroupInput(
-             toolgroup_id="builtin::websearch",
-             provider_id="tavily-search",
-         ),
-         ToolGroupInput(
-             toolgroup_id="builtin::rag",
-             provider_id="rag-runtime",
-         ),
-     ]
-
-     return DistributionTemplate(
-         name=name,
-         distro_type="self_hosted",
-         description="Use Meta Reference for running LLM inference",
-         template_path=Path(__file__).parent / "doc_template.md",
-         providers=providers,
-         run_configs={
-             "config.yaml": RunConfigSettings(
-                 provider_overrides={
-                     "inference": [inference_provider, embedding_provider],
-                     "vector_io": [vector_io_provider],
-                 },
-                 default_models=[inference_model, embedding_model],
-                 default_tool_groups=default_tool_groups,
-             ),
-             "run-with-safety.yaml": RunConfigSettings(
-                 provider_overrides={
-                     "inference": [
-                         inference_provider,
-                         embedding_provider,
-                         Provider(
-                             provider_id="meta-reference-safety",
-                             provider_type="inline::meta-reference",
-                             config=MetaReferenceInferenceConfig.sample_run_config(
-                                 model="${env.SAFETY_MODEL}",
-                                 checkpoint_dir="${env.SAFETY_CHECKPOINT_DIR:=null}",
-                             ),
-                         ),
-                     ],
-                     "vector_io": [vector_io_provider],
-                 },
-                 default_models=[
-                     inference_model,
-                     safety_model,
-                     embedding_model,
-                 ],
-                 default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
-                 default_tool_groups=default_tool_groups,
-             ),
-         },
-         run_config_env_vars={
-             "LLAMA_STACK_PORT": (
-                 "8321",
-                 "Port for the Llama Stack distribution server",
-             ),
-             "INFERENCE_MODEL": (
-                 "meta-llama/Llama-3.2-3B-Instruct",
-                 "Inference model loaded into the Meta Reference server",
-             ),
-             "INFERENCE_CHECKPOINT_DIR": (
-                 "null",
-                 "Directory containing the Meta Reference model checkpoint",
-             ),
-             "SAFETY_MODEL": (
-                 "meta-llama/Llama-Guard-3-1B",
-                 "Name of the safety (Llama-Guard) model to use",
-             ),
-             "SAFETY_CHECKPOINT_DIR": (
-                 "null",
-                 "Directory containing the Llama-Guard model checkpoint",
-             ),
-         },
-     )
llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml
@@ -1,155 +0,0 @@
- version: 2
- image_name: meta-reference-gpu
- apis:
- - agents
- - datasetio
- - eval
- - inference
- - safety
- - scoring
- - tool_runtime
- - vector_io
- providers:
-   inference:
-   - provider_id: meta-reference-inference
-     provider_type: inline::meta-reference
-     config:
-       model: ${env.INFERENCE_MODEL}
-       checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:=null}
-       quantization:
-         type: ${env.QUANTIZATION_TYPE:=bf16}
-       model_parallel_size: ${env.MODEL_PARALLEL_SIZE:=0}
-       max_batch_size: ${env.MAX_BATCH_SIZE:=1}
-       max_seq_len: ${env.MAX_SEQ_LEN:=4096}
-   - provider_id: sentence-transformers
-     provider_type: inline::sentence-transformers
-   - provider_id: meta-reference-safety
-     provider_type: inline::meta-reference
-     config:
-       model: ${env.SAFETY_MODEL}
-       checkpoint_dir: ${env.SAFETY_CHECKPOINT_DIR:=null}
-       quantization:
-         type: ${env.QUANTIZATION_TYPE:=bf16}
-       model_parallel_size: ${env.MODEL_PARALLEL_SIZE:=0}
-       max_batch_size: ${env.MAX_BATCH_SIZE:=1}
-       max_seq_len: ${env.MAX_SEQ_LEN:=4096}
-   vector_io:
-   - provider_id: faiss
-     provider_type: inline::faiss
-     config:
-       persistence:
-         namespace: vector_io::faiss
-         backend: kv_default
-   safety:
-   - provider_id: llama-guard
-     provider_type: inline::llama-guard
-     config:
-       excluded_categories: []
-   agents:
-   - provider_id: meta-reference
-     provider_type: inline::meta-reference
-     config:
-       persistence:
-         agent_state:
-           namespace: agents
-           backend: kv_default
-         responses:
-           table_name: responses
-           backend: sql_default
-           max_write_queue_size: 10000
-           num_writers: 4
-   eval:
-   - provider_id: meta-reference
-     provider_type: inline::meta-reference
-     config:
-       kvstore:
-         namespace: eval
-         backend: kv_default
-   datasetio:
-   - provider_id: huggingface
-     provider_type: remote::huggingface
-     config:
-       kvstore:
-         namespace: datasetio::huggingface
-         backend: kv_default
-   - provider_id: localfs
-     provider_type: inline::localfs
-     config:
-       kvstore:
-         namespace: datasetio::localfs
-         backend: kv_default
-   scoring:
-   - provider_id: basic
-     provider_type: inline::basic
-   - provider_id: llm-as-judge
-     provider_type: inline::llm-as-judge
-   - provider_id: braintrust
-     provider_type: inline::braintrust
-     config:
-       openai_api_key: ${env.OPENAI_API_KEY:=}
-   tool_runtime:
-   - provider_id: brave-search
-     provider_type: remote::brave-search
-     config:
-       api_key: ${env.BRAVE_SEARCH_API_KEY:=}
-       max_results: 3
-   - provider_id: tavily-search
-     provider_type: remote::tavily-search
-     config:
-       api_key: ${env.TAVILY_SEARCH_API_KEY:=}
-       max_results: 3
-   - provider_id: rag-runtime
-     provider_type: inline::rag-runtime
-   - provider_id: model-context-protocol
-     provider_type: remote::model-context-protocol
- storage:
-   backends:
-     kv_default:
-       type: kv_sqlite
-       db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/kvstore.db
-     sql_default:
-       type: sql_sqlite
-       db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/sql_store.db
-   stores:
-     metadata:
-       namespace: registry
-       backend: kv_default
-     inference:
-       table_name: inference_store
-       backend: sql_default
-       max_write_queue_size: 10000
-       num_writers: 4
-     conversations:
-       table_name: openai_conversations
-       backend: sql_default
-     prompts:
-       namespace: prompts
-       backend: kv_default
- registered_resources:
-   models:
-   - metadata: {}
-     model_id: ${env.INFERENCE_MODEL}
-     provider_id: meta-reference-inference
-     model_type: llm
-   - metadata: {}
-     model_id: ${env.SAFETY_MODEL}
-     provider_id: meta-reference-safety
-     model_type: llm
-   - metadata:
-       embedding_dimension: 768
-     model_id: nomic-embed-text-v1.5
-     provider_id: sentence-transformers
-     model_type: embedding
-   shields:
-   - shield_id: ${env.SAFETY_MODEL}
-   vector_dbs: []
-   datasets: []
-   scoring_fns: []
-   benchmarks: []
-   tool_groups:
-   - toolgroup_id: builtin::websearch
-     provider_id: tavily-search
-   - toolgroup_id: builtin::rag
-     provider_id: rag-runtime
- server:
-   port: 8321
llama_stack/models/llama/hadamard_utils.py
@@ -1,88 +0,0 @@
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the terms described in the LICENSE file in
- # the root directory of this source tree.
-
- import math
- import re
-
- import torch
- from torch import nn
-
-
- def hadamard_transform(x: torch.Tensor) -> torch.Tensor:
-     """Hadamard transform.
-
-     This function performs the Hadamard transform on the input tensor 'x'.
-     The Hadamard transform is a linear transformation that multiplies the input
-     tensor by the Hadamard matrix of dimension n x n, where n is the size of
-     the last dimension of the input tensor.
-     """
-     *_, n = x.shape
-     m = int(math.log2(n))
-     assert n == 1 << m, "n must be a power of 2"
-     x = x[..., None]
-     inv_sqrt2 = 0.5**0.5
-     for _ in range(m):
-         top = x[..., ::2, :] + x[..., 1::2, :]
-         bot = x[..., ::2, :] - x[..., 1::2, :]
-         x = torch.cat((top, bot), dim=-1)
-         x *= inv_sqrt2
-     res = x.squeeze(-2)
-     return res
-
-
- class HadamardModule(torch.nn.Module):
-     """A module that applies the Hadamard transform to the input tensor.
-
-     Args:
-         group_size: The size of the groups that the input tensor will be divided into
-             before applying the Hadamard transform.
-     """
-
-     def __init__(self, group_size: int) -> None:
-         super().__init__()
-         self.group_size = group_size
-
-     def forward(self, x: torch.Tensor) -> torch.Tensor:
-         reshape_back = False
-         orig_shape = x.shape
-         if self.group_size != x.shape[-1]:
-             reshape_back = True
-             x = x.reshape(-1, x.shape[-1] // self.group_size, self.group_size)
-         x = hadamard_transform(x)
-         if reshape_back:
-             x = x.reshape(orig_shape)
-         return x
-
-
- def add_hadamard_transform_for_spinquant(model: torch.nn.Module, prefix: str = "") -> None:
-     """
-     Adds a Hadamard transform to the last linear layer of each feedforward network (FFN) in the model.
-     This function recursively traverses the model's children and looks for layers that match the pattern
-     "layers.<digit>.feed_forward.w2", where <digit> is one or more digits. When such a layer is found,
-     it is replaced with a new sequential module that consists of a HadamardModule followed by the original
-     layer. The HadamardModule applies the Hadamard transform to the input tensor.
-
-     See `SpinQuant <https://arxiv.org/abs/2405.16406>`_ paper for more details.
-
-     Args:
-         model: An instance of 'torch.nn.Module' (e.g., Transformer model).
-         prefix: A string prefix to add to the full name of each child module.
-
-     Returns:
-         None
-     """
-
-     pattern_last_linear_ffn = r"layers.\d+.feed_forward.w2"
-     for module_name, module in model.named_children():
-         child_full_name = prefix + "." + module_name
-         if re.search(pattern_last_linear_ffn, child_full_name):
-             # Module matching this pattern should be nn.Linear with in_features
-             assert isinstance(module, nn.Linear), f"Expected nn.Linear, got {type(module)}"
-             new_module = nn.Sequential(HadamardModule(group_size=module.in_features), module)
-             del module
-             setattr(model, module_name, new_module)
-         else:
-             add_hadamard_transform_for_spinquant(module, (prefix + "." if prefix else prefix) + module_name)
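
As its docstring notes, the removed hadamard_transform multiplies the last dimension by a normalized n x n Hadamard matrix using an O(n log n) butterfly recursion; because that matrix is orthogonal, HadamardModule can rotate activations in front of w2 without changing the layer's function once the weights are rotated to match, which is the idea SpinQuant exploits. A self-contained sanity check (a sketch, not package code) that the recursion agrees with the explicit matrix from Sylvester's construction:

import math

import torch

def fwht(x: torch.Tensor) -> torch.Tensor:
    # Same butterfly recursion as the removed hadamard_transform.
    *_, n = x.shape
    m = int(math.log2(n))
    assert n == 1 << m, "n must be a power of 2"
    x = x[..., None]
    for _ in range(m):
        top = x[..., ::2, :] + x[..., 1::2, :]  # pairwise sums
        bot = x[..., ::2, :] - x[..., 1::2, :]  # pairwise differences
        x = torch.cat((top, bot), dim=-1) * (0.5**0.5)  # 1/sqrt(2) per stage
    return x.squeeze(-2)

def hadamard_matrix(n: int) -> torch.Tensor:
    # Sylvester's construction: H_{2k} = [[H_k, H_k], [H_k, -H_k]].
    H = torch.ones(1, 1)
    while H.shape[0] < n:
        H = torch.cat([torch.cat([H, H], dim=1), torch.cat([H, -H], dim=1)], dim=0)
    return H / math.sqrt(n)  # overall 1/sqrt(n) matches the per-stage scaling

x = torch.randn(16)
assert torch.allclose(fwht(x), hadamard_matrix(16) @ x, atol=1e-5)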
llama_stack/models/llama/llama3/args.py
@@ -1,74 +0,0 @@
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the terms described in the LICENSE file in
- # the root directory of this source tree.
-
- from dataclasses import dataclass
- from enum import Enum
-
-
- class QuantizationScheme(Enum):
-     int4_weight_int8_dynamic_activation = "int4_weight_int8_dynamic_activation"
-
-
- @dataclass
- class QuantizationArgs:
-     scheme: QuantizationScheme | None = None
-     group_size: int | None = None
-     spinquant: bool = False
-
-     def __init__(self, **kwargs):
-         for k, v in kwargs.items():
-             if k == "scheme":
-                 setattr(self, k, QuantizationScheme(v))
-             else:
-                 if hasattr(self, k):
-                     setattr(self, k, v)
-
-
- @dataclass
- class LoRAArgs:
-     rank: int
-     scale: float
-
-
- @dataclass
- class ModelArgs:
-     dim: int = 4096
-     n_layers: int = 32
-     n_heads: int = 32
-     n_kv_heads: int | None = None
-     vocab_size: int = -1
-     multiple_of: int = 256  # make SwiGLU hidden layer size multiple of large power of 2
-     ffn_dim_multiplier: float | None = None
-     norm_eps: float = 1e-5
-     rope_theta: float = 500000
-     use_scaled_rope: bool = False
-
-     max_batch_size: int = 32
-     max_seq_len: int = 2048
-
-     # vision model params
-     vision_chunk_size: int = -1  # image resolution for image models
-     vision_max_num_chunks: int = 4
-     vision_num_cross_attention_layers: int = -1
-
-     quantization_args: QuantizationArgs | None = None
-     lora_args: LoRAArgs | None = None
-
-     def __init__(self, **kwargs):
-         for k, v in kwargs.items():
-             if k == "lora_args":
-                 setattr(self, k, LoRAArgs(**v))
-             elif k == "quantization_args":
-                 setattr(self, k, QuantizationArgs(**v))
-             else:
-                 if hasattr(self, k):
-                     setattr(self, k, v)
-
-         if self.n_kv_heads is None:
-             self.n_kv_heads = self.n_heads
-         assert self.n_kv_heads <= self.n_heads
-         assert self.n_heads % self.n_kv_heads == 0
-         assert self.dim % self.n_heads == 0
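
The asserts at the end of ModelArgs.__init__ pin down the attention geometry: dim must split evenly across n_heads, and n_heads across n_kv_heads (grouped-query attention whenever n_kv_heads < n_heads; it defaults to n_heads, i.e. ordinary multi-head attention). Worked out under the defaults above (illustrative arithmetic only, not package code):

# dim=4096, n_heads=32, n_kv_heads=None -> n_kv_heads = n_heads = 32
dim, n_heads, n_kv_heads = 4096, 32, 32
assert n_kv_heads <= n_heads and n_heads % n_kv_heads == 0 and dim % n_heads == 0

head_dim = dim // n_heads               # 128 dimensions per attention head
queries_per_kv = n_heads // n_kv_heads  # 1 -> every query head has its own KV head
print(head_dim, queries_per_kv)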