llama-stack 0.4.3__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (311) hide show
  1. llama_stack/cli/stack/_list_deps.py +11 -7
  2. llama_stack/cli/stack/run.py +3 -25
  3. llama_stack/core/access_control/datatypes.py +78 -0
  4. llama_stack/core/configure.py +2 -2
  5. {llama_stack_api/internal → llama_stack/core/connectors}/__init__.py +2 -2
  6. llama_stack/core/connectors/connectors.py +162 -0
  7. llama_stack/core/conversations/conversations.py +61 -58
  8. llama_stack/core/datatypes.py +54 -8
  9. llama_stack/core/library_client.py +60 -13
  10. llama_stack/core/prompts/prompts.py +43 -42
  11. llama_stack/core/routers/datasets.py +20 -17
  12. llama_stack/core/routers/eval_scoring.py +143 -53
  13. llama_stack/core/routers/inference.py +20 -9
  14. llama_stack/core/routers/safety.py +30 -42
  15. llama_stack/core/routers/vector_io.py +15 -7
  16. llama_stack/core/routing_tables/models.py +42 -3
  17. llama_stack/core/routing_tables/scoring_functions.py +19 -19
  18. llama_stack/core/routing_tables/shields.py +20 -17
  19. llama_stack/core/routing_tables/vector_stores.py +8 -5
  20. llama_stack/core/server/auth.py +192 -17
  21. llama_stack/core/server/fastapi_router_registry.py +40 -5
  22. llama_stack/core/server/server.py +24 -5
  23. llama_stack/core/stack.py +54 -10
  24. llama_stack/core/storage/datatypes.py +9 -0
  25. llama_stack/core/store/registry.py +1 -1
  26. llama_stack/core/utils/exec.py +2 -2
  27. llama_stack/core/utils/type_inspection.py +16 -2
  28. llama_stack/distributions/dell/config.yaml +4 -1
  29. llama_stack/distributions/dell/doc_template.md +209 -0
  30. llama_stack/distributions/dell/run-with-safety.yaml +4 -1
  31. llama_stack/distributions/nvidia/config.yaml +4 -1
  32. llama_stack/distributions/nvidia/doc_template.md +170 -0
  33. llama_stack/distributions/nvidia/run-with-safety.yaml +4 -1
  34. llama_stack/distributions/oci/config.yaml +4 -1
  35. llama_stack/distributions/oci/doc_template.md +140 -0
  36. llama_stack/distributions/open-benchmark/config.yaml +9 -1
  37. llama_stack/distributions/postgres-demo/config.yaml +1 -1
  38. llama_stack/distributions/starter/build.yaml +62 -0
  39. llama_stack/distributions/starter/config.yaml +22 -3
  40. llama_stack/distributions/starter/run-with-postgres-store.yaml +22 -3
  41. llama_stack/distributions/starter/starter.py +13 -1
  42. llama_stack/distributions/starter-gpu/build.yaml +62 -0
  43. llama_stack/distributions/starter-gpu/config.yaml +22 -3
  44. llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +22 -3
  45. llama_stack/distributions/template.py +10 -2
  46. llama_stack/distributions/watsonx/config.yaml +4 -1
  47. llama_stack/log.py +1 -0
  48. llama_stack/models/llama/resources/dog.jpg +0 -0
  49. llama_stack/models/llama/resources/pasta.jpeg +0 -0
  50. llama_stack/models/llama/resources/small_dog.jpg +0 -0
  51. llama_stack/providers/inline/agents/meta_reference/__init__.py +1 -0
  52. llama_stack/providers/inline/agents/meta_reference/agents.py +58 -61
  53. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +187 -60
  54. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +99 -22
  55. llama_stack/providers/inline/agents/meta_reference/responses/types.py +2 -1
  56. llama_stack/providers/inline/agents/meta_reference/responses/utils.py +4 -1
  57. llama_stack/providers/inline/agents/meta_reference/safety.py +2 -2
  58. llama_stack/providers/inline/batches/reference/batches.py +2 -1
  59. llama_stack/providers/inline/eval/meta_reference/eval.py +40 -32
  60. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.h +9 -0
  61. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.swift +189 -0
  62. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/Parsing.swift +238 -0
  63. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/PromptTemplate.swift +12 -0
  64. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/SystemPrompts.swift +89 -0
  65. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.pbxproj +550 -0
  66. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
  67. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
  68. llama_stack/providers/inline/post_training/huggingface/post_training.py +33 -38
  69. llama_stack/providers/inline/post_training/huggingface/utils.py +2 -5
  70. llama_stack/providers/inline/post_training/torchtune/common/utils.py +5 -9
  71. llama_stack/providers/inline/post_training/torchtune/post_training.py +28 -33
  72. llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +2 -4
  73. llama_stack/providers/inline/safety/code_scanner/code_scanner.py +12 -15
  74. llama_stack/providers/inline/safety/llama_guard/llama_guard.py +20 -24
  75. llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +11 -17
  76. llama_stack/providers/inline/scoring/basic/scoring.py +13 -17
  77. llama_stack/providers/inline/scoring/braintrust/braintrust.py +15 -15
  78. llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +13 -17
  79. llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +1 -1
  80. llama_stack/providers/registry/agents.py +1 -0
  81. llama_stack/providers/registry/inference.py +1 -9
  82. llama_stack/providers/registry/vector_io.py +136 -16
  83. llama_stack/providers/remote/datasetio/nvidia/README.md +74 -0
  84. llama_stack/providers/remote/eval/nvidia/README.md +134 -0
  85. llama_stack/providers/remote/eval/nvidia/eval.py +22 -21
  86. llama_stack/providers/remote/files/s3/README.md +266 -0
  87. llama_stack/providers/remote/files/s3/config.py +5 -3
  88. llama_stack/providers/remote/files/s3/files.py +2 -2
  89. llama_stack/providers/remote/inference/gemini/gemini.py +4 -0
  90. llama_stack/providers/remote/inference/nvidia/NVIDIA.md +203 -0
  91. llama_stack/providers/remote/inference/openai/openai.py +2 -0
  92. llama_stack/providers/remote/inference/together/together.py +4 -0
  93. llama_stack/providers/remote/inference/vertexai/config.py +3 -3
  94. llama_stack/providers/remote/inference/vertexai/vertexai.py +5 -2
  95. llama_stack/providers/remote/inference/vllm/config.py +37 -18
  96. llama_stack/providers/remote/inference/vllm/vllm.py +0 -3
  97. llama_stack/providers/remote/inference/watsonx/watsonx.py +4 -0
  98. llama_stack/providers/remote/post_training/nvidia/README.md +151 -0
  99. llama_stack/providers/remote/post_training/nvidia/models.py +3 -11
  100. llama_stack/providers/remote/post_training/nvidia/post_training.py +31 -33
  101. llama_stack/providers/remote/safety/bedrock/bedrock.py +10 -27
  102. llama_stack/providers/remote/safety/nvidia/README.md +78 -0
  103. llama_stack/providers/remote/safety/nvidia/nvidia.py +9 -25
  104. llama_stack/providers/remote/safety/sambanova/sambanova.py +13 -11
  105. llama_stack/providers/remote/vector_io/elasticsearch/__init__.py +17 -0
  106. llama_stack/providers/remote/vector_io/elasticsearch/config.py +32 -0
  107. llama_stack/providers/remote/vector_io/elasticsearch/elasticsearch.py +463 -0
  108. llama_stack/providers/remote/vector_io/oci/__init__.py +22 -0
  109. llama_stack/providers/remote/vector_io/oci/config.py +41 -0
  110. llama_stack/providers/remote/vector_io/oci/oci26ai.py +595 -0
  111. llama_stack/providers/remote/vector_io/pgvector/config.py +69 -2
  112. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +255 -6
  113. llama_stack/providers/remote/vector_io/qdrant/qdrant.py +62 -38
  114. llama_stack/providers/utils/bedrock/client.py +3 -3
  115. llama_stack/providers/utils/bedrock/config.py +7 -7
  116. llama_stack/providers/utils/inference/__init__.py +0 -25
  117. llama_stack/providers/utils/inference/embedding_mixin.py +4 -0
  118. llama_stack/providers/utils/inference/http_client.py +239 -0
  119. llama_stack/providers/utils/inference/litellm_openai_mixin.py +6 -0
  120. llama_stack/providers/utils/inference/model_registry.py +148 -2
  121. llama_stack/providers/utils/inference/openai_compat.py +1 -158
  122. llama_stack/providers/utils/inference/openai_mixin.py +42 -2
  123. llama_stack/providers/utils/inference/prompt_adapter.py +0 -209
  124. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +92 -5
  125. llama_stack/providers/utils/memory/vector_store.py +46 -19
  126. llama_stack/providers/utils/responses/responses_store.py +40 -6
  127. llama_stack/providers/utils/safety.py +114 -0
  128. llama_stack/providers/utils/tools/mcp.py +44 -3
  129. llama_stack/testing/api_recorder.py +9 -3
  130. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0.dist-info}/METADATA +14 -2
  131. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0.dist-info}/RECORD +135 -279
  132. llama_stack-0.5.0.dist-info/top_level.txt +1 -0
  133. llama_stack/distributions/meta-reference-gpu/__init__.py +0 -7
  134. llama_stack/distributions/meta-reference-gpu/config.yaml +0 -140
  135. llama_stack/distributions/meta-reference-gpu/meta_reference.py +0 -163
  136. llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +0 -155
  137. llama_stack/models/llama/hadamard_utils.py +0 -88
  138. llama_stack/models/llama/llama3/args.py +0 -74
  139. llama_stack/models/llama/llama3/generation.py +0 -378
  140. llama_stack/models/llama/llama3/model.py +0 -304
  141. llama_stack/models/llama/llama3/multimodal/__init__.py +0 -12
  142. llama_stack/models/llama/llama3/multimodal/encoder_utils.py +0 -180
  143. llama_stack/models/llama/llama3/multimodal/image_transform.py +0 -409
  144. llama_stack/models/llama/llama3/multimodal/model.py +0 -1430
  145. llama_stack/models/llama/llama3/multimodal/utils.py +0 -26
  146. llama_stack/models/llama/llama3/quantization/__init__.py +0 -5
  147. llama_stack/models/llama/llama3/quantization/loader.py +0 -316
  148. llama_stack/models/llama/llama3_1/__init__.py +0 -12
  149. llama_stack/models/llama/llama3_1/prompt_format.md +0 -358
  150. llama_stack/models/llama/llama3_1/prompts.py +0 -258
  151. llama_stack/models/llama/llama3_2/__init__.py +0 -5
  152. llama_stack/models/llama/llama3_2/prompts_text.py +0 -229
  153. llama_stack/models/llama/llama3_2/prompts_vision.py +0 -126
  154. llama_stack/models/llama/llama3_2/text_prompt_format.md +0 -286
  155. llama_stack/models/llama/llama3_2/vision_prompt_format.md +0 -141
  156. llama_stack/models/llama/llama3_3/__init__.py +0 -5
  157. llama_stack/models/llama/llama3_3/prompts.py +0 -259
  158. llama_stack/models/llama/llama4/args.py +0 -107
  159. llama_stack/models/llama/llama4/ffn.py +0 -58
  160. llama_stack/models/llama/llama4/moe.py +0 -214
  161. llama_stack/models/llama/llama4/preprocess.py +0 -435
  162. llama_stack/models/llama/llama4/quantization/__init__.py +0 -5
  163. llama_stack/models/llama/llama4/quantization/loader.py +0 -226
  164. llama_stack/models/llama/llama4/vision/__init__.py +0 -5
  165. llama_stack/models/llama/llama4/vision/embedding.py +0 -210
  166. llama_stack/models/llama/llama4/vision/encoder.py +0 -412
  167. llama_stack/models/llama/quantize_impls.py +0 -316
  168. llama_stack/providers/inline/inference/meta_reference/__init__.py +0 -20
  169. llama_stack/providers/inline/inference/meta_reference/common.py +0 -24
  170. llama_stack/providers/inline/inference/meta_reference/config.py +0 -68
  171. llama_stack/providers/inline/inference/meta_reference/generators.py +0 -201
  172. llama_stack/providers/inline/inference/meta_reference/inference.py +0 -542
  173. llama_stack/providers/inline/inference/meta_reference/model_parallel.py +0 -77
  174. llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +0 -353
  175. llama_stack-0.4.3.dist-info/top_level.txt +0 -2
  176. llama_stack_api/__init__.py +0 -945
  177. llama_stack_api/admin/__init__.py +0 -45
  178. llama_stack_api/admin/api.py +0 -72
  179. llama_stack_api/admin/fastapi_routes.py +0 -117
  180. llama_stack_api/admin/models.py +0 -113
  181. llama_stack_api/agents.py +0 -173
  182. llama_stack_api/batches/__init__.py +0 -40
  183. llama_stack_api/batches/api.py +0 -53
  184. llama_stack_api/batches/fastapi_routes.py +0 -113
  185. llama_stack_api/batches/models.py +0 -78
  186. llama_stack_api/benchmarks/__init__.py +0 -43
  187. llama_stack_api/benchmarks/api.py +0 -39
  188. llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  189. llama_stack_api/benchmarks/models.py +0 -109
  190. llama_stack_api/common/__init__.py +0 -5
  191. llama_stack_api/common/content_types.py +0 -101
  192. llama_stack_api/common/errors.py +0 -95
  193. llama_stack_api/common/job_types.py +0 -38
  194. llama_stack_api/common/responses.py +0 -77
  195. llama_stack_api/common/training_types.py +0 -47
  196. llama_stack_api/common/type_system.py +0 -146
  197. llama_stack_api/connectors.py +0 -146
  198. llama_stack_api/conversations.py +0 -270
  199. llama_stack_api/datasetio.py +0 -55
  200. llama_stack_api/datasets/__init__.py +0 -61
  201. llama_stack_api/datasets/api.py +0 -35
  202. llama_stack_api/datasets/fastapi_routes.py +0 -104
  203. llama_stack_api/datasets/models.py +0 -152
  204. llama_stack_api/datatypes.py +0 -373
  205. llama_stack_api/eval.py +0 -137
  206. llama_stack_api/file_processors/__init__.py +0 -27
  207. llama_stack_api/file_processors/api.py +0 -64
  208. llama_stack_api/file_processors/fastapi_routes.py +0 -78
  209. llama_stack_api/file_processors/models.py +0 -42
  210. llama_stack_api/files/__init__.py +0 -35
  211. llama_stack_api/files/api.py +0 -51
  212. llama_stack_api/files/fastapi_routes.py +0 -124
  213. llama_stack_api/files/models.py +0 -107
  214. llama_stack_api/inference.py +0 -1169
  215. llama_stack_api/inspect_api/__init__.py +0 -37
  216. llama_stack_api/inspect_api/api.py +0 -25
  217. llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  218. llama_stack_api/inspect_api/models.py +0 -28
  219. llama_stack_api/internal/kvstore.py +0 -28
  220. llama_stack_api/internal/sqlstore.py +0 -81
  221. llama_stack_api/llama_stack_api/__init__.py +0 -945
  222. llama_stack_api/llama_stack_api/admin/__init__.py +0 -45
  223. llama_stack_api/llama_stack_api/admin/api.py +0 -72
  224. llama_stack_api/llama_stack_api/admin/fastapi_routes.py +0 -117
  225. llama_stack_api/llama_stack_api/admin/models.py +0 -113
  226. llama_stack_api/llama_stack_api/agents.py +0 -173
  227. llama_stack_api/llama_stack_api/batches/__init__.py +0 -40
  228. llama_stack_api/llama_stack_api/batches/api.py +0 -53
  229. llama_stack_api/llama_stack_api/batches/fastapi_routes.py +0 -113
  230. llama_stack_api/llama_stack_api/batches/models.py +0 -78
  231. llama_stack_api/llama_stack_api/benchmarks/__init__.py +0 -43
  232. llama_stack_api/llama_stack_api/benchmarks/api.py +0 -39
  233. llama_stack_api/llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  234. llama_stack_api/llama_stack_api/benchmarks/models.py +0 -109
  235. llama_stack_api/llama_stack_api/common/__init__.py +0 -5
  236. llama_stack_api/llama_stack_api/common/content_types.py +0 -101
  237. llama_stack_api/llama_stack_api/common/errors.py +0 -95
  238. llama_stack_api/llama_stack_api/common/job_types.py +0 -38
  239. llama_stack_api/llama_stack_api/common/responses.py +0 -77
  240. llama_stack_api/llama_stack_api/common/training_types.py +0 -47
  241. llama_stack_api/llama_stack_api/common/type_system.py +0 -146
  242. llama_stack_api/llama_stack_api/connectors.py +0 -146
  243. llama_stack_api/llama_stack_api/conversations.py +0 -270
  244. llama_stack_api/llama_stack_api/datasetio.py +0 -55
  245. llama_stack_api/llama_stack_api/datasets/__init__.py +0 -61
  246. llama_stack_api/llama_stack_api/datasets/api.py +0 -35
  247. llama_stack_api/llama_stack_api/datasets/fastapi_routes.py +0 -104
  248. llama_stack_api/llama_stack_api/datasets/models.py +0 -152
  249. llama_stack_api/llama_stack_api/datatypes.py +0 -373
  250. llama_stack_api/llama_stack_api/eval.py +0 -137
  251. llama_stack_api/llama_stack_api/file_processors/__init__.py +0 -27
  252. llama_stack_api/llama_stack_api/file_processors/api.py +0 -64
  253. llama_stack_api/llama_stack_api/file_processors/fastapi_routes.py +0 -78
  254. llama_stack_api/llama_stack_api/file_processors/models.py +0 -42
  255. llama_stack_api/llama_stack_api/files/__init__.py +0 -35
  256. llama_stack_api/llama_stack_api/files/api.py +0 -51
  257. llama_stack_api/llama_stack_api/files/fastapi_routes.py +0 -124
  258. llama_stack_api/llama_stack_api/files/models.py +0 -107
  259. llama_stack_api/llama_stack_api/inference.py +0 -1169
  260. llama_stack_api/llama_stack_api/inspect_api/__init__.py +0 -37
  261. llama_stack_api/llama_stack_api/inspect_api/api.py +0 -25
  262. llama_stack_api/llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  263. llama_stack_api/llama_stack_api/inspect_api/models.py +0 -28
  264. llama_stack_api/llama_stack_api/internal/__init__.py +0 -9
  265. llama_stack_api/llama_stack_api/internal/kvstore.py +0 -28
  266. llama_stack_api/llama_stack_api/internal/sqlstore.py +0 -81
  267. llama_stack_api/llama_stack_api/models.py +0 -171
  268. llama_stack_api/llama_stack_api/openai_responses.py +0 -1468
  269. llama_stack_api/llama_stack_api/post_training.py +0 -370
  270. llama_stack_api/llama_stack_api/prompts.py +0 -203
  271. llama_stack_api/llama_stack_api/providers/__init__.py +0 -33
  272. llama_stack_api/llama_stack_api/providers/api.py +0 -16
  273. llama_stack_api/llama_stack_api/providers/fastapi_routes.py +0 -57
  274. llama_stack_api/llama_stack_api/providers/models.py +0 -24
  275. llama_stack_api/llama_stack_api/py.typed +0 -0
  276. llama_stack_api/llama_stack_api/rag_tool.py +0 -168
  277. llama_stack_api/llama_stack_api/resource.py +0 -37
  278. llama_stack_api/llama_stack_api/router_utils.py +0 -160
  279. llama_stack_api/llama_stack_api/safety.py +0 -132
  280. llama_stack_api/llama_stack_api/schema_utils.py +0 -208
  281. llama_stack_api/llama_stack_api/scoring.py +0 -93
  282. llama_stack_api/llama_stack_api/scoring_functions.py +0 -211
  283. llama_stack_api/llama_stack_api/shields.py +0 -93
  284. llama_stack_api/llama_stack_api/tools.py +0 -226
  285. llama_stack_api/llama_stack_api/vector_io.py +0 -941
  286. llama_stack_api/llama_stack_api/vector_stores.py +0 -53
  287. llama_stack_api/llama_stack_api/version.py +0 -9
  288. llama_stack_api/models.py +0 -171
  289. llama_stack_api/openai_responses.py +0 -1468
  290. llama_stack_api/post_training.py +0 -370
  291. llama_stack_api/prompts.py +0 -203
  292. llama_stack_api/providers/__init__.py +0 -33
  293. llama_stack_api/providers/api.py +0 -16
  294. llama_stack_api/providers/fastapi_routes.py +0 -57
  295. llama_stack_api/providers/models.py +0 -24
  296. llama_stack_api/py.typed +0 -0
  297. llama_stack_api/rag_tool.py +0 -168
  298. llama_stack_api/resource.py +0 -37
  299. llama_stack_api/router_utils.py +0 -160
  300. llama_stack_api/safety.py +0 -132
  301. llama_stack_api/schema_utils.py +0 -208
  302. llama_stack_api/scoring.py +0 -93
  303. llama_stack_api/scoring_functions.py +0 -211
  304. llama_stack_api/shields.py +0 -93
  305. llama_stack_api/tools.py +0 -226
  306. llama_stack_api/vector_io.py +0 -941
  307. llama_stack_api/vector_stores.py +0 -53
  308. llama_stack_api/version.py +0 -9
  309. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0.dist-info}/WHEEL +0 -0
  310. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0.dist-info}/entry_points.txt +0 -0
  311. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,141 +0,0 @@
1
- ## User and assistant conversation
2
-
3
- Here is a regular multi-turn user assistant conversation and how its formatted.
4
-
5
- ##### Input Prompt Format
6
- ```
7
- <|begin_of_text|><|start_header_id|>system<|end_header_id|>
8
-
9
- You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
10
-
11
- Who are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
12
-
13
-
14
- ```
15
-
16
- ##### Model Response Format
17
- ```
18
- I'm a helpful assistant, here to provide information, answer questions, and assist with tasks to the best of my abilities. I'm a large language model, which means I can understand and respond to natural language inputs, and I'm constantly learning and improving to provide more accurate and helpful responses.
19
-
20
- I can help with a wide range of topics, from general knowledge and trivia to more specific areas like science, history, technology, and more. I can also assist with tasks like language translation, text summarization, and even generating creative content like stories or dialogues.
21
-
22
- What can I help you with today?<|eot_id|>
23
- ```
24
-
25
-
26
- ##### Notes
27
- This format is unchanged from Llama3.1
28
-
29
- ## User and assistant conversation with Images
30
-
31
- This example shows how to pass and image to the model as part of the messages.
32
-
33
- ##### Input Prompt Format
34
- ```
35
- <|begin_of_text|><|start_header_id|>user<|end_header_id|>
36
-
37
- <|image|>Describe this image in two sentences<|eot_id|><|start_header_id|>assistant<|end_header_id|>
38
-
39
-
40
- ```
41
-
42
- ##### Model Response Format
43
- ```
44
- The image depicts a small dog standing on a skateboard, with its front paws firmly planted on the board and its back paws slightly raised. The dog's fur is predominantly brown and white, with a distinctive black stripe running down its back, and it is wearing a black collar around its neck.<|eot_id|>
45
- ```
46
-
47
-
48
- ##### Notes
49
-
50
- - The `<|image|>` tag is used to indicate presence of the image
51
- - The model isn't an early fusion model so doesn't actually translate an image into several tokens. Instead the cross-attention layers take input "on the side" from a vision encoder
52
- ![Image](mm-model.png)
53
- - Its important to postion the <|image|> tag appropriately in the prompt. Image will only attend to the subsequent text tokens
54
- - The <|image|> tag is part of the user message body, implying that it should only come after the header `<|start_header_id|>{role}<|end_header_id|>` in the message body
55
- - We recommend using a single image in one prompt
56
-
57
-
58
- ## Builtin and Zero Shot Tool Calling
59
-
60
-
61
- Llama3.2 vision models follow the same tool calling format as Llama3.1 models when inputs are text only.
62
- Use `Environment: ipython` to enable tools.
63
- Add `Tools: {{tool_name1}},{{tool_name2}}` for each of the builtin tools.
64
- The same builtin tools as Llama3.1 are available,
65
- - code_interpreter (for executing python code)
66
- - brave_search (to search the web)
67
- - wolfram_alpha (for querying wolfram alpha for mathematical questions)
68
-
69
-
70
- ##### Input Prompt Format
71
- ```
72
- <|begin_of_text|><|start_header_id|>system<|end_header_id|>
73
-
74
- Environment: ipython
75
- Tools: brave_search, wolfram_alpha
76
- Cutting Knowledge Date: December 2023
77
- Today Date: 23 September 2024
78
-
79
- You are a helpful assistant.
80
- <|eot_id|><|start_header_id|>user<|end_header_id|>
81
-
82
- Search the web for the latest price of 1oz gold?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
83
-
84
-
85
- ```
86
-
87
- ##### Model Response Format
88
- ```
89
- <|python_tag|>brave_search.call(query="latest price of 1oz gold")<|eom_id|>
90
- ```
91
-
92
-
93
- ##### Notes
94
-
95
- - Note the `<|python_tag|>` before `brave_search` function call.
96
- - The `<|eom_id|>` tag is used to indicate the end of the message.
97
- - Similar to Llama3.1, code_interpreter is not explicitly mentioned but is enabled via `Environment: ipython`.
98
- - Tool Calling does NOT work with images in the prompt as of now.
99
-
100
-
101
- ## Prompt format for base models
102
-
103
-
104
- For base models (Llama3.2-11B-Vision and Llama3.2-90B-Vision), the prompt format for a simple completion is as follows
105
-
106
-
107
- ##### Input Prompt Format
108
- ```
109
- <|begin_of_text|>The color of the sky is blue but sometimes it can also be
110
- ```
111
-
112
- ##### Model Response Format
113
- ```
114
- red, orange, pink, purple, and even black. The color of the sky is determined by the amount of sunlight that is scattered by the atmosphere and the amount of dust and water vapor present in the atmosphere. During sunrise and sunset, the sky can take on a range of colors due to the scattering of light by
115
- ```
116
-
117
-
118
- ##### Notes
119
- - Same as Llama3.1
120
-
121
- ## Prompt format for base models with Image
122
-
123
-
124
- For base models (Llama3.2-11B-Vision and Llama3.2-90B-Vision), here is an example of how the text completion format looks with an image,
125
-
126
-
127
- ##### Input Prompt Format
128
- ```
129
- <|begin_of_text|><|image|>If I had to write a haiku for this one
130
- ```
131
-
132
- ##### Model Response Format
133
- ```
134
- , it would be: A skateboarder's delight, a puppy on a board, a furry little thrill-seeker. This puppy is a true skateboarding enthusiast, always eager to hit the streets and show off his skills. He's a master of the board, gliding effortlessly across the pavement with grace and style.
135
- ```
136
-
137
-
138
- ##### Notes
139
- - Note the placement of the special tags <|begin_of_text|> and <|image|>
140
-
141
- Thank You!
@@ -1,5 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the terms described in the LICENSE file in
5
- # the root directory of this source tree.
@@ -1,259 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the terms described in the LICENSE file in
5
- # the root directory of this source tree.
6
-
7
- # Copyright (c) Meta Platforms, Inc. and affiliates.
8
- # All rights reserved.
9
- #
10
- # This source code is licensed under the terms described in the LICENSE file in
11
- # top-level folder for each specific model found within the models/ directory at
12
- # the top-level of this source tree.
13
-
14
- import json
15
- import textwrap
16
-
17
- from llama_stack.models.llama.datatypes import (
18
- BuiltinTool,
19
- RawMessage,
20
- StopReason,
21
- ToolCall,
22
- ToolPromptFormat,
23
- )
24
-
25
- from ..prompt_format import (
26
- # llama3_1_e2e_tool_call_dialog,
27
- TextCompletionContent,
28
- UseCase,
29
- llama3_1_builtin_tool_call_dialog,
30
- llama3_1_custom_tool_call_dialog,
31
- )
32
-
33
-
34
- def wolfram_alpha_response():
35
- return textwrap.dedent(
36
- """
37
- {
38
- "queryresult": {
39
- "success": true,
40
- "inputstring": "100th decimal of pi",
41
- "pods": [
42
- {
43
- "title": "Input interpretation",
44
- "subpods": [
45
- {
46
- "title": "",
47
- "plaintext": "100th digit | \u03c0"
48
- }
49
- ]
50
- },
51
- {
52
- "title": "Nearby digits",
53
- "subpods": [
54
- {
55
- "title": "",
56
- "plaintext": "...86208998628034825342117067982148086513282306647093..."
57
- }
58
- ]
59
- },
60
- {
61
- "title": "Result",
62
- "primary": true,
63
- "subpods": [
64
- {
65
- "title": "",
66
- "plaintext": "7"
67
- }
68
- ]
69
- }
70
- ]
71
- }
72
- }
73
- """
74
- )
75
-
76
-
77
- def usecases() -> list[UseCase | str]:
78
- return [
79
- textwrap.dedent(
80
- """
81
- # Llama 3.1 - Prompt Formats
82
- ## Tokens
83
- Here is a list of special tokens that are supported by Llama 3.1:
84
- - `<|begin_of_text|>`: Specifies the start of the prompt
85
- - `<|end_of_text|>`: Model will cease to generate more tokens. This token is generated only by the base models.
86
- - `<|finetune_right_pad_id|>`: This token is used for padding text sequences to the same length in a batch.
87
- - `<|start_header_id|>` and `<|end_header_id|>`: These tokens enclose the role for a particular message. The possible roles are: [system, user, assistant and tool]
88
- - `<|eom_id|>`: End of message. A message represents a possible stopping point for execution where the model can inform the executor that a tool call needs to be made. This is used for multi-step interactions between the model and any available tools. This token is emitted by the model when the Environment: ipython instruction is used in the system prompt, or if the model calls for a built-in tool.
89
- - `<|eot_id|>`: End of turn. Represents when the model has determined that it has finished interacting with the user message that initiated its response. This is used in two scenarios:
90
- - at the end of a direct interaction between the model and the user
91
- - at the end of multiple interactions between the model and any available tools
92
- This token signals to the executor that the model has finished generating a response.
93
- - `<|python_tag|>`: Is a special tag used in the model's response to signify a tool call.
94
- """
95
- ),
96
- textwrap.dedent(
97
- """
98
- There are 4 different roles that are supported by Llama 3.1
99
- - `system`: Sets the context in which to interact with the AI model. It typically includes rules, guidelines, or necessary information that helps the model respond effectively.
100
- - `user`: Represents the human interacting with the model. It includes the inputs, commands, and questions to the model.
101
- - `tool`: A new role introduced in Llama 3.1. This role is used to mark messages with the output of a tool call when sent back to the model from the executor. (The actual token used by the model for this role is "ipython".)
102
- - `assistant`: Represents the response generated by the AI model based on the context provided in the `system`, `tool` and `user` prompts.
103
- """
104
- ),
105
- UseCase(
106
- title="Llama 3.1 Base Model",
107
- description="Text completion for Llama 3.1 base model uses this format.",
108
- dialogs=[TextCompletionContent(content="Color of sky is blue but sometimes can also be")],
109
- notes="Note start special tag",
110
- ),
111
- "## Llama 3.1 Instruct Model",
112
- UseCase(
113
- title="User and assistant conversation",
114
- description="Here is a regular multi-turn user assistant conversation and how its formatted.",
115
- dialogs=[
116
- [
117
- RawMessage(role="system", content="You are a helpful assistant"),
118
- RawMessage(
119
- role="user",
120
- content="Answer who are you in the form of jeopardy?",
121
- ),
122
- ]
123
- ],
124
- notes="",
125
- ),
126
- "## Tool Calling Formats",
127
- textwrap.dedent(
128
- """
129
- The three built-in tools (brave_search, wolfram_alpha, and code interpreter) can be turned on using the system prompt:
130
- - Brave Search: Tool call to perform web searches.
131
- - Wolfram Alpha: Tool call to perform complex mathematical calculations.
132
- - Code Interpreter: Enables the model to output python code.
133
- """
134
- ),
135
- UseCase(
136
- title="Builtin Tool Calling",
137
- description=textwrap.dedent(
138
- """
139
- Here is an example of a conversation using brave search
140
- """
141
- ),
142
- dialogs=[llama3_1_builtin_tool_call_dialog()],
143
- notes=textwrap.dedent(
144
- """
145
- - Just including Environment: ipython turns on code interpreter; therefore, you don't need to specify code interpretation on the Tools: line. The model can generate python code which is interpreted by the executor, with the result provided back to the model.
146
- - The message body of the assistant response starts with a special tag <|python_tag|>
147
- - As alluded to above, in such an environment, the model can generate <|eom_id|> instead of just the standard <|eot_id|> . The latter indicates the turn is finished, while the former indicates continued multi-step reasoning. That is, the model is expecting a continuation message with the output of the tool call.
148
- - The model tool call response is of the form `tool.call(query="...")` wher tool is `brave_search` or `wolfram_alpha`
149
- """
150
- ),
151
- ),
152
- UseCase(
153
- title="Builtin Code Interpreter",
154
- description="Here is an actual example of model responding with code",
155
- dialogs=[
156
- [
157
- RawMessage(role="system", content="Environment: ipython"),
158
- RawMessage(
159
- role="user",
160
- content="Write code to check if number is prime, use that to see if the number 7 is prime",
161
- ),
162
- ],
163
- ],
164
- notes=textwrap.dedent(
165
- """
166
- - Model starts with <|python_tag|> and continues writing python code that it needs to be executed
167
- - No explicit mention of code_interpreter in system prompt. `Environment: ipython` implicitly enables it.
168
- """
169
- ),
170
- ),
171
- UseCase(
172
- title="Built-in tools full interaction",
173
- description="Here is a full interaction with the built-in tools including the tool response and the final assistant response.",
174
- dialogs=[
175
- [
176
- RawMessage(
177
- role="system",
178
- content="Environment: ipython\nTools: brave_search, wolfram_alpha\n",
179
- ),
180
- RawMessage(role="user", content="What is the 100th decimal of pi?"),
181
- RawMessage(
182
- role="assistant",
183
- content="",
184
- stop_reason=StopReason.end_of_message,
185
- tool_calls=[
186
- ToolCall(
187
- call_id="tool_call_id",
188
- tool_name=BuiltinTool.wolfram_alpha,
189
- arguments=json.dumps({"query": "100th decimal of pi"}),
190
- )
191
- ],
192
- ),
193
- RawMessage(
194
- role="tool",
195
- content=wolfram_alpha_response(),
196
- ),
197
- ],
198
- ],
199
- notes=textwrap.dedent(
200
- """
201
- - Note the `<|python_tag|>` in the assistant response.
202
- - Role is `tool` for the wolfram alpha response that is passed back to the model.
203
- - Final message from assistant has <|eot_id|> tag.
204
- """
205
- ),
206
- ),
207
- "## Zero shot tool calling",
208
- UseCase(
209
- title="JSON based tool calling",
210
- description=textwrap.dedent(
211
- """
212
- Llama models can now output custom tool calls from a single message to allow easier tool calling.
213
- The following prompts provide an example of how custom tools can be called from the output of the model.
214
- It's important to note that the model itself does not execute the calls; it provides structured output to facilitate calling by an executor.
215
- """
216
- ),
217
- dialogs=[llama3_1_custom_tool_call_dialog()],
218
- notes=textwrap.dedent(
219
- """
220
- - JSON format for providing tools needs name, description and parameters
221
- - Model responds with `<|python_tag|>` and `<|eom_id|>` as `Environment: ipython` was in the system prompt
222
- - Instructions for tools added as a user message
223
- - Only single tool calls are supported as of now
224
- """
225
- ),
226
- ),
227
- # FIXME: This is not working yet as expected
228
- # UseCase(
229
- # title="E2E tool call example",
230
- # description=textwrap.dedent(
231
- # """
232
- # Here is an example showing the whole multi-step turn by taking custom tool outputs and passing back to the model.
233
- # """
234
- # ),
235
- # dialogs=[
236
- # llama3_1_e2e_tool_call_dialog(
237
- # tool_prompt_format=ToolPromptFormat.function_tag
238
- # )
239
- # ],
240
- # notes="",
241
- # ),
242
- "## Example of a user defined tool calling",
243
- UseCase(
244
- title="`<function>` based tool calling",
245
- description=textwrap.dedent(
246
- """
247
- Here is an example of how you could also write custom instructions for model to do zero shot tool calling.
248
- In this example, we define a custom tool calling format using the `<function>` tag.
249
- """
250
- ),
251
- dialogs=[llama3_1_custom_tool_call_dialog(ToolPromptFormat.function_tag)],
252
- notes=textwrap.dedent(
253
- """
254
- - In this case, model does NOT respond with `<|python_tag|>` and ends with `<|eot_id|>`
255
- - Instructions for tools added as a user message
256
- """
257
- ),
258
- ),
259
- ]
@@ -1,107 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the terms described in the LICENSE file in
5
- # the root directory of this source tree.
6
-
7
- from enum import Enum
8
-
9
- from pydantic import BaseModel, model_validator
10
-
11
-
12
- class QuantizationScheme(Enum):
13
- int4_weight_int8_dynamic_activation = "int4_weight_int8_dynamic_activation"
14
-
15
-
16
- class QuantizationArgs(BaseModel):
17
- scheme: QuantizationScheme | None = None
18
- group_size: int | None = None
19
- spinquant: bool = False
20
-
21
-
22
- class LoRAArgs(BaseModel):
23
- rank: int
24
- scale: float
25
-
26
-
27
- class MoEArgs(BaseModel):
28
- num_experts: int = -1
29
- capacity_factor: float = 1.0 # capacity factor determines how many tokens each expert can choose
30
- auto_scale_F: bool = ( # noqa: N815
31
- True # if true, rescales hidden_dim such that number of activated params is same as equivalent dense layer
32
- )
33
- top_k: int = 1
34
- interleave_moe_layer_step: int = 1
35
-
36
-
37
- class Size(BaseModel):
38
- height: int
39
- width: int
40
-
41
-
42
- class VisionArgs(BaseModel):
43
- image_size: Size
44
- patch_size: Size
45
-
46
- # parameters for the encoder transformer
47
- dim: int
48
- n_layers: int
49
- n_heads: int
50
- mlp_ratio: float
51
- output_dim: int
52
-
53
- pixel_shuffle_ratio: float
54
-
55
-
56
- class ModelArgs(BaseModel):
57
- dim: int = -1
58
- n_layers: int = -1
59
- n_heads: int = -1
60
- n_kv_heads: int | None = None
61
- head_dim: int | None = None
62
-
63
- vocab_size: int = -1
64
- multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2
65
- ffn_dim_multiplier: float | None = None
66
- ffn_exp: float | None = None
67
- norm_eps: float = 1e-5
68
-
69
- attention_chunk_size: int | None = None
70
- rope_theta: float = 500000
71
- use_scaled_rope: bool = False
72
- rope_scaling_factor: float | None = None
73
- rope_high_freq_factor: float | None = None
74
-
75
- nope_layer_interval: int | None = None # No position encoding in every n layers
76
- use_qk_norm: bool = False
77
- # Set to True to enable inference-time temperature tuning (useful for very long context)
78
- attn_temperature_tuning: bool = False
79
- floor_scale: float = 8192.0
80
- attn_scale: float = 0.1
81
-
82
- vision_args: VisionArgs | None = None
83
- moe_args: MoEArgs | None = None
84
- quantization_args: QuantizationArgs | None = None
85
- lora_args: LoRAArgs | None = None
86
-
87
- max_batch_size: int = 32
88
- max_seq_len: int = 2048
89
-
90
- @model_validator(mode="after")
91
- def validate(self) -> "ModelArgs":
92
- assert self.n_kv_heads <= self.n_heads, f"n_kv_heads ({self.n_kv_heads}) must be <= n_heads ({self.n_heads})"
93
- assert self.n_heads % self.n_kv_heads == 0, (
94
- f"n_heads ({self.n_heads}) must be divisible by n_kv_heads ({self.n_kv_heads})"
95
- )
96
- assert self.dim % self.n_heads == 0, f"dim ({self.dim}) must be divisible by n_heads ({self.n_heads})"
97
-
98
- if self.use_scaled_rope:
99
- # NOTE: ideally these values should have come from params.json. However, we have
100
- # shipped the models everywhere. Only Llama-4-Scout uses scaled rope and needs these
101
- # specific values.
102
- if self.rope_scaling_factor is None:
103
- self.rope_scaling_factor = 16
104
- if self.rope_high_freq_factor is None:
105
- self.rope_high_freq_factor = 1
106
-
107
- return self
@@ -1,58 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the terms described in the LICENSE file in
5
- # the root directory of this source tree.
6
-
7
- # Copyright (c) Meta Platforms, Inc. and affiliates.
8
- # All rights reserved.
9
- #
10
- # This source code is licensed under the terms described in the LICENSE file in
11
- # top-level folder for each specific model found within the models/ directory at
12
- # the top-level of this source tree.
13
-
14
- from typing import Any
15
-
16
- from fairscale.nn.model_parallel.layers import ColumnParallelLinear, RowParallelLinear
17
- from fairscale.nn.model_parallel.mappings import reduce_from_model_parallel_region
18
- from torch import nn
19
- from torch.nn import functional as F
20
-
21
-
22
- class FeedForward(nn.Module):
23
- def __init__(
24
- self,
25
- dim: int,
26
- hidden_dim: int,
27
- do_reduce: bool = True,
28
- ):
29
- super().__init__()
30
- self.do_reduce = do_reduce
31
-
32
- self.w1 = ColumnParallelLinear(dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x)
33
- self.w2 = RowParallelLinear(hidden_dim, dim, bias=False, input_is_parallel=True, init_method=lambda x: x)
34
- self.w3 = ColumnParallelLinear(dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x)
35
- self._register_load_state_dict_pre_hook(self.load_hook)
36
-
37
- def load_hook(
38
- self,
39
- state_dict: dict[str, Any],
40
- prefix: str,
41
- local_metadata: dict[str, Any],
42
- strict: bool,
43
- missing_keys: list[str],
44
- unexpected_keys: list[str],
45
- error_msgs: list[str],
46
- ) -> None:
47
- if prefix + "mlp.fc1_weight" in state_dict:
48
- w1, w3 = state_dict.pop(prefix + "mlp.fc1_weight").chunk(2, dim=0)
49
- state_dict[prefix + "w1.weight"] = w1
50
- state_dict[prefix + "w3.weight"] = w3
51
- state_dict[prefix + "w2.weight"] = state_dict.pop(prefix + "mlp.fc2_weight")
52
-
53
- def forward(self, x):
54
- x = F.silu(F.linear(x, self.w1.weight)) * F.linear(x, self.w3.weight)
55
- out = F.linear(x, self.w2.weight)
56
- if self.do_reduce:
57
- return reduce_from_model_parallel_region(out)
58
- return out