llama-stack 0.4.3__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. llama_stack/distributions/dell/doc_template.md +209 -0
  2. llama_stack/distributions/meta-reference-gpu/doc_template.md +119 -0
  3. llama_stack/distributions/nvidia/doc_template.md +170 -0
  4. llama_stack/distributions/oci/doc_template.md +140 -0
  5. llama_stack/models/llama/llama3/dog.jpg +0 -0
  6. llama_stack/models/llama/llama3/pasta.jpeg +0 -0
  7. llama_stack/models/llama/resources/dog.jpg +0 -0
  8. llama_stack/models/llama/resources/pasta.jpeg +0 -0
  9. llama_stack/models/llama/resources/small_dog.jpg +0 -0
  10. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +136 -11
  11. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.h +9 -0
  12. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.swift +189 -0
  13. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/Parsing.swift +238 -0
  14. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/PromptTemplate.swift +12 -0
  15. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/SystemPrompts.swift +89 -0
  16. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.pbxproj +550 -0
  17. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
  18. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
  19. llama_stack/providers/remote/datasetio/nvidia/README.md +74 -0
  20. llama_stack/providers/remote/eval/nvidia/README.md +134 -0
  21. llama_stack/providers/remote/files/s3/README.md +266 -0
  22. llama_stack/providers/remote/inference/nvidia/NVIDIA.md +203 -0
  23. llama_stack/providers/remote/post_training/nvidia/README.md +151 -0
  24. llama_stack/providers/remote/safety/nvidia/README.md +78 -0
  25. llama_stack/providers/utils/responses/responses_store.py +34 -0
  26. {llama_stack-0.4.3.dist-info → llama_stack-0.4.4.dist-info}/METADATA +2 -2
  27. {llama_stack-0.4.3.dist-info → llama_stack-0.4.4.dist-info}/RECORD +31 -142
  28. llama_stack-0.4.4.dist-info/top_level.txt +1 -0
  29. llama_stack-0.4.3.dist-info/top_level.txt +0 -2
  30. llama_stack_api/__init__.py +0 -945
  31. llama_stack_api/admin/__init__.py +0 -45
  32. llama_stack_api/admin/api.py +0 -72
  33. llama_stack_api/admin/fastapi_routes.py +0 -117
  34. llama_stack_api/admin/models.py +0 -113
  35. llama_stack_api/agents.py +0 -173
  36. llama_stack_api/batches/__init__.py +0 -40
  37. llama_stack_api/batches/api.py +0 -53
  38. llama_stack_api/batches/fastapi_routes.py +0 -113
  39. llama_stack_api/batches/models.py +0 -78
  40. llama_stack_api/benchmarks/__init__.py +0 -43
  41. llama_stack_api/benchmarks/api.py +0 -39
  42. llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  43. llama_stack_api/benchmarks/models.py +0 -109
  44. llama_stack_api/common/__init__.py +0 -5
  45. llama_stack_api/common/content_types.py +0 -101
  46. llama_stack_api/common/errors.py +0 -95
  47. llama_stack_api/common/job_types.py +0 -38
  48. llama_stack_api/common/responses.py +0 -77
  49. llama_stack_api/common/training_types.py +0 -47
  50. llama_stack_api/common/type_system.py +0 -146
  51. llama_stack_api/connectors.py +0 -146
  52. llama_stack_api/conversations.py +0 -270
  53. llama_stack_api/datasetio.py +0 -55
  54. llama_stack_api/datasets/__init__.py +0 -61
  55. llama_stack_api/datasets/api.py +0 -35
  56. llama_stack_api/datasets/fastapi_routes.py +0 -104
  57. llama_stack_api/datasets/models.py +0 -152
  58. llama_stack_api/datatypes.py +0 -373
  59. llama_stack_api/eval.py +0 -137
  60. llama_stack_api/file_processors/__init__.py +0 -27
  61. llama_stack_api/file_processors/api.py +0 -64
  62. llama_stack_api/file_processors/fastapi_routes.py +0 -78
  63. llama_stack_api/file_processors/models.py +0 -42
  64. llama_stack_api/files/__init__.py +0 -35
  65. llama_stack_api/files/api.py +0 -51
  66. llama_stack_api/files/fastapi_routes.py +0 -124
  67. llama_stack_api/files/models.py +0 -107
  68. llama_stack_api/inference.py +0 -1169
  69. llama_stack_api/inspect_api/__init__.py +0 -37
  70. llama_stack_api/inspect_api/api.py +0 -25
  71. llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  72. llama_stack_api/inspect_api/models.py +0 -28
  73. llama_stack_api/internal/__init__.py +0 -9
  74. llama_stack_api/internal/kvstore.py +0 -28
  75. llama_stack_api/internal/sqlstore.py +0 -81
  76. llama_stack_api/llama_stack_api/__init__.py +0 -945
  77. llama_stack_api/llama_stack_api/admin/__init__.py +0 -45
  78. llama_stack_api/llama_stack_api/admin/api.py +0 -72
  79. llama_stack_api/llama_stack_api/admin/fastapi_routes.py +0 -117
  80. llama_stack_api/llama_stack_api/admin/models.py +0 -113
  81. llama_stack_api/llama_stack_api/agents.py +0 -173
  82. llama_stack_api/llama_stack_api/batches/__init__.py +0 -40
  83. llama_stack_api/llama_stack_api/batches/api.py +0 -53
  84. llama_stack_api/llama_stack_api/batches/fastapi_routes.py +0 -113
  85. llama_stack_api/llama_stack_api/batches/models.py +0 -78
  86. llama_stack_api/llama_stack_api/benchmarks/__init__.py +0 -43
  87. llama_stack_api/llama_stack_api/benchmarks/api.py +0 -39
  88. llama_stack_api/llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  89. llama_stack_api/llama_stack_api/benchmarks/models.py +0 -109
  90. llama_stack_api/llama_stack_api/common/__init__.py +0 -5
  91. llama_stack_api/llama_stack_api/common/content_types.py +0 -101
  92. llama_stack_api/llama_stack_api/common/errors.py +0 -95
  93. llama_stack_api/llama_stack_api/common/job_types.py +0 -38
  94. llama_stack_api/llama_stack_api/common/responses.py +0 -77
  95. llama_stack_api/llama_stack_api/common/training_types.py +0 -47
  96. llama_stack_api/llama_stack_api/common/type_system.py +0 -146
  97. llama_stack_api/llama_stack_api/connectors.py +0 -146
  98. llama_stack_api/llama_stack_api/conversations.py +0 -270
  99. llama_stack_api/llama_stack_api/datasetio.py +0 -55
  100. llama_stack_api/llama_stack_api/datasets/__init__.py +0 -61
  101. llama_stack_api/llama_stack_api/datasets/api.py +0 -35
  102. llama_stack_api/llama_stack_api/datasets/fastapi_routes.py +0 -104
  103. llama_stack_api/llama_stack_api/datasets/models.py +0 -152
  104. llama_stack_api/llama_stack_api/datatypes.py +0 -373
  105. llama_stack_api/llama_stack_api/eval.py +0 -137
  106. llama_stack_api/llama_stack_api/file_processors/__init__.py +0 -27
  107. llama_stack_api/llama_stack_api/file_processors/api.py +0 -64
  108. llama_stack_api/llama_stack_api/file_processors/fastapi_routes.py +0 -78
  109. llama_stack_api/llama_stack_api/file_processors/models.py +0 -42
  110. llama_stack_api/llama_stack_api/files/__init__.py +0 -35
  111. llama_stack_api/llama_stack_api/files/api.py +0 -51
  112. llama_stack_api/llama_stack_api/files/fastapi_routes.py +0 -124
  113. llama_stack_api/llama_stack_api/files/models.py +0 -107
  114. llama_stack_api/llama_stack_api/inference.py +0 -1169
  115. llama_stack_api/llama_stack_api/inspect_api/__init__.py +0 -37
  116. llama_stack_api/llama_stack_api/inspect_api/api.py +0 -25
  117. llama_stack_api/llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  118. llama_stack_api/llama_stack_api/inspect_api/models.py +0 -28
  119. llama_stack_api/llama_stack_api/internal/__init__.py +0 -9
  120. llama_stack_api/llama_stack_api/internal/kvstore.py +0 -28
  121. llama_stack_api/llama_stack_api/internal/sqlstore.py +0 -81
  122. llama_stack_api/llama_stack_api/models.py +0 -171
  123. llama_stack_api/llama_stack_api/openai_responses.py +0 -1468
  124. llama_stack_api/llama_stack_api/post_training.py +0 -370
  125. llama_stack_api/llama_stack_api/prompts.py +0 -203
  126. llama_stack_api/llama_stack_api/providers/__init__.py +0 -33
  127. llama_stack_api/llama_stack_api/providers/api.py +0 -16
  128. llama_stack_api/llama_stack_api/providers/fastapi_routes.py +0 -57
  129. llama_stack_api/llama_stack_api/providers/models.py +0 -24
  130. llama_stack_api/llama_stack_api/py.typed +0 -0
  131. llama_stack_api/llama_stack_api/rag_tool.py +0 -168
  132. llama_stack_api/llama_stack_api/resource.py +0 -37
  133. llama_stack_api/llama_stack_api/router_utils.py +0 -160
  134. llama_stack_api/llama_stack_api/safety.py +0 -132
  135. llama_stack_api/llama_stack_api/schema_utils.py +0 -208
  136. llama_stack_api/llama_stack_api/scoring.py +0 -93
  137. llama_stack_api/llama_stack_api/scoring_functions.py +0 -211
  138. llama_stack_api/llama_stack_api/shields.py +0 -93
  139. llama_stack_api/llama_stack_api/tools.py +0 -226
  140. llama_stack_api/llama_stack_api/vector_io.py +0 -941
  141. llama_stack_api/llama_stack_api/vector_stores.py +0 -53
  142. llama_stack_api/llama_stack_api/version.py +0 -9
  143. llama_stack_api/models.py +0 -171
  144. llama_stack_api/openai_responses.py +0 -1468
  145. llama_stack_api/post_training.py +0 -370
  146. llama_stack_api/prompts.py +0 -203
  147. llama_stack_api/providers/__init__.py +0 -33
  148. llama_stack_api/providers/api.py +0 -16
  149. llama_stack_api/providers/fastapi_routes.py +0 -57
  150. llama_stack_api/providers/models.py +0 -24
  151. llama_stack_api/py.typed +0 -0
  152. llama_stack_api/rag_tool.py +0 -168
  153. llama_stack_api/resource.py +0 -37
  154. llama_stack_api/router_utils.py +0 -160
  155. llama_stack_api/safety.py +0 -132
  156. llama_stack_api/schema_utils.py +0 -208
  157. llama_stack_api/scoring.py +0 -93
  158. llama_stack_api/scoring_functions.py +0 -211
  159. llama_stack_api/shields.py +0 -93
  160. llama_stack_api/tools.py +0 -226
  161. llama_stack_api/vector_io.py +0 -941
  162. llama_stack_api/vector_stores.py +0 -53
  163. llama_stack_api/version.py +0 -9
  164. {llama_stack-0.4.3.dist-info → llama_stack-0.4.4.dist-info}/WHEEL +0 -0
  165. {llama_stack-0.4.3.dist-info → llama_stack-0.4.4.dist-info}/entry_points.txt +0 -0
  166. {llama_stack-0.4.3.dist-info → llama_stack-0.4.4.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,209 @@
+ ---
+ orphan: true
+ ---
+
+ # Dell Distribution of Llama Stack
+
+ ```{toctree}
+ :maxdepth: 2
+ :hidden:
+
+ self
+ ```
+
+ The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+ {{ providers_table }}
+
+ You can use this distribution if you have GPUs and want to run an independent TGI or Dell Enterprise Hub container for running inference.
+
+ {% if run_config_env_vars %}
+ ### Environment Variables
+
+ The following environment variables can be configured:
+
+ {% for var, (default_value, description) in run_config_env_vars.items() %}
+ - `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+ {% endfor %}
+ {% endif %}
+
+
+ ## Setting Up the Inference Server Using Dell Enterprise Hub's Custom TGI Container
+
+ NOTE: This is a placeholder setup that runs inference with TGI. It will be updated to use [Dell Enterprise Hub's containers](https://dell.huggingface.co/authenticated/models) once verified.
+
+ ```bash
+ export INFERENCE_PORT=8181
+ export DEH_URL=http://0.0.0.0:$INFERENCE_PORT
+ export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
+ export CHROMADB_HOST=localhost
+ export CHROMADB_PORT=6601
+ export CHROMA_URL=http://$CHROMADB_HOST:$CHROMADB_PORT
+ export CUDA_VISIBLE_DEVICES=0
+ export LLAMA_STACK_PORT=8321
+
+ docker run --rm -it \
+ --pull always \
+ --network host \
+ -v $HOME/.cache/huggingface:/data \
+ -e HF_TOKEN=$HF_TOKEN \
+ -p $INFERENCE_PORT:$INFERENCE_PORT \
+ --gpus $CUDA_VISIBLE_DEVICES \
+ ghcr.io/huggingface/text-generation-inference \
+ --dtype bfloat16 \
+ --usage-stats off \
+ --sharded false \
+ --cuda-memory-fraction 0.7 \
+ --model-id $INFERENCE_MODEL \
+ --port $INFERENCE_PORT --hostname 0.0.0.0
+ ```
+
+ If you are using the Llama Stack Safety / Shield APIs, you will also need to run a second TGI instance with a corresponding safety model such as `meta-llama/Llama-Guard-3-1B`:
+
+ ```bash
+ export SAFETY_INFERENCE_PORT=8282
+ export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
+ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+ export CUDA_VISIBLE_DEVICES=1
+
+ docker run --rm -it \
+ --pull always \
+ --network host \
+ -v $HOME/.cache/huggingface:/data \
+ -e HF_TOKEN=$HF_TOKEN \
+ -p $SAFETY_INFERENCE_PORT:$SAFETY_INFERENCE_PORT \
+ --gpus $CUDA_VISIBLE_DEVICES \
+ ghcr.io/huggingface/text-generation-inference \
+ --dtype bfloat16 \
+ --usage-stats off \
+ --sharded false \
+ --cuda-memory-fraction 0.7 \
+ --model-id $SAFETY_MODEL \
+ --hostname 0.0.0.0 \
+ --port $SAFETY_INFERENCE_PORT
+ ```
+
+ ## Vector Database: ChromaDB
+
+ The Dell distribution relies on ChromaDB for vector storage. You can start a ChromaDB container easily using Podman (as shown below) or Docker.
+ ```bash
+ # This is where the indices are persisted
+ mkdir -p $HOME/chromadb
+
+ podman run --rm -it \
+ --network host \
+ --name chromadb \
+ -v $HOME/chromadb:/chroma/chroma \
+ -e IS_PERSISTENT=TRUE \
+ chromadb/chroma:latest \
+ --port $CHROMADB_PORT \
+ --host $CHROMADB_HOST
+ ```
+
+ ## Running Llama Stack
+
+ Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (building from source) or via Docker, which has a pre-built image.
+
+ ### Via Docker
+
+ This method allows you to get started quickly without having to build the distribution code.
+
+ ```bash
+ # NOTE: mount the llama-stack source directory only if you are testing local changes; otherwise drop that -v line.
+ # Use localhost/distribution-dell:dev instead of the published image if building / testing locally.
+ docker run -it \
+ --pull always \
+ --network host \
+ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+ -v $HOME/.llama:/root/.llama \
+ -v $HOME/git/llama-stack:/app/llama-stack-source \
+ -e INFERENCE_MODEL=$INFERENCE_MODEL \
+ -e DEH_URL=$DEH_URL \
+ -e CHROMA_URL=$CHROMA_URL \
+ llamastack/distribution-{{ name }} \
+ --port $LLAMA_STACK_PORT
+ ```
+
+ If you are using Llama Stack Safety / Shield APIs, use:
+
+ ```bash
+ # You need a local checkout of llama-stack to run this, get it using
+ # git clone https://github.com/meta-llama/llama-stack.git
+ cd /path/to/llama-stack
+
+ export SAFETY_INFERENCE_PORT=8282
+ export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
+ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+
+ docker run \
+ -it \
+ --pull always \
+ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+ -v $HOME/.llama:/root/.llama \
+ -v ./llama_stack/distributions/tgi/run-with-safety.yaml:/root/my-config.yaml \
+ -e INFERENCE_MODEL=$INFERENCE_MODEL \
+ -e DEH_URL=$DEH_URL \
+ -e SAFETY_MODEL=$SAFETY_MODEL \
+ -e DEH_SAFETY_URL=$DEH_SAFETY_URL \
+ -e CHROMA_URL=$CHROMA_URL \
+ llamastack/distribution-{{ name }} \
+ --config /root/my-config.yaml \
+ --port $LLAMA_STACK_PORT
+ ```
+
+ ### Via Docker with Custom Run Configuration
+
+ You can also run the Docker container with a custom run configuration file by mounting it into the container:
+
+ ```bash
+ # Set the path to your custom config.yaml file
+ CUSTOM_RUN_CONFIG=/path/to/your/custom-config.yaml
+
+ docker run -it \
+ --pull always \
+ --network host \
+ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+ -v $HOME/.llama:/root/.llama \
+ -v $CUSTOM_RUN_CONFIG:/app/custom-config.yaml \
+ -e RUN_CONFIG_PATH=/app/custom-config.yaml \
+ -e INFERENCE_MODEL=$INFERENCE_MODEL \
+ -e DEH_URL=$DEH_URL \
+ -e CHROMA_URL=$CHROMA_URL \
+ llamastack/distribution-{{ name }} \
+ --port $LLAMA_STACK_PORT
+ ```
+
+ **Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
+
+ {% if run_configs %}
+ Available run configurations for this distribution:
+ {% for config in run_configs %}
+ - `{{ config }}`
+ {% endfor %}
+ {% endif %}
+
+ ### Via Conda
+
+ Make sure you have run `pip install llama-stack` and that the Llama Stack CLI is available.
+
+ ```bash
+ llama stack list-deps {{ name }} | xargs -L1 pip install
+ INFERENCE_MODEL=$INFERENCE_MODEL \
+ DEH_URL=$DEH_URL \
+ CHROMA_URL=$CHROMA_URL \
+ llama stack run {{ name }} \
+ --port $LLAMA_STACK_PORT
+ ```
+
+ If you are using Llama Stack Safety / Shield APIs, use:
+
+ ```bash
+ INFERENCE_MODEL=$INFERENCE_MODEL \
+ DEH_URL=$DEH_URL \
+ SAFETY_MODEL=$SAFETY_MODEL \
+ DEH_SAFETY_URL=$DEH_SAFETY_URL \
+ CHROMA_URL=$CHROMA_URL \
+ llama stack run ./run-with-safety.yaml \
+ --port $LLAMA_STACK_PORT
+ ```
@@ -0,0 +1,119 @@
+ ---
+ orphan: true
+ ---
+ # Meta Reference GPU Distribution
+
+ ```{toctree}
+ :maxdepth: 2
+ :hidden:
+
+ self
+ ```
+
+ The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
+
+ {{ providers_table }}
+
+ Note that you need access to NVIDIA GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs.
+
+ {% if run_config_env_vars %}
+ ### Environment Variables
+
+ The following environment variables can be configured:
+
+ {% for var, (default_value, description) in run_config_env_vars.items() %}
+ - `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+ {% endfor %}
+ {% endif %}
+
+
+ ## Prerequisite: Downloading Models
+
+ Please check that you have the Llama model checkpoints downloaded in `~/.llama` before proceeding. See the [installation guide](../../references/llama_cli_reference/download_models.md) for how to download the models using the Hugging Face CLI.
+
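+ As a rough sketch (the model ID and target directory below are illustrative; follow the installation guide for the exact layout expected under `~/.llama`), downloading a checkpoint with the Hugging Face CLI looks roughly like this:
+
+ ```bash
+ # Sketch only: the model ID and --local-dir below are assumptions; adjust to your setup.
+ pip install -U "huggingface_hub[cli]"
+ huggingface-cli login   # required for gated meta-llama checkpoints
+ huggingface-cli download meta-llama/Llama-3.2-3B-Instruct \
+ --local-dir ~/.llama/checkpoints/Llama3.2-3B-Instruct
+ ```
+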
+ ## Running the Distribution
+
+ You can do this via venv or via Docker, which has a pre-built image.
+
+ ### Via Docker
+
+ This method allows you to get started quickly without having to build the distribution code.
+
+ ```bash
+ LLAMA_STACK_PORT=8321
+ docker run \
+ -it \
+ --pull always \
+ --gpus all \
+ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+ -v ~/.llama:/root/.llama \
+ -e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+ llamastack/distribution-{{ name }} \
+ --port $LLAMA_STACK_PORT
+ ```
+
+ If you are using Llama Stack Safety / Shield APIs, use:
+
+ ```bash
+ docker run \
+ -it \
+ --pull always \
+ --gpus all \
+ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+ -v ~/.llama:/root/.llama \
+ -e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+ -e SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
+ llamastack/distribution-{{ name }} \
+ --port $LLAMA_STACK_PORT
+ ```
+
+
+ ### Via Docker with Custom Run Configuration
+
+ You can also run the Docker container with a custom run configuration file by mounting it into the container:
+
+ ```bash
+ # Set the path to your custom config.yaml file
+ CUSTOM_RUN_CONFIG=/path/to/your/custom-config.yaml
+ LLAMA_STACK_PORT=8321
+
+ docker run \
+ -it \
+ --pull always \
+ --gpus all \
+ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+ -v ~/.llama:/root/.llama \
+ -v $CUSTOM_RUN_CONFIG:/app/custom-config.yaml \
+ -e RUN_CONFIG_PATH=/app/custom-config.yaml \
+ llamastack/distribution-{{ name }} \
+ --port $LLAMA_STACK_PORT
+ ```
+
+ **Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
+
+ {% if run_configs %}
+ Available run configurations for this distribution:
+ {% for config in run_configs %}
+ - `{{ config }}`
+ {% endfor %}
+ {% endif %}
+
+ ### Via venv
+
+ Make sure you have the Llama Stack CLI available.
+
+ ```bash
+ llama stack list-deps meta-reference-gpu | xargs -L1 uv pip install
+ INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+ llama stack run distributions/{{ name }}/config.yaml \
+ --port 8321
+ ```
+
+ If you are using Llama Stack Safety / Shield APIs, use:
+
+ ```bash
+ INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+ SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
+ llama stack run distributions/{{ name }}/run-with-safety.yaml \
+ --port 8321
+ ```
@@ -0,0 +1,170 @@
+ ---
+ orphan: true
+ ---
+ # NVIDIA Distribution
+
+ The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+ {{ providers_table }}
+
+ {% if run_config_env_vars %}
+ ### Environment Variables
+
+ The following environment variables can be configured:
+
+ {% for var, (default_value, description) in run_config_env_vars.items() %}
+ - `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+ {% endfor %}
+ {% endif %}
+
+ {% if default_models %}
+ ### Models
+
+ The following models are available by default:
+
+ {% for model in default_models %}
+ - `{{ model.model_id }} {{ model.doc_string }}`
+ {% endfor %}
+ {% endif %}
+
+
+ ## Prerequisites
+ ### NVIDIA API Keys
+
+ Make sure you have access to an NVIDIA API key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.
+
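+ As a quick sanity check, you can verify the key against the hosted endpoint. This is only a sketch: it assumes the hosted service at `https://integrate.api.nvidia.com` and that the standard OpenAI-compatible `/v1/models` path is available; self-hosted NIMs expose their own URLs.
+
+ ```bash
+ export NVIDIA_API_KEY="nvapi-..."   # replace with your key
+ # Listing models should return JSON if the key is valid (path assumed from the OpenAI-compatible API).
+ curl -s https://integrate.api.nvidia.com/v1/models \
+ -H "Authorization: Bearer $NVIDIA_API_KEY" | head
+ ```
+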
+ ### Deploy NeMo Microservices Platform
+ The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please refer to the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/latest/about/index.html) for platform prerequisites and instructions to install and deploy the platform.
+
+ ## Supported Services
+ Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
+
+ ### Inference: NVIDIA NIM
+ NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
+ 1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (requires an API key)
+ 2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.
+
+ The deployed platform includes the NIM Proxy microservice, which provides access to your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to point to your NVIDIA NIM Proxy deployment.
+
+ ### Datasetio API: NeMo Data Store
+ The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposes APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use that client to interact with the Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
+
+ See the [NVIDIA Datasetio docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/datasetio/nvidia/README.md) for supported features and example usage.
+
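+ Because the Data Store speaks the Hugging Face Hub protocol, the Hugging Face CLI can usually be pointed at it via `HF_ENDPOINT`. The sketch below rests on assumptions: the URL, any required API prefix, and the dataset name are illustrative and depend on your deployment.
+
+ ```bash
+ # Sketch: point the Hugging Face client at the Data Store (URL is a placeholder; check your deployment).
+ export NVIDIA_DATASETS_URL="http://data-store.test"
+ export HF_ENDPOINT="$NVIDIA_DATASETS_URL"
+ # Upload a local folder as a dataset repo (names are illustrative).
+ huggingface-cli upload my-namespace/my-dataset ./data --repo-type dataset
+ ```
+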
+ ### Eval API: NeMo Evaluator
+ The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.
+
+ See the [NVIDIA Eval docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/eval/nvidia/README.md) for supported features and example usage.
+
+ ### Post-Training API: NeMo Customizer
+ The NeMo Customizer microservice supports fine-tuning models. You can reference [this list of supported models](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/post_training/nvidia/models.py) that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
+
+ See the [NVIDIA Post-Training docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/post_training/nvidia/README.md) for supported features and example usage.
+
+ ### Safety API: NeMo Guardrails
+ The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.
+
+ See the [NVIDIA Safety docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/safety/nvidia/README.md) for supported features and example usage.
+
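+ Putting the above together, a self-hosted setup typically exports one URL per service before starting the stack. This is only a sketch: the values below are placeholders modeled on the `http://nemo.test` example used later in this guide; substitute the addresses of your own deployment.
+
+ ```bash
+ # Sketch: endpoint variables for a self-hosted NeMo deployment (all URLs are placeholders).
+ export NVIDIA_BASE_URL="http://nim.test"            # NIM Proxy (inference)
+ export NVIDIA_DATASETS_URL="http://data-store.test" # NeMo Data Store
+ export NVIDIA_EVALUATOR_URL="http://nemo.test"      # NeMo Evaluator
+ export NVIDIA_CUSTOMIZER_URL="http://nemo.test"     # NeMo Customizer
+ export GUARDRAILS_SERVICE_URL="http://nemo.test"    # NeMo Guardrails
+ ```
+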
+ ## Deploying models
+ In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.
+
+ Note: For improved inference speeds, use NIM with the `fast_outlines` guided decoding backend (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
+ ```sh
+ # URL to NeMo NIM Proxy service
+ export NEMO_URL="http://nemo.test"
+
+ curl --location "$NEMO_URL/v1/deployment/model-deployments" \
+ -H 'accept: application/json' \
+ -H 'Content-Type: application/json' \
+ -d '{
+ "name": "llama-3.2-1b-instruct",
+ "namespace": "meta",
+ "config": {
+ "model": "meta/llama-3.2-1b-instruct",
+ "nim_deployment": {
+ "image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
+ "image_tag": "1.8.3",
+ "pvc_size": "25Gi",
+ "gpu": 1,
+ "additional_envs": {
+ "NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
+ }
+ }
+ }
+ }'
+ ```
+ This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/latest/get-started/tutorials/deploy-nims.html) for more information on how to deploy a NIM and verify it's available for inference.
+
+ You can also remove a deployed NIM to free up GPU resources, if needed.
+ ```sh
+ export NEMO_URL="http://nemo.test"
+
+ curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
+ ```
+
+ ## Running Llama Stack with NVIDIA
+
+ You can do this via venv (building from source) or via Docker, which has a pre-built image.
+
+ ### Via Docker
+
+ This method allows you to get started quickly without having to build the distribution code.
+
+ ```bash
+ LLAMA_STACK_PORT=8321
+ docker run \
+ -it \
+ --pull always \
+ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+ -v ~/.llama:/root/.llama \
+ -e NVIDIA_API_KEY=$NVIDIA_API_KEY \
+ llamastack/distribution-{{ name }} \
+ --port $LLAMA_STACK_PORT
+ ```
+
+ ### Via Docker with Custom Run Configuration
+
+ You can also run the Docker container with a custom run configuration file by mounting it into the container:
+
+ ```bash
+ # Set the path to your custom config.yaml file
+ CUSTOM_RUN_CONFIG=/path/to/your/custom-config.yaml
+ LLAMA_STACK_PORT=8321
+
+ docker run \
+ -it \
+ --pull always \
+ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+ -v ~/.llama:/root/.llama \
+ -v $CUSTOM_RUN_CONFIG:/app/custom-config.yaml \
+ -e RUN_CONFIG_PATH=/app/custom-config.yaml \
+ -e NVIDIA_API_KEY=$NVIDIA_API_KEY \
+ llamastack/distribution-{{ name }} \
+ --port $LLAMA_STACK_PORT
+ ```
+
+ **Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
+
+ {% if run_configs %}
+ Available run configurations for this distribution:
+ {% for config in run_configs %}
+ - `{{ config }}`
+ {% endfor %}
+ {% endif %}
+
+ ### Via venv
+
+ If you've set up your local development environment, you can also install the distribution dependencies using your local virtual environment.
+
+ ```bash
+ INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
+ llama stack list-deps nvidia | xargs -L1 uv pip install
+ NVIDIA_API_KEY=$NVIDIA_API_KEY \
+ INFERENCE_MODEL=$INFERENCE_MODEL \
+ llama stack run ./config.yaml \
+ --port 8321
+ ```
+
+ ## Example Notebooks
+ For examples of how to use the NVIDIA Distribution to run inference, fine-tune, evaluate, and run safety checks on your LLMs, you can reference the example notebooks in [docs/notebooks/nvidia](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks/nvidia).
@@ -0,0 +1,140 @@
+ ---
+ orphan: true
+ ---
+ # OCI Distribution
+
+ The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+ {{ providers_table }}
+
+ {% if run_config_env_vars %}
+ ### Environment Variables
+
+ The following environment variables can be configured:
+
+ {% for var, (default_value, description) in run_config_env_vars.items() %}
+ - `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+ {% endfor %}
+ {% endif %}
+
+ {% if default_models %}
+ ### Models
+
+ The following models are available by default:
+
+ {% for model in default_models %}
+ - `{{ model.model_id }} {{ model.doc_string }}`
+ {% endfor %}
+ {% endif %}
+
+ ## Prerequisites
+ ### Oracle Cloud Infrastructure Setup
+
+ Before using the OCI Generative AI distribution, ensure you have:
+
+ 1. **Oracle Cloud Infrastructure Account**: Sign up at [Oracle Cloud Infrastructure](https://cloud.oracle.com/)
+ 2. **Generative AI Service Access**: Enable the Generative AI service in your OCI tenancy
+ 3. **Compartment**: Create or identify a compartment where you'll deploy Generative AI models
+ 4. **Authentication**: Configure authentication using either:
+ - **Instance Principal** (recommended for cloud-hosted deployments)
+ - **API Key** (for on-premises or development environments)
+
+ ### Authentication Methods
+
+ #### Instance Principal Authentication (Recommended)
+ Instance Principal authentication allows OCI resources to authenticate using the identity of the compute instance they're running on. This is the most secure method for production deployments.
+
+ Requirements:
+ - Instance must be running in an Oracle Cloud Infrastructure compartment
+ - Instance must have appropriate IAM policies to access Generative AI services
+
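+ As a hedged sketch, an instance-principal setup usually pairs a dynamic group matching your compute instances with a policy statement like the one below. The dynamic-group and compartment names are placeholders, and `generative-ai-family` is used here as the aggregate Generative AI resource type; consult the OCI policy reference for the exact statement your tenancy needs.
+
+ ```
+ Allow dynamic-group <dynamic_group_name> to use generative-ai-family in compartment <compartment_name>
+ ```
+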
+ #### API Key Authentication
+ For development or on-premises deployments, follow [this doc](https://docs.oracle.com/en-us/iaas/Content/API/Concepts/apisigningkey.htm) to learn how to create your API signing key for your config file.
+
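+ Once the signing key exists, the client reads a config file (by default `~/.oci/config`). A minimal sketch follows; every OCID, fingerprint, and path below is a placeholder you must replace with your own values.
+
+ ```bash
+ # Sketch: writes a minimal API-key profile; all values are placeholders.
+ mkdir -p ~/.oci
+ cat > ~/.oci/config <<'EOF'
+ [DEFAULT]
+ user=ocid1.user.oc1..<your-user-ocid>
+ fingerprint=<your-api-key-fingerprint>
+ key_file=~/.oci/oci_api_key.pem
+ tenancy=ocid1.tenancy.oc1..<your-tenancy-ocid>
+ region=us-chicago-1
+ EOF
+ chmod 600 ~/.oci/config
+ ```
+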
+ ### Required IAM Policies
+
+ Ensure your OCI user or instance has the following policy statements:
+
+ ```
+ Allow group <group_name> to use generative-ai-inference-endpoints in compartment <compartment_name>
+ Allow group <group_name> to manage generative-ai-inference-endpoints in compartment <compartment_name>
+ ```
+
+ ## Supported Services
+
+ ### Inference: OCI Generative AI
+ Oracle Cloud Infrastructure Generative AI provides access to high-performance AI models through OCI's Platform-as-a-Service offering. The service supports:
+
+ - **Chat Completions**: Conversational AI with context awareness
+ - **Text Generation**: Complete prompts and generate text content
+
+ #### Available Models
+ OCI Generative AI provides access to models from Meta, Cohere, OpenAI, Grok, and more.
+
+ ### Safety: Llama Guard
+ For content safety and moderation, this distribution uses Meta's Llama Guard model through the OCI Generative AI service to provide:
+ - Content filtering and moderation
+ - Policy compliance checking
+ - Harmful content detection
+
+ ### Vector Storage: Multiple Options
+ The distribution supports several vector storage providers:
+ - **FAISS**: Local in-memory vector search
+ - **ChromaDB**: Distributed vector database
+ - **PGVector**: PostgreSQL with vector extensions
+
+ ### Additional Services
+ - **Dataset I/O**: Local filesystem and Hugging Face integration
+ - **Tool Runtime**: Web search (Brave, Tavily) and RAG capabilities
+ - **Evaluation**: Meta reference evaluation framework
+
+ ## Running Llama Stack with OCI
+
+ You can run the OCI distribution via Docker or a local virtual environment.
+
+ ### Via venv
+
+ If you've set up your local development environment, you can run the distribution directly from your local virtual environment.
+
+ ```bash
+ OCI_AUTH=$OCI_AUTH_TYPE OCI_REGION=$OCI_REGION OCI_COMPARTMENT_OCID=$OCI_COMPARTMENT_OCID llama stack run --port 8321 oci
+ ```
+
+ ### Configuration Examples
+
+ #### Using Instance Principal (Recommended for Production)
+ ```bash
+ export OCI_AUTH_TYPE=instance_principal
+ export OCI_REGION=us-chicago-1
+ export OCI_COMPARTMENT_OCID=ocid1.compartment.oc1..<your-compartment-id>
+ ```
+
+ #### Using API Key Authentication (Development)
+ ```bash
+ export OCI_AUTH_TYPE=config_file
+ export OCI_CONFIG_FILE_PATH=~/.oci/config
+ export OCI_CLI_PROFILE=DEFAULT
+ export OCI_REGION=us-chicago-1
+ export OCI_COMPARTMENT_OCID=ocid1.compartment.oc1..your-compartment-id
+ ```
+
+ ## Regional Endpoints
+
+ OCI Generative AI is available in multiple regions. The service automatically routes to the appropriate regional endpoint based on your configuration. For a full list of regional model availability, visit:
+
+ https://docs.oracle.com/en-us/iaas/Content/generative-ai/overview.htm#regions
+
+ ## Troubleshooting
+
+ ### Common Issues
+
+ 1. **Authentication Errors**: Verify your OCI credentials and IAM policies
+ 2. **Model Not Found**: Ensure the model OCID is correct and the model is available in your region
+ 3. **Permission Denied**: Check compartment permissions and Generative AI service access
+ 4. **Region Unavailable**: Verify the specified region supports Generative AI services
+
+ ### Getting Help
+
+ For additional support:
+ - [OCI Generative AI Documentation](https://docs.oracle.com/en-us/iaas/Content/generative-ai/home.htm)
+ - [Llama Stack Issues](https://github.com/meta-llama/llama-stack/issues)
Binary file