llama-stack 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_stack/cli/stack/_list_deps.py +11 -7
- llama_stack/cli/stack/run.py +3 -25
- llama_stack/core/access_control/datatypes.py +78 -0
- llama_stack/core/configure.py +2 -2
- llama_stack/{distributions/meta-reference-gpu → core/connectors}/__init__.py +3 -1
- llama_stack/core/connectors/connectors.py +162 -0
- llama_stack/core/conversations/conversations.py +61 -58
- llama_stack/core/datatypes.py +54 -8
- llama_stack/core/library_client.py +60 -13
- llama_stack/core/prompts/prompts.py +43 -42
- llama_stack/core/routers/datasets.py +20 -17
- llama_stack/core/routers/eval_scoring.py +143 -53
- llama_stack/core/routers/inference.py +20 -9
- llama_stack/core/routers/safety.py +30 -42
- llama_stack/core/routers/vector_io.py +15 -7
- llama_stack/core/routing_tables/models.py +42 -3
- llama_stack/core/routing_tables/scoring_functions.py +19 -19
- llama_stack/core/routing_tables/shields.py +20 -17
- llama_stack/core/routing_tables/vector_stores.py +8 -5
- llama_stack/core/server/auth.py +192 -17
- llama_stack/core/server/fastapi_router_registry.py +40 -5
- llama_stack/core/server/server.py +24 -5
- llama_stack/core/stack.py +54 -10
- llama_stack/core/storage/datatypes.py +9 -0
- llama_stack/core/store/registry.py +1 -1
- llama_stack/core/utils/exec.py +2 -2
- llama_stack/core/utils/type_inspection.py +16 -2
- llama_stack/distributions/dell/config.yaml +4 -1
- llama_stack/distributions/dell/run-with-safety.yaml +4 -1
- llama_stack/distributions/nvidia/config.yaml +4 -1
- llama_stack/distributions/nvidia/run-with-safety.yaml +4 -1
- llama_stack/distributions/oci/config.yaml +4 -1
- llama_stack/distributions/open-benchmark/config.yaml +9 -1
- llama_stack/distributions/postgres-demo/config.yaml +1 -1
- llama_stack/distributions/starter/build.yaml +62 -0
- llama_stack/distributions/starter/config.yaml +22 -3
- llama_stack/distributions/starter/run-with-postgres-store.yaml +22 -3
- llama_stack/distributions/starter/starter.py +13 -1
- llama_stack/distributions/starter-gpu/build.yaml +62 -0
- llama_stack/distributions/starter-gpu/config.yaml +22 -3
- llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +22 -3
- llama_stack/distributions/template.py +10 -2
- llama_stack/distributions/watsonx/config.yaml +4 -1
- llama_stack/log.py +1 -0
- llama_stack/providers/inline/agents/meta_reference/__init__.py +1 -0
- llama_stack/providers/inline/agents/meta_reference/agents.py +58 -61
- llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +53 -51
- llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +99 -22
- llama_stack/providers/inline/agents/meta_reference/responses/types.py +2 -1
- llama_stack/providers/inline/agents/meta_reference/responses/utils.py +4 -1
- llama_stack/providers/inline/agents/meta_reference/safety.py +2 -2
- llama_stack/providers/inline/batches/reference/batches.py +2 -1
- llama_stack/providers/inline/eval/meta_reference/eval.py +40 -32
- llama_stack/providers/inline/post_training/huggingface/post_training.py +33 -38
- llama_stack/providers/inline/post_training/huggingface/utils.py +2 -5
- llama_stack/providers/inline/post_training/torchtune/common/utils.py +5 -9
- llama_stack/providers/inline/post_training/torchtune/post_training.py +28 -33
- llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +2 -4
- llama_stack/providers/inline/safety/code_scanner/code_scanner.py +12 -15
- llama_stack/providers/inline/safety/llama_guard/llama_guard.py +20 -24
- llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +11 -17
- llama_stack/providers/inline/scoring/basic/scoring.py +13 -17
- llama_stack/providers/inline/scoring/braintrust/braintrust.py +15 -15
- llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +13 -17
- llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +1 -1
- llama_stack/providers/registry/agents.py +1 -0
- llama_stack/providers/registry/inference.py +1 -9
- llama_stack/providers/registry/vector_io.py +136 -16
- llama_stack/providers/remote/eval/nvidia/eval.py +22 -21
- llama_stack/providers/remote/files/s3/config.py +5 -3
- llama_stack/providers/remote/files/s3/files.py +2 -2
- llama_stack/providers/remote/inference/gemini/gemini.py +4 -0
- llama_stack/providers/remote/inference/openai/openai.py +2 -0
- llama_stack/providers/remote/inference/together/together.py +4 -0
- llama_stack/providers/remote/inference/vertexai/config.py +3 -3
- llama_stack/providers/remote/inference/vertexai/vertexai.py +5 -2
- llama_stack/providers/remote/inference/vllm/config.py +37 -18
- llama_stack/providers/remote/inference/vllm/vllm.py +0 -3
- llama_stack/providers/remote/inference/watsonx/watsonx.py +4 -0
- llama_stack/providers/remote/post_training/nvidia/models.py +3 -11
- llama_stack/providers/remote/post_training/nvidia/post_training.py +31 -33
- llama_stack/providers/remote/safety/bedrock/bedrock.py +10 -27
- llama_stack/providers/remote/safety/nvidia/nvidia.py +9 -25
- llama_stack/providers/remote/safety/sambanova/sambanova.py +13 -11
- llama_stack/providers/remote/vector_io/elasticsearch/__init__.py +17 -0
- llama_stack/providers/remote/vector_io/elasticsearch/config.py +32 -0
- llama_stack/providers/remote/vector_io/elasticsearch/elasticsearch.py +463 -0
- llama_stack/providers/remote/vector_io/oci/__init__.py +22 -0
- llama_stack/providers/remote/vector_io/oci/config.py +41 -0
- llama_stack/providers/remote/vector_io/oci/oci26ai.py +595 -0
- llama_stack/providers/remote/vector_io/pgvector/config.py +69 -2
- llama_stack/providers/remote/vector_io/pgvector/pgvector.py +255 -6
- llama_stack/providers/remote/vector_io/qdrant/qdrant.py +62 -38
- llama_stack/providers/utils/bedrock/client.py +3 -3
- llama_stack/providers/utils/bedrock/config.py +7 -7
- llama_stack/providers/utils/inference/__init__.py +0 -25
- llama_stack/providers/utils/inference/embedding_mixin.py +4 -0
- llama_stack/providers/utils/inference/http_client.py +239 -0
- llama_stack/providers/utils/inference/litellm_openai_mixin.py +6 -0
- llama_stack/providers/utils/inference/model_registry.py +148 -2
- llama_stack/providers/utils/inference/openai_compat.py +1 -158
- llama_stack/providers/utils/inference/openai_mixin.py +42 -2
- llama_stack/providers/utils/inference/prompt_adapter.py +0 -209
- llama_stack/providers/utils/memory/openai_vector_store_mixin.py +92 -5
- llama_stack/providers/utils/memory/vector_store.py +46 -19
- llama_stack/providers/utils/responses/responses_store.py +7 -7
- llama_stack/providers/utils/safety.py +114 -0
- llama_stack/providers/utils/tools/mcp.py +44 -3
- llama_stack/testing/api_recorder.py +9 -3
- {llama_stack-0.4.4.dist-info → llama_stack-0.5.0.dist-info}/METADATA +14 -2
- {llama_stack-0.4.4.dist-info → llama_stack-0.5.0.dist-info}/RECORD +115 -148
- llama_stack/distributions/meta-reference-gpu/config.yaml +0 -140
- llama_stack/distributions/meta-reference-gpu/doc_template.md +0 -119
- llama_stack/distributions/meta-reference-gpu/meta_reference.py +0 -163
- llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +0 -155
- llama_stack/models/llama/hadamard_utils.py +0 -88
- llama_stack/models/llama/llama3/args.py +0 -74
- llama_stack/models/llama/llama3/dog.jpg +0 -0
- llama_stack/models/llama/llama3/generation.py +0 -378
- llama_stack/models/llama/llama3/model.py +0 -304
- llama_stack/models/llama/llama3/multimodal/__init__.py +0 -12
- llama_stack/models/llama/llama3/multimodal/encoder_utils.py +0 -180
- llama_stack/models/llama/llama3/multimodal/image_transform.py +0 -409
- llama_stack/models/llama/llama3/multimodal/model.py +0 -1430
- llama_stack/models/llama/llama3/multimodal/utils.py +0 -26
- llama_stack/models/llama/llama3/pasta.jpeg +0 -0
- llama_stack/models/llama/llama3/quantization/__init__.py +0 -5
- llama_stack/models/llama/llama3/quantization/loader.py +0 -316
- llama_stack/models/llama/llama3_1/__init__.py +0 -12
- llama_stack/models/llama/llama3_1/prompt_format.md +0 -358
- llama_stack/models/llama/llama3_1/prompts.py +0 -258
- llama_stack/models/llama/llama3_2/__init__.py +0 -5
- llama_stack/models/llama/llama3_2/prompts_text.py +0 -229
- llama_stack/models/llama/llama3_2/prompts_vision.py +0 -126
- llama_stack/models/llama/llama3_2/text_prompt_format.md +0 -286
- llama_stack/models/llama/llama3_2/vision_prompt_format.md +0 -141
- llama_stack/models/llama/llama3_3/__init__.py +0 -5
- llama_stack/models/llama/llama3_3/prompts.py +0 -259
- llama_stack/models/llama/llama4/args.py +0 -107
- llama_stack/models/llama/llama4/ffn.py +0 -58
- llama_stack/models/llama/llama4/moe.py +0 -214
- llama_stack/models/llama/llama4/preprocess.py +0 -435
- llama_stack/models/llama/llama4/quantization/__init__.py +0 -5
- llama_stack/models/llama/llama4/quantization/loader.py +0 -226
- llama_stack/models/llama/llama4/vision/__init__.py +0 -5
- llama_stack/models/llama/llama4/vision/embedding.py +0 -210
- llama_stack/models/llama/llama4/vision/encoder.py +0 -412
- llama_stack/models/llama/quantize_impls.py +0 -316
- llama_stack/providers/inline/inference/meta_reference/__init__.py +0 -20
- llama_stack/providers/inline/inference/meta_reference/common.py +0 -24
- llama_stack/providers/inline/inference/meta_reference/config.py +0 -68
- llama_stack/providers/inline/inference/meta_reference/generators.py +0 -201
- llama_stack/providers/inline/inference/meta_reference/inference.py +0 -542
- llama_stack/providers/inline/inference/meta_reference/model_parallel.py +0 -77
- llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +0 -353
- {llama_stack-0.4.4.dist-info → llama_stack-0.5.0.dist-info}/WHEEL +0 -0
- {llama_stack-0.4.4.dist-info → llama_stack-0.5.0.dist-info}/entry_points.txt +0 -0
- {llama_stack-0.4.4.dist-info → llama_stack-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {llama_stack-0.4.4.dist-info → llama_stack-0.5.0.dist-info}/top_level.txt +0 -0

--- llama_stack/distributions/meta-reference-gpu/config.yaml
+++ /dev/null
@@ -1,140 +0,0 @@
-version: 2
-image_name: meta-reference-gpu
-apis:
-- agents
-- datasetio
-- eval
-- inference
-- safety
-- scoring
-- tool_runtime
-- vector_io
-providers:
-  inference:
-  - provider_id: meta-reference-inference
-    provider_type: inline::meta-reference
-    config:
-      model: ${env.INFERENCE_MODEL}
-      checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:=null}
-      quantization:
-        type: ${env.QUANTIZATION_TYPE:=bf16}
-      model_parallel_size: ${env.MODEL_PARALLEL_SIZE:=0}
-      max_batch_size: ${env.MAX_BATCH_SIZE:=1}
-      max_seq_len: ${env.MAX_SEQ_LEN:=4096}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-  vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      persistence:
-        namespace: vector_io::faiss
-        backend: kv_default
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config:
-      excluded_categories: []
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence:
-        agent_state:
-          namespace: agents
-          backend: kv_default
-        responses:
-          table_name: responses
-          backend: sql_default
-          max_write_queue_size: 10000
-          num_writers: 4
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        namespace: eval
-        backend: kv_default
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        namespace: datasetio::huggingface
-        backend: kv_default
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:=}
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:=}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:=}
-      max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-  - provider_id: model-context-protocol
-    provider_type: remote::model-context-protocol
-storage:
-  backends:
-    kv_default:
-      type: kv_sqlite
-      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/kvstore.db
-    sql_default:
-      type: sql_sqlite
-      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/sql_store.db
-  stores:
-    metadata:
-      namespace: registry
-      backend: kv_default
-    inference:
-      table_name: inference_store
-      backend: sql_default
-      max_write_queue_size: 10000
-      num_writers: 4
-    conversations:
-      table_name: openai_conversations
-      backend: sql_default
-    prompts:
-      namespace: prompts
-      backend: kv_default
-registered_resources:
-  models:
-  - metadata: {}
-    model_id: ${env.INFERENCE_MODEL}
-    provider_id: meta-reference-inference
-    model_type: llm
-  - metadata:
-      embedding_dimension: 768
-    model_id: nomic-embed-text-v1.5
-    provider_id: sentence-transformers
-    model_type: embedding
-  shields: []
-  vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
-  tool_groups:
-  - toolgroup_id: builtin::websearch
-    provider_id: tavily-search
-  - toolgroup_id: builtin::rag
-    provider_id: rag-runtime
-server:
-  port: 8321

--- llama_stack/distributions/meta-reference-gpu/doc_template.md
+++ /dev/null
@@ -1,119 +0,0 @@
----
-orphan: true
----
-# Meta Reference GPU Distribution
-
-```{toctree}
-:maxdepth: 2
-:hidden:
-
-self
-```
-
-The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
-
-{{ providers_table }}
-
-Note that you need access to nvidia GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs.
-
-{% if run_config_env_vars %}
-### Environment Variables
-
-The following environment variables can be configured:
-
-{% for var, (default_value, description) in run_config_env_vars.items() %}
-- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
-{% endfor %}
-{% endif %}
-
-
-## Prerequisite: Downloading Models
-
-Please check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](../../references/llama_cli_reference/download_models.md) here to download the models using the Hugging Face CLI.
-```
-
-## Running the Distribution
-
-You can do this via venv or Docker which has a pre-built image.
-
-### Via Docker
-
-This method allows you to get started quickly without having to build the distribution code.
-
-```bash
-LLAMA_STACK_PORT=8321
-docker run \
-  -it \
-  --pull always \
-  --gpu all \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ~/.llama:/root/.llama \
-  -e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  llamastack/distribution-{{ name }} \
-  --port $LLAMA_STACK_PORT
-```
-
-If you are using Llama Stack Safety / Shield APIs, use:
-
-```bash
-docker run \
-  -it \
-  --pull always \
-  --gpu all \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ~/.llama:/root/.llama \
-  -e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  -e SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
-  llamastack/distribution-{{ name }} \
-  --port $LLAMA_STACK_PORT
-```
-
-### Via Docker with Custom Run Configuration
-
-You can also run the Docker container with a custom run configuration file by mounting it into the container:
-
-```bash
-# Set the path to your custom config.yaml file
-CUSTOM_RUN_CONFIG=/path/to/your/custom-config.yaml
-LLAMA_STACK_PORT=8321
-
-docker run \
-  -it \
-  --pull always \
-  --gpu all \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ~/.llama:/root/.llama \
-  -v $CUSTOM_RUN_CONFIG:/app/custom-config.yaml \
-  -e RUN_CONFIG_PATH=/app/custom-config.yaml \
-  llamastack/distribution-{{ name }} \
-  --port $LLAMA_STACK_PORT
-```
-
-**Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
-
-{% if run_configs %}
-Available run configurations for this distribution:
-{% for config in run_configs %}
-- `{{ config }}`
-{% endfor %}
-{% endif %}
-
-### Via venv
-
-Make sure you have the Llama Stack CLI available.
-
-```bash
-llama stack list-deps meta-reference-gpu | xargs -L1 uv pip install
-INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-llama stack run distributions/{{ name }}/config.yaml \
-  --port 8321
-```
-
-If you are using Llama Stack Safety / Shield APIs, use:
-
-```bash
-INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
-llama stack run distributions/{{ name }}/run-with-safety.yaml \
-  --port 8321
-```

--- llama_stack/distributions/meta-reference-gpu/meta_reference.py
+++ /dev/null
@@ -1,163 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from pathlib import Path
-
-from llama_stack.core.datatypes import (
-    BuildProvider,
-    ModelInput,
-    Provider,
-    ShieldInput,
-    ToolGroupInput,
-)
-from llama_stack.distributions.template import DistributionTemplate, RunConfigSettings
-from llama_stack.providers.inline.inference.meta_reference import (
-    MetaReferenceInferenceConfig,
-)
-from llama_stack.providers.inline.inference.sentence_transformers import (
-    SentenceTransformersInferenceConfig,
-)
-from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
-from llama_stack_api import ModelType
-
-
-def get_distribution_template() -> DistributionTemplate:
-    providers = {
-        "inference": [BuildProvider(provider_type="inline::meta-reference")],
-        "vector_io": [
-            BuildProvider(provider_type="inline::faiss"),
-            BuildProvider(provider_type="remote::chromadb"),
-            BuildProvider(provider_type="remote::pgvector"),
-        ],
-        "safety": [BuildProvider(provider_type="inline::llama-guard")],
-        "agents": [BuildProvider(provider_type="inline::meta-reference")],
-        "eval": [BuildProvider(provider_type="inline::meta-reference")],
-        "datasetio": [
-            BuildProvider(provider_type="remote::huggingface"),
-            BuildProvider(provider_type="inline::localfs"),
-        ],
-        "scoring": [
-            BuildProvider(provider_type="inline::basic"),
-            BuildProvider(provider_type="inline::llm-as-judge"),
-            BuildProvider(provider_type="inline::braintrust"),
-        ],
-        "tool_runtime": [
-            BuildProvider(provider_type="remote::brave-search"),
-            BuildProvider(provider_type="remote::tavily-search"),
-            BuildProvider(provider_type="inline::rag-runtime"),
-            BuildProvider(provider_type="remote::model-context-protocol"),
-        ],
-    }
-    name = "meta-reference-gpu"
-    inference_provider = Provider(
-        provider_id="meta-reference-inference",
-        provider_type="inline::meta-reference",
-        config=MetaReferenceInferenceConfig.sample_run_config(
-            model="${env.INFERENCE_MODEL}",
-            checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:=null}",
-        ),
-    )
-    embedding_provider = Provider(
-        provider_id="sentence-transformers",
-        provider_type="inline::sentence-transformers",
-        config=SentenceTransformersInferenceConfig.sample_run_config(),
-    )
-    vector_io_provider = Provider(
-        provider_id="faiss",
-        provider_type="inline::faiss",
-        config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-    )
-
-    inference_model = ModelInput(
-        model_id="${env.INFERENCE_MODEL}",
-        provider_id="meta-reference-inference",
-    )
-    embedding_model = ModelInput(
-        model_id="nomic-embed-text-v1.5",
-        provider_id="sentence-transformers",
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 768,
-        },
-    )
-    safety_model = ModelInput(
-        model_id="${env.SAFETY_MODEL}",
-        provider_id="meta-reference-safety",
-    )
-    default_tool_groups = [
-        ToolGroupInput(
-            toolgroup_id="builtin::websearch",
-            provider_id="tavily-search",
-        ),
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
-    ]
-
-    return DistributionTemplate(
-        name=name,
-        distro_type="self_hosted",
-        description="Use Meta Reference for running LLM inference",
-        template_path=Path(__file__).parent / "doc_template.md",
-        providers=providers,
-        run_configs={
-            "config.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": [inference_provider, embedding_provider],
-                    "vector_io": [vector_io_provider],
-                },
-                default_models=[inference_model, embedding_model],
-                default_tool_groups=default_tool_groups,
-            ),
-            "run-with-safety.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": [
-                        inference_provider,
-                        embedding_provider,
-                        Provider(
-                            provider_id="meta-reference-safety",
-                            provider_type="inline::meta-reference",
-                            config=MetaReferenceInferenceConfig.sample_run_config(
-                                model="${env.SAFETY_MODEL}",
-                                checkpoint_dir="${env.SAFETY_CHECKPOINT_DIR:=null}",
-                            ),
-                        ),
-                    ],
-                    "vector_io": [vector_io_provider],
-                },
-                default_models=[
-                    inference_model,
-                    safety_model,
-                    embedding_model,
-                ],
-                default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
-                default_tool_groups=default_tool_groups,
-            ),
-        },
-        run_config_env_vars={
-            "LLAMA_STACK_PORT": (
-                "8321",
-                "Port for the Llama Stack distribution server",
-            ),
-            "INFERENCE_MODEL": (
-                "meta-llama/Llama-3.2-3B-Instruct",
-                "Inference model loaded into the Meta Reference server",
-            ),
-            "INFERENCE_CHECKPOINT_DIR": (
-                "null",
-                "Directory containing the Meta Reference model checkpoint",
-            ),
-            "SAFETY_MODEL": (
-                "meta-llama/Llama-Guard-3-1B",
-                "Name of the safety (Llama-Guard) model to use",
-            ),
-            "SAFETY_CHECKPOINT_DIR": (
-                "null",
-                "Directory containing the Llama-Guard model checkpoint",
-            ),
-        },
-    )

--- llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml
+++ /dev/null
@@ -1,155 +0,0 @@
-version: 2
-image_name: meta-reference-gpu
-apis:
-- agents
-- datasetio
-- eval
-- inference
-- safety
-- scoring
-- tool_runtime
-- vector_io
-providers:
-  inference:
-  - provider_id: meta-reference-inference
-    provider_type: inline::meta-reference
-    config:
-      model: ${env.INFERENCE_MODEL}
-      checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:=null}
-      quantization:
-        type: ${env.QUANTIZATION_TYPE:=bf16}
-      model_parallel_size: ${env.MODEL_PARALLEL_SIZE:=0}
-      max_batch_size: ${env.MAX_BATCH_SIZE:=1}
-      max_seq_len: ${env.MAX_SEQ_LEN:=4096}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-  - provider_id: meta-reference-safety
-    provider_type: inline::meta-reference
-    config:
-      model: ${env.SAFETY_MODEL}
-      checkpoint_dir: ${env.SAFETY_CHECKPOINT_DIR:=null}
-      quantization:
-        type: ${env.QUANTIZATION_TYPE:=bf16}
-      model_parallel_size: ${env.MODEL_PARALLEL_SIZE:=0}
-      max_batch_size: ${env.MAX_BATCH_SIZE:=1}
-      max_seq_len: ${env.MAX_SEQ_LEN:=4096}
-  vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      persistence:
-        namespace: vector_io::faiss
-        backend: kv_default
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config:
-      excluded_categories: []
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence:
-        agent_state:
-          namespace: agents
-          backend: kv_default
-        responses:
-          table_name: responses
-          backend: sql_default
-          max_write_queue_size: 10000
-          num_writers: 4
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        namespace: eval
-        backend: kv_default
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        namespace: datasetio::huggingface
-        backend: kv_default
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:=}
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:=}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:=}
-      max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-  - provider_id: model-context-protocol
-    provider_type: remote::model-context-protocol
-storage:
-  backends:
-    kv_default:
-      type: kv_sqlite
-      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/kvstore.db
-    sql_default:
-      type: sql_sqlite
-      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/sql_store.db
-  stores:
-    metadata:
-      namespace: registry
-      backend: kv_default
-    inference:
-      table_name: inference_store
-      backend: sql_default
-      max_write_queue_size: 10000
-      num_writers: 4
-    conversations:
-      table_name: openai_conversations
-      backend: sql_default
-    prompts:
-      namespace: prompts
-      backend: kv_default
-registered_resources:
-  models:
-  - metadata: {}
-    model_id: ${env.INFERENCE_MODEL}
-    provider_id: meta-reference-inference
-    model_type: llm
-  - metadata: {}
-    model_id: ${env.SAFETY_MODEL}
-    provider_id: meta-reference-safety
-    model_type: llm
-  - metadata:
-      embedding_dimension: 768
-    model_id: nomic-embed-text-v1.5
-    provider_id: sentence-transformers
-    model_type: embedding
-  shields:
-  - shield_id: ${env.SAFETY_MODEL}
-  vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
-  tool_groups:
-  - toolgroup_id: builtin::websearch
-    provider_id: tavily-search
-  - toolgroup_id: builtin::rag
-    provider_id: rag-runtime
-server:
-  port: 8321

--- llama_stack/models/llama/hadamard_utils.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import math
-import re
-
-import torch
-from torch import nn
-
-
-def hadamard_transform(x: torch.Tensor) -> torch.Tensor:
-    """Hadamard transform.
-
-    This function performs the Hadamard transform on the input tensor 'x'.
-    The Hadamard transform is a linear transformation that multiplies the input
-    tensor by the Hadamard matrix of dimension n x n, where n is the size of
-    the last dimension of the input tensor.
-    """
-    *_, n = x.shape
-    m = int(math.log2(n))
-    assert n == 1 << m, "n must be a power of 2"
-    x = x[..., None]
-    inv_sqrt2 = 0.5**0.5
-    for _ in range(m):
-        top = x[..., ::2, :] + x[..., 1::2, :]
-        bot = x[..., ::2, :] - x[..., 1::2, :]
-        x = torch.cat((top, bot), dim=-1)
-        x *= inv_sqrt2
-    res = x.squeeze(-2)
-    return res
-
-
-class HadamardModule(torch.nn.Module):
-    """A module that applies the Hadamard transform to the input tensor.
-
-    Args:
-        group_size: The size of the groups that the input tensor will be divided into
-            before applying the Hadamard transform.
-    """
-
-    def __init__(self, group_size: int) -> None:
-        super().__init__()
-        self.group_size = group_size
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        reshape_back = False
-        orig_shape = x.shape
-        if self.group_size != x.shape[-1]:
-            reshape_back = True
-            x = x.reshape(-1, x.shape[-1] // self.group_size, self.group_size)
-        x = hadamard_transform(x)
-        if reshape_back:
-            x = x.reshape(orig_shape)
-        return x
-
-
-def add_hadamard_transform_for_spinquant(model: torch.nn.Module, prefix: str = "") -> None:
-    """
-    Adds a Hadamard transform to the last linear layer of each feedforward network (FFN) in the model.
-    This function recursively traverses the model's children and looks for layers that match the pattern
-    "layers.<digit>.feed_forward.w2", where <digit> is one or more digits. When such a layer is found,
-    it is replaced with a new sequential module that consists of a HadamardModule followed by the original
-    layer. The HadamardModule applies the Hadamard transform to the input tensor.
-
-    See `SpinQuant <https://arxiv.org/abs/2405.16406>_` paper for more details.
-
-    Args:
-        model: An instance of 'torch.nn.Module' (e.g., Transformer model).
-        prefix: A string prefix to add to the full name of each child module.
-
-    Returns:
-        None
-    """
-
-    pattern_last_linear_ffn = r"layers.\d+.feed_forward.w2"
-    for module_name, module in model.named_children():
-        child_full_name = prefix + "." + module_name
-        if re.search(pattern_last_linear_ffn, child_full_name):
-            # Module matching this pattern should be nn.Linear with in_features
-            assert isinstance(module, nn.Linear), f"Expected nn.Linear, got {type(module)}"
-            new_module = nn.Sequential(HadamardModule(group_size=module.in_features), module)
-            del module
-            setattr(model, module_name, new_module)
-        else:
-            add_hadamard_transform_for_spinquant(module, (prefix + "." if prefix else prefix) + module_name)